;; ARM 1026EJ-S Pipeline Description
;; Copyright (C) 2003-2013 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.  */

;; These descriptions are based on the information contained in the
;; ARM1026EJ-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;

;; This automaton provides a pipeline description for the ARM
;; 1026EJ-S core.
;;
;; Modelling assumption: the condition of every conditional
;; instruction is taken to be "true", i.e. every instruction is
;; treated as if it were actually executed.

(define_automaton "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are two pipelines:
;;
;;   - An Arithmetic Logic Unit (ALU) pipeline.
;;
;;     The ALU pipeline has fetch, issue, decode, execute, memory, and
;;     write stages.  We only need to model the execute, memory and
;;     write stages.
;;
;;   - A Load-Store Unit (LSU) pipeline.
;;
;;     The LSU pipeline has decode, execute, memory, and write stages.
;;     We only model the execute, memory and write stages.

;; One CPU unit per modelled stage of each pipeline: the "a_" units
;; belong to the ALU pipeline and the "l_" units to the LSU pipeline;
;; the suffixes "e", "m" and "w" name the execute, memory and write
;; stages respectively.
(define_cpu_unit "a_e,a_m,a_w" "arm1026ejs")
(define_cpu_unit "l_e,l_m,l_w" "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; An ALU instruction takes three cycles to execute and occupies the
;; ALU pipeline in each of the three modelled stages.  Its result is
;; available once the execute stage has finished.
;;
;; When the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modeled here.

;; ALU operations with no shifted operand.
(define_insn_reservation "alu_op" 1
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "alu_reg,simple_alu_imm"))
  "a_e,a_m,a_w")

;; ALU operations with a shift-by-constant operand.
(define_insn_reservation "alu_shift_op" 1
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "simple_alu_shift,alu_shift"))
  "a_e,a_m,a_w")

;; ALU operations with a shift-by-register operand.
;; In reality these stall in the decoder so that the shift value can
;; be read in a second cycle.  The decoder is not modelled, so we
;; pretend instead that the instruction spends two cycles in the
;; execute stage.
(define_insn_reservation "alu_shift_reg_op" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "alu_shift_reg"))
  "a_e*2,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the execute stage until the
;; instruction has been passed through the multiplier array enough
;; times.

;; The result of the "smul" and "smulw" instructions is not available
;; until after the memory stage.
(define_insn_reservation "mult1" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smulxy,smulwy"))
  "a_e,a_m,a_w")

;; The "smlaxy" and "smlawx" instructions require two iterations through
;; the execute stage; the result is available immediately following
;; the execute stage.
;;
;; Note that "smlalxy" must NOT be listed here: its result is not
;; available until after the memory stage, so it is covered by "mult3"
;; below.  Listing an insn in two reservations makes the match
;; ambiguous and can shadow the later, correct latency.
(define_insn_reservation "mult2" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smlaxy,smlawx"))
  "a_e*2,a_m,a_w")

;; The "smlalxy", "mul", and "mla" instructions require two iterations
;; through the execute stage; the result is not available until after
;; the memory stage.
(define_insn_reservation "mult3" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smlalxy,mul,mla"))
  "a_e*2,a_m,a_w")

;; The "muls" and "mlas" instructions loop in the execute stage for
;; four iterations in order to set the flags.  The value result is
;; available after three iterations.
(define_insn_reservation "mult4" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "muls,mlas"))
  "a_e*4,a_m,a_w")

;; Long multiply instructions that produce two registers of
;; output (such as umull) make their results available in two cycles;
;; the least significant word is available before the most significant
;; word.  That fact is not modeled; instead, the instructions are
;; described as if the entire result was available at the end of the
;; cycle in which both words are available.

;; The "umull", "umlal", "smull", and "smlal" instructions all take
;; three iterations through the execute cycle, and make their results
;; available after the memory cycle.
(define_insn_reservation "mult5" 4
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "umull,umlal,smull,smlal"))
  "a_e*3,a_m,a_w")

;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
;; the execute stage for five iterations in order to set the flags.
;; The value result is available after four iterations.
(define_insn_reservation "mult6" 4
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "umulls,umlals,smulls,smlals"))
  "a_e*5,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback
;; (such as "ldm!").  These models assume that all memory references
;; hit in dcache.

;; LSU instructions require six cycles to execute.  They use the ALU
;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
;; three through six.
;; Loads and stores which use a scaled register offset or scaled
;; register pre-indexed addressing mode take three cycles EXCEPT for
;; those that are base + offset with LSL of 0 or 2, or base - offset
;; with LSL of zero.  The remainder take 1 cycle to execute.
;; For 4byte loads there is a bypass from the load stage

;; Single-register and byte loads.
(define_insn_reservation "load1_op" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "load_byte,load1"))
  "a_e+l_e,l_m,a_w+l_w")

;; Single-register stores; same unit usage as "load1_op".
(define_insn_reservation "store1_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "store1"))
  "a_e+l_e,l_m,a_w+l_w")

;; A load's result can be stored by an immediately following store
(define_bypass 1 "load1_op" "store1_op" "arm_no_early_store_addr_dep")

;; On a LDM/STM operation, the LSU pipeline iterates until all of the
;; registers have been processed.
;;
;; The time it takes to load the data depends on whether or not the
;; base address is 64-bit aligned; if it is not, an additional cycle
;; is required.  This model assumes that the address is always 64-bit
;; aligned.
;; Because the processor can load two registers per cycle,
;; that assumption means that we use the same instruction reservations
;; for loading 2k and 2k - 1 registers.
;;
;; The ALU pipeline is stalled until the completion of the last memory
;; stage in the LSU pipeline.  That is modeled by keeping the ALU
;; execute stage busy until that point.
;;
;; As with ALU operations, if one of the destination registers is the
;; PC, there are additional stalls; that is not modeled.

;; Two-register loads.
(define_insn_reservation "load2_op" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "load2"))
  "a_e+l_e,l_m,a_w+l_w")

;; Two-register stores.
(define_insn_reservation "store2_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "store2"))
  "a_e+l_e,l_m,a_w+l_w")

;; Three- and four-register loads: the LSU execute and memory stages
;; are each used for two cycles, and the ALU execute stage is held
;; busy until the last LSU memory cycle.
(define_insn_reservation "load34_op" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "load3,load4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

;; Three- and four-register stores; same unit usage as "load34_op".
(define_insn_reservation "store34_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "store3,store4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch and Call Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Branch instructions are difficult to model accurately.  The ARM
;; core can predict most branches.  If the branch is predicted
;; correctly, and predicted early enough, the branch can be completely
;; eliminated from the instruction stream.  Some branches can
;; therefore appear to require zero cycles to execute.  We assume that
;; all branches are predicted correctly, and that the latency is
;; therefore the minimum value.

(define_insn_reservation "branch_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "branch"))
  "nothing")

;; The latency for a call is not predictable.
;; Therefore, we use 32 as roughly equivalent to positive infinity.

(define_insn_reservation "call_op" 32
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "call"))
  "nothing")