;; AMD K6/K6-2 Scheduling
;; Copyright (C) 2002-2013 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.
;;
;; The K6 architecture is quite similar to PPro.  An important difference is
;; that there are only two decoders and they seem to be much slower than
;; any of the execution units.  So we have to pay much more attention to
;; proper scheduling for the decoders.
;; FIXME: We don't do that right now.  A good start would be to sort the
;;        instructions based on length.
;;
;; This description is based on data from the following documents:
;;
;;    "AMD-K6 Processor Data Sheet (Preliminary information)"
;;    Advanced Micro Devices, Inc., 1998.
;;
;;    "AMD-K6 Processor Code Optimization Application Note"
;;    Advanced Micro Devices, Inc., 2000.
;;
;; CPU execution units of the K6:
;;
;; store   describes the Store unit.  This unit is not modelled
;;         completely and it is only used to model lea operation.
;;         Otherwise it lies outside of any critical path.
;; load    describes the Load unit
;; alux    describes the Integer X unit
;; mm      describes the Multimedia unit, which shares a pipe
;;         with the Integer X unit.  This unit is used for MMX,
;;         which is not implemented for K6.
;; aluy    describes the Integer Y unit
;; fpu     describes the FPU unit
;; branch  describes the Branch unit
;;
;; The fp unit is not pipelined, and it can only do one operation per two
;; cycles, including fxch.
;;
;; Generally this is a very poor description, but at least no worse than
;; the old description, and a lot easier to extend to something more
;; reasonable if anyone still cares enough about this architecture in 2004.
;;
;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.

;; One automaton per group of units declared below: the decoders, the load
;; unit, the store unit, the two integer units, the FPU and the branch unit.
(define_automaton "k6_decoder,k6_load_unit,k6_store_unit,k6_integer_units,k6_fpu_unit,k6_branch_unit")

;; The K6 instruction decoding begins before the on-chip instruction cache is
;; filled.  Depending on the length of the instruction, two simple instructions
;; can be decoded in two parallel short decoders, or one complex instruction can
;; be decoded in either the long or the vector decoder.  For all practical
;; purposes, the long and vector decoder can be modelled as one decoder.
(define_cpu_unit "k6_decode_short0,k6_decode_short1,k6_decode_long"
		 "k6_decoder")

;; The long decoder cannot be reserved in the same cycle as either short
;; decoder.
(exclusion_set "k6_decode_long" "k6_decode_short0,k6_decode_short1")

;; "k6_decode_short" takes whichever short decoder is free;
;; "k6_decode_vector" is an alias for the long decoder unit.
(define_reservation "k6_decode_short" "k6_decode_short0|k6_decode_short1")
(define_reservation "k6_decode_vector" "k6_decode_long")

;; Execution units, each placed in the automaton named after it.
(define_cpu_unit "k6_store" "k6_store_unit")
(define_cpu_unit "k6_load" "k6_load_unit")
(define_cpu_unit "k6_alux" "k6_integer_units")
(define_cpu_unit "k6_aluy" "k6_integer_units")
(define_cpu_unit "k6_fpu" "k6_fpu_unit")
(define_cpu_unit "k6_branch" "k6_branch_unit")

;; Shift instructions and certain arithmetic are issued only on Integer X.
;; Register-only form: short decode, then one cycle on Integer X.
(define_insn_reservation "k6_alux_only" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
	    (eq_attr "memory" "none")))
  "k6_decode_short,k6_alux")

;; Same insn types with a memory source: the load unit is reserved for a
;; cycle before Integer X.
(define_insn_reservation "k6_alux_only_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
	    (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,k6_alux")

;; Same insn types writing memory (or with an unknown memory class): long
;; decode, then load, Integer X and store in successive cycles.
(define_insn_reservation "k6_alux_only_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
	    (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,k6_alux,k6_store")

;; Integer divide and multiply can only be issued on Integer X, too.
;;
;; The register-only multiply must test for memory "none".  Without that
;; test its condition is also true for every imul with a memory operand,
;; it overlaps the two reservations that follow, and because it is defined
;; first they would never be used.
(define_insn_reservation "k6_alu_imul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
	    (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*3")

;; Multiply with a memory source: one load cycle ahead of the three
;; Integer X cycles.
(define_insn_reservation "k6_alu_imul_load" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
	    (eq_attr "memory" "load")))
  "k6_decode_vector,k6_load,k6_alux*3")

;; Multiply with a memory destination or an unknown memory class: as the
;; load form, followed by a cycle on the store unit.
(define_insn_reservation "k6_alu_imul_store" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
	    (eq_attr "memory" "store,both,unknown")))
  "k6_decode_vector,k6_load,k6_alux*3,k6_store")

;; ??? Guessed latencies based on the old pipeline description.
;; Register-only divide: Integer X is held for all 17 cycles.
(define_insn_reservation "k6_alu_idiv" 17
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
	    (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*17")

;; Divide with any memory class other than "none": one load cycle first.
(define_insn_reservation "k6_alu_idiv_mem" 19
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
	    (eq_attr "memory" "!none")))
  "k6_decode_vector,k6_load,k6_alux*17")

;; Basic word and doubleword ALU ops can be issued on both Integer units.
;; The "alu1" and "negnot" types are deliberately absent from the three
;; reservations below.  The k6_alux_only* reservations defined earlier in
;; this file already claim them for the memory classes "none", "load" and
;; "store,both,unknown", so listing them here as well only created
;; overlapping conditions, which GCC documents as undefined.
;; NOTE(review): this assumes "memory" has no values beyond the five used
;; in this file -- confirm against the attribute definition.

;; Register-only form: short decode, then either integer unit.
(define_insn_reservation "k6_alu" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,icmp,test,imovx,incdec,setcc")
	    (eq_attr "memory" "none")))
  "k6_decode_short,(k6_alux|k6_aluy)")

;; Memory source: a cycle on the load unit before the integer unit.
(define_insn_reservation "k6_alu_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,icmp,test,imovx,incdec,setcc")
	    (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,(k6_alux|k6_aluy)")

;; Memory destination or unknown memory class: long decode, then load,
;; an integer unit and the store unit in successive cycles.
(define_insn_reservation "k6_alu_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,icmp,test,imovx,incdec,setcc")
	    (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,(k6_alux|k6_aluy),k6_store")

;; A "load immediate" operation does not require execution at all,
;; it is available immediately after decoding.  Special-case this.

;; Move with no memory access whose operand 1 is a nonimmediate operand:
;; occupies an integer unit for one cycle.
(define_insn_reservation "k6_alu_imov" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
	    (and (eq_attr "memory" "none")
		 (match_operand 1 "nonimmediate_operand"))))
  "k6_decode_short,(k6_alux|k6_aluy)")

;; Move of an immediate: decode only, latency 0.
(define_insn_reservation "k6_alu_imov_imm" 0
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
	    (and (eq_attr "memory" "none")
		 (match_operand 1 "immediate_operand"))))
  "k6_decode_short")

;; Move from memory: only the load unit is reserved.
(define_insn_reservation "k6_alu_imov_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
	    (eq_attr "memory" "load")))
  "k6_decode_short,k6_load")

;; Move to memory: only the store unit is reserved.
(define_insn_reservation "k6_alu_imov_store" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
	    (eq_attr "memory" "store")))
  "k6_decode_short,k6_store")

;; Move with memory class "both" or "unknown": long decode, load, then an
;; integer unit.
(define_insn_reservation "k6_alu_imov_both" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
	    (eq_attr "memory" "both,unknown")))
  "k6_decode_long,k6_load,(k6_alux|k6_aluy)")

;; The branch unit.
;; Calls use the vector decoder, then the branch unit.
(define_insn_reservation "k6_branch_call" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "call,callv"))
  "k6_decode_vector,k6_branch")

;; Other branches ("ibr") use a short decoder, then the branch unit.
(define_insn_reservation "k6_branch_branch" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "ibr"))
  "k6_decode_short,k6_branch")

;; Ordering note for everything below.  "k6_load_pop" and "k6_store_push"
;; match on the memory class alone (through their `ior'), so they overlap
;; with every type-specific reservation that also has a memory operand.
;; GCC documents overlapping reservation conditions as undefined, and the
;; earlier definition is the one that takes effect.  The two catch-alls
;; are therefore kept at the very end, after all type-specific
;; reservations.  When they sat in the middle of this list they shadowed
;; "k6_load_str" and the memory forms of the FPU reservations.

;; leave: long decode, one load cycle, then two cycles on an integer unit.
(define_insn_reservation "k6_load_leave" 5
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "leave"))
  "k6_decode_long,k6_load,(k6_alux|k6_aluy)*2")

;; ??? From the old pipeline description.  Egad!
;; ??? Apparently we take care of this reservation in adjust_cost.
;; String insns that read memory hold the load unit for ten cycles.
(define_insn_reservation "k6_load_str" 10
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "str")
	    (eq_attr "memory" "load,both")))
  "k6_decode_vector,k6_load*10")

;; Remaining string insns hold the store unit for ten cycles.  This must
;; stay after "k6_load_str", whose condition is a subset of this one.
(define_insn_reservation "k6_store_str" 10
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "str"))
  "k6_store*10")

;; The store unit handles lea and push.  It is otherwise unmodelled.
(define_insn_reservation "k6_store_lea" 2
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "lea"))
  "k6_decode_short,k6_store,(k6_alux|k6_aluy)")

;; Most FPU instructions have latency 2 and throughput 2.
(define_insn_reservation "k6_fpu" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
	    (eq_attr "memory" "none")))
  "k6_decode_vector,k6_fpu*2")

;; FPU op with a memory source: one load cycle, then two FPU cycles.
(define_insn_reservation "k6_fpu_load" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
	    (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")

;; FPU op with a memory destination: one store cycle, then two FPU cycles.
(define_insn_reservation "k6_fpu_store" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
	    (eq_attr "memory" "store")))
  "k6_decode_short,k6_store,k6_fpu*2")

;; Register-only FP multiply.
(define_insn_reservation "k6_fpu_fmul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
	    (eq_attr "memory" "none")))
  "k6_decode_short,k6_fpu*2")

;; FP multiply with a memory source.
;; NOTE(review): latency is 2 here although "k6_fpu_load" uses 6 for the
;; same unit sequence -- confirm whether that is intended.
(define_insn_reservation "k6_fpu_fmul_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
	    (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")

;; ??? Guessed latencies from the old pipeline description.
;; Divides and "fpspc" insns hold the FPU for all 56 cycles, whatever
;; their memory class.
(define_insn_reservation "k6_fpu_expensive" 56
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "fdiv,fpspc"))
  "k6_decode_short,k6_fpu*56")

;; Catch-all reservations.  Keep these last (see the ordering note above).

;; The load and units have two pipeline stages.  The load latency is
;; two cycles.
;; NOTE(review): "load and units" presumably means "load and store units",
;; and the latency given below is 3, not 2 -- confirm against the AMD
;; documents.
;; Matches pop, plus any insn not claimed above whose memory class is
;; "load" or "both".
(define_insn_reservation "k6_load_pop" 3
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "pop")
	    (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load")

;; Matches push, plus any insn not claimed above whose memory class is
;; "store" or "both".
(define_insn_reservation "k6_store_push" 2
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "push")
	    (eq_attr "memory" "store,both")))
  "k6_decode_short,k6_store")