/* Costs of operations of individual x86 CPUs.
   Copyright (C) 1988-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
*/ 26*38fd1498Szrj #define COSTS_N_BYTES(N) ((N) * 2) 27*38fd1498Szrj 28*38fd1498Szrj #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} 29*38fd1498Szrj 30*38fd1498Szrj static stringop_algs ix86_size_memcpy[2] = { 31*38fd1498Szrj {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 32*38fd1498Szrj {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 33*38fd1498Szrj static stringop_algs ix86_size_memset[2] = { 34*38fd1498Szrj {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 35*38fd1498Szrj {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 36*38fd1498Szrj 37*38fd1498Szrj const 38*38fd1498Szrj struct processor_costs ix86_size_cost = {/* costs for tuning for size */ 39*38fd1498Szrj COSTS_N_BYTES (2), /* cost of an add instruction */ 40*38fd1498Szrj COSTS_N_BYTES (3), /* cost of a lea instruction */ 41*38fd1498Szrj COSTS_N_BYTES (2), /* variable shift costs */ 42*38fd1498Szrj COSTS_N_BYTES (3), /* constant shift costs */ 43*38fd1498Szrj {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ 44*38fd1498Szrj COSTS_N_BYTES (3), /* HI */ 45*38fd1498Szrj COSTS_N_BYTES (3), /* SI */ 46*38fd1498Szrj COSTS_N_BYTES (3), /* DI */ 47*38fd1498Szrj COSTS_N_BYTES (5)}, /* other */ 48*38fd1498Szrj 0, /* cost of multiply per each bit set */ 49*38fd1498Szrj {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ 50*38fd1498Szrj COSTS_N_BYTES (3), /* HI */ 51*38fd1498Szrj COSTS_N_BYTES (3), /* SI */ 52*38fd1498Szrj COSTS_N_BYTES (3), /* DI */ 53*38fd1498Szrj COSTS_N_BYTES (5)}, /* other */ 54*38fd1498Szrj COSTS_N_BYTES (3), /* cost of movsx */ 55*38fd1498Szrj COSTS_N_BYTES (3), /* cost of movzx */ 56*38fd1498Szrj 0, /* "large" insn */ 57*38fd1498Szrj 2, /* MOVE_RATIO */ 58*38fd1498Szrj 59*38fd1498Szrj /* All move costs are relative to integer->integer move times 2. */ 60*38fd1498Szrj 2, /* cost for loading QImode using movzbl */ 61*38fd1498Szrj {2, 2, 2}, /* cost of loading integer registers 62*38fd1498Szrj in QImode, HImode and SImode. 
63*38fd1498Szrj Relative to reg-reg move (2). */ 64*38fd1498Szrj {2, 2, 2}, /* cost of storing integer registers */ 65*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 66*38fd1498Szrj {2, 2, 2}, /* cost of loading fp registers 67*38fd1498Szrj in SFmode, DFmode and XFmode */ 68*38fd1498Szrj {2, 2, 2}, /* cost of storing fp registers 69*38fd1498Szrj in SFmode, DFmode and XFmode */ 70*38fd1498Szrj 3, /* cost of moving MMX register */ 71*38fd1498Szrj {3, 3}, /* cost of loading MMX registers 72*38fd1498Szrj in SImode and DImode */ 73*38fd1498Szrj {3, 3}, /* cost of storing MMX registers 74*38fd1498Szrj in SImode and DImode */ 75*38fd1498Szrj 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 76*38fd1498Szrj {3, 3, 3, 3, 3}, /* cost of loading SSE registers 77*38fd1498Szrj in 32,64,128,256 and 512-bit */ 78*38fd1498Szrj {3, 3, 3, 3, 3}, /* cost of unaligned SSE load 79*38fd1498Szrj in 128bit, 256bit and 512bit */ 80*38fd1498Szrj {3, 3, 3, 3, 3}, /* cost of storing SSE registers 81*38fd1498Szrj in 32,64,128,256 and 512-bit */ 82*38fd1498Szrj {3, 3, 3, 3, 3}, /* cost of unaligned SSE store 83*38fd1498Szrj in 128bit, 256bit and 512bit */ 84*38fd1498Szrj 3, 3, /* SSE->integer and integer->SSE moves */ 85*38fd1498Szrj 5, 0, /* Gather load static, per_elt. */ 86*38fd1498Szrj 5, 0, /* Gather store static, per_elt. */ 87*38fd1498Szrj 0, /* size of l1 cache */ 88*38fd1498Szrj 0, /* size of l2 cache */ 89*38fd1498Szrj 0, /* size of prefetch block */ 90*38fd1498Szrj 0, /* number of parallel prefetches */ 91*38fd1498Szrj 2, /* Branch cost */ 92*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ 93*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FMUL instruction. */ 94*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FDIV instruction. */ 95*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FABS instruction. */ 96*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FCHS instruction. */ 97*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FSQRT instruction. 
*/ 98*38fd1498Szrj 99*38fd1498Szrj COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ 100*38fd1498Szrj COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 101*38fd1498Szrj COSTS_N_BYTES (2), /* cost of MULSS instruction. */ 102*38fd1498Szrj COSTS_N_BYTES (2), /* cost of MULSD instruction. */ 103*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ 104*38fd1498Szrj COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ 105*38fd1498Szrj COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ 106*38fd1498Szrj COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ 107*38fd1498Szrj COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */ 108*38fd1498Szrj COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ 109*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 110*38fd1498Szrj ix86_size_memcpy, 111*38fd1498Szrj ix86_size_memset, 112*38fd1498Szrj COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ 113*38fd1498Szrj COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ 114*38fd1498Szrj }; 115*38fd1498Szrj 116*38fd1498Szrj /* Processor costs (relative to an add) */ 117*38fd1498Szrj static stringop_algs i386_memcpy[2] = { 118*38fd1498Szrj {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 119*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 120*38fd1498Szrj static stringop_algs i386_memset[2] = { 121*38fd1498Szrj {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 122*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 123*38fd1498Szrj 124*38fd1498Szrj static const 125*38fd1498Szrj struct processor_costs i386_cost = { /* 386 specific costs */ 126*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 127*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 128*38fd1498Szrj COSTS_N_INSNS (3), /* variable shift costs */ 129*38fd1498Szrj COSTS_N_INSNS (2), /* constant shift costs */ 130*38fd1498Szrj {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ 131*38fd1498Szrj COSTS_N_INSNS (6), /* HI */ 132*38fd1498Szrj COSTS_N_INSNS (6), /* SI */ 
133*38fd1498Szrj COSTS_N_INSNS (6), /* DI */ 134*38fd1498Szrj COSTS_N_INSNS (6)}, /* other */ 135*38fd1498Szrj COSTS_N_INSNS (1), /* cost of multiply per each bit set */ 136*38fd1498Szrj {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ 137*38fd1498Szrj COSTS_N_INSNS (23), /* HI */ 138*38fd1498Szrj COSTS_N_INSNS (23), /* SI */ 139*38fd1498Szrj COSTS_N_INSNS (23), /* DI */ 140*38fd1498Szrj COSTS_N_INSNS (23)}, /* other */ 141*38fd1498Szrj COSTS_N_INSNS (3), /* cost of movsx */ 142*38fd1498Szrj COSTS_N_INSNS (2), /* cost of movzx */ 143*38fd1498Szrj 15, /* "large" insn */ 144*38fd1498Szrj 3, /* MOVE_RATIO */ 145*38fd1498Szrj 146*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 147*38fd1498Szrj they are latency*2. */ 148*38fd1498Szrj 4, /* cost for loading QImode using movzbl */ 149*38fd1498Szrj {2, 4, 2}, /* cost of loading integer registers 150*38fd1498Szrj in QImode, HImode and SImode. 151*38fd1498Szrj Relative to reg-reg move (2). */ 152*38fd1498Szrj {2, 4, 2}, /* cost of storing integer registers */ 153*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 154*38fd1498Szrj {8, 8, 8}, /* cost of loading fp registers 155*38fd1498Szrj in SFmode, DFmode and XFmode */ 156*38fd1498Szrj {8, 8, 8}, /* cost of storing fp registers 157*38fd1498Szrj in SFmode, DFmode and XFmode */ 158*38fd1498Szrj 2, /* cost of moving MMX register */ 159*38fd1498Szrj {4, 8}, /* cost of loading MMX registers 160*38fd1498Szrj in SImode and DImode */ 161*38fd1498Szrj {4, 8}, /* cost of storing MMX registers 162*38fd1498Szrj in SImode and DImode */ 163*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 164*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of loading SSE registers 165*38fd1498Szrj in 32,64,128,256 and 512-bit */ 166*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned loads. 
*/ 167*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of storing SSE registers 168*38fd1498Szrj in 32,64,128,256 and 512-bit */ 169*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 170*38fd1498Szrj 3, 3, /* SSE->integer and integer->SSE moves */ 171*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 172*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 173*38fd1498Szrj 0, /* size of l1 cache */ 174*38fd1498Szrj 0, /* size of l2 cache */ 175*38fd1498Szrj 0, /* size of prefetch block */ 176*38fd1498Szrj 0, /* number of parallel prefetches */ 177*38fd1498Szrj 1, /* Branch cost */ 178*38fd1498Szrj COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ 179*38fd1498Szrj COSTS_N_INSNS (27), /* cost of FMUL instruction. */ 180*38fd1498Szrj COSTS_N_INSNS (88), /* cost of FDIV instruction. */ 181*38fd1498Szrj COSTS_N_INSNS (22), /* cost of FABS instruction. */ 182*38fd1498Szrj COSTS_N_INSNS (24), /* cost of FCHS instruction. */ 183*38fd1498Szrj COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ 184*38fd1498Szrj 185*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 186*38fd1498Szrj COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ 187*38fd1498Szrj COSTS_N_INSNS (27), /* cost of MULSS instruction. */ 188*38fd1498Szrj COSTS_N_INSNS (27), /* cost of MULSD instruction. */ 189*38fd1498Szrj COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ 190*38fd1498Szrj COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ 191*38fd1498Szrj COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ 192*38fd1498Szrj COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ 193*38fd1498Szrj COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */ 194*38fd1498Szrj COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ 195*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 196*38fd1498Szrj i386_memcpy, 197*38fd1498Szrj i386_memset, 198*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. 
*/ 199*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 200*38fd1498Szrj }; 201*38fd1498Szrj 202*38fd1498Szrj static stringop_algs i486_memcpy[2] = { 203*38fd1498Szrj {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 204*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 205*38fd1498Szrj static stringop_algs i486_memset[2] = { 206*38fd1498Szrj {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 207*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 208*38fd1498Szrj 209*38fd1498Szrj static const 210*38fd1498Szrj struct processor_costs i486_cost = { /* 486 specific costs */ 211*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 212*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 213*38fd1498Szrj COSTS_N_INSNS (3), /* variable shift costs */ 214*38fd1498Szrj COSTS_N_INSNS (2), /* constant shift costs */ 215*38fd1498Szrj {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ 216*38fd1498Szrj COSTS_N_INSNS (12), /* HI */ 217*38fd1498Szrj COSTS_N_INSNS (12), /* SI */ 218*38fd1498Szrj COSTS_N_INSNS (12), /* DI */ 219*38fd1498Szrj COSTS_N_INSNS (12)}, /* other */ 220*38fd1498Szrj 1, /* cost of multiply per each bit set */ 221*38fd1498Szrj {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ 222*38fd1498Szrj COSTS_N_INSNS (40), /* HI */ 223*38fd1498Szrj COSTS_N_INSNS (40), /* SI */ 224*38fd1498Szrj COSTS_N_INSNS (40), /* DI */ 225*38fd1498Szrj COSTS_N_INSNS (40)}, /* other */ 226*38fd1498Szrj COSTS_N_INSNS (3), /* cost of movsx */ 227*38fd1498Szrj COSTS_N_INSNS (2), /* cost of movzx */ 228*38fd1498Szrj 15, /* "large" insn */ 229*38fd1498Szrj 3, /* MOVE_RATIO */ 230*38fd1498Szrj 231*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 232*38fd1498Szrj they are latency*2. */ 233*38fd1498Szrj 4, /* cost for loading QImode using movzbl */ 234*38fd1498Szrj {2, 4, 2}, /* cost of loading integer registers 235*38fd1498Szrj in QImode, HImode and SImode. 236*38fd1498Szrj Relative to reg-reg move (2). 
*/ 237*38fd1498Szrj {2, 4, 2}, /* cost of storing integer registers */ 238*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 239*38fd1498Szrj {8, 8, 8}, /* cost of loading fp registers 240*38fd1498Szrj in SFmode, DFmode and XFmode */ 241*38fd1498Szrj {8, 8, 8}, /* cost of storing fp registers 242*38fd1498Szrj in SFmode, DFmode and XFmode */ 243*38fd1498Szrj 2, /* cost of moving MMX register */ 244*38fd1498Szrj {4, 8}, /* cost of loading MMX registers 245*38fd1498Szrj in SImode and DImode */ 246*38fd1498Szrj {4, 8}, /* cost of storing MMX registers 247*38fd1498Szrj in SImode and DImode */ 248*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 249*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of loading SSE registers 250*38fd1498Szrj in 32,64,128,256 and 512-bit */ 251*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 252*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of storing SSE registers 253*38fd1498Szrj in 32,64,128,256 and 512-bit */ 254*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 255*38fd1498Szrj 3, 3, /* SSE->integer and integer->SSE moves */ 256*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 257*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 258*38fd1498Szrj 4, /* size of l1 cache. 486 has 8kB cache 259*38fd1498Szrj shared for code and data, so 4kB is 260*38fd1498Szrj not really precise. */ 261*38fd1498Szrj 4, /* size of l2 cache */ 262*38fd1498Szrj 0, /* size of prefetch block */ 263*38fd1498Szrj 0, /* number of parallel prefetches */ 264*38fd1498Szrj 1, /* Branch cost */ 265*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 266*38fd1498Szrj COSTS_N_INSNS (16), /* cost of FMUL instruction. */ 267*38fd1498Szrj COSTS_N_INSNS (73), /* cost of FDIV instruction. */ 268*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FABS instruction. */ 269*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 270*38fd1498Szrj COSTS_N_INSNS (83), /* cost of FSQRT instruction. 
*/ 271*38fd1498Szrj 272*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 273*38fd1498Szrj COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 274*38fd1498Szrj COSTS_N_INSNS (16), /* cost of MULSS instruction. */ 275*38fd1498Szrj COSTS_N_INSNS (16), /* cost of MULSD instruction. */ 276*38fd1498Szrj COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ 277*38fd1498Szrj COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ 278*38fd1498Szrj COSTS_N_INSNS (73), /* cost of DIVSS instruction. */ 279*38fd1498Szrj COSTS_N_INSNS (74), /* cost of DIVSD instruction. */ 280*38fd1498Szrj COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ 281*38fd1498Szrj COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ 282*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 283*38fd1498Szrj i486_memcpy, 284*38fd1498Szrj i486_memset, 285*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 286*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 287*38fd1498Szrj }; 288*38fd1498Szrj 289*38fd1498Szrj static stringop_algs pentium_memcpy[2] = { 290*38fd1498Szrj {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 291*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 292*38fd1498Szrj static stringop_algs pentium_memset[2] = { 293*38fd1498Szrj {libcall, {{-1, rep_prefix_4_byte, false}}}, 294*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 295*38fd1498Szrj 296*38fd1498Szrj static const 297*38fd1498Szrj struct processor_costs pentium_cost = { 298*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 299*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 300*38fd1498Szrj COSTS_N_INSNS (4), /* variable shift costs */ 301*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 302*38fd1498Szrj {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 303*38fd1498Szrj COSTS_N_INSNS (11), /* HI */ 304*38fd1498Szrj COSTS_N_INSNS (11), /* SI */ 305*38fd1498Szrj COSTS_N_INSNS (11), /* DI */ 306*38fd1498Szrj COSTS_N_INSNS (11)}, 
/* other */ 307*38fd1498Szrj 0, /* cost of multiply per each bit set */ 308*38fd1498Szrj {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 309*38fd1498Szrj COSTS_N_INSNS (25), /* HI */ 310*38fd1498Szrj COSTS_N_INSNS (25), /* SI */ 311*38fd1498Szrj COSTS_N_INSNS (25), /* DI */ 312*38fd1498Szrj COSTS_N_INSNS (25)}, /* other */ 313*38fd1498Szrj COSTS_N_INSNS (3), /* cost of movsx */ 314*38fd1498Szrj COSTS_N_INSNS (2), /* cost of movzx */ 315*38fd1498Szrj 8, /* "large" insn */ 316*38fd1498Szrj 6, /* MOVE_RATIO */ 317*38fd1498Szrj 318*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 319*38fd1498Szrj they are latency*2. */ 320*38fd1498Szrj 6, /* cost for loading QImode using movzbl */ 321*38fd1498Szrj {2, 4, 2}, /* cost of loading integer registers 322*38fd1498Szrj in QImode, HImode and SImode. 323*38fd1498Szrj Relative to reg-reg move (2). */ 324*38fd1498Szrj {2, 4, 2}, /* cost of storing integer registers */ 325*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 326*38fd1498Szrj {2, 2, 6}, /* cost of loading fp registers 327*38fd1498Szrj in SFmode, DFmode and XFmode */ 328*38fd1498Szrj {4, 4, 6}, /* cost of storing fp registers 329*38fd1498Szrj in SFmode, DFmode and XFmode */ 330*38fd1498Szrj 8, /* cost of moving MMX register */ 331*38fd1498Szrj {8, 8}, /* cost of loading MMX registers 332*38fd1498Szrj in SImode and DImode */ 333*38fd1498Szrj {8, 8}, /* cost of storing MMX registers 334*38fd1498Szrj in SImode and DImode */ 335*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 336*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of loading SSE registers 337*38fd1498Szrj in 32,64,128,256 and 512-bit */ 338*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 339*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of storing SSE registers 340*38fd1498Szrj in 32,64,128,256 and 512-bit */ 341*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned stores. 
*/ 342*38fd1498Szrj 3, 3, /* SSE->integer and integer->SSE moves */ 343*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 344*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 345*38fd1498Szrj 8, /* size of l1 cache. */ 346*38fd1498Szrj 8, /* size of l2 cache */ 347*38fd1498Szrj 0, /* size of prefetch block */ 348*38fd1498Szrj 0, /* number of parallel prefetches */ 349*38fd1498Szrj 2, /* Branch cost */ 350*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 351*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 352*38fd1498Szrj COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 353*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FABS instruction. */ 354*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 355*38fd1498Szrj COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 356*38fd1498Szrj 357*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 358*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 359*38fd1498Szrj COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 360*38fd1498Szrj COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 361*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 362*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 363*38fd1498Szrj COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 364*38fd1498Szrj COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ 365*38fd1498Szrj COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */ 366*38fd1498Szrj COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ 367*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 368*38fd1498Szrj pentium_memcpy, 369*38fd1498Szrj pentium_memset, 370*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 371*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 372*38fd1498Szrj }; 373*38fd1498Szrj 374*38fd1498Szrj static const 375*38fd1498Szrj struct processor_costs lakemont_cost = { 376*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 377*38fd1498Szrj COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 378*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 379*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 380*38fd1498Szrj {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 381*38fd1498Szrj COSTS_N_INSNS (11), /* HI */ 382*38fd1498Szrj COSTS_N_INSNS (11), /* SI */ 383*38fd1498Szrj COSTS_N_INSNS (11), /* DI */ 384*38fd1498Szrj COSTS_N_INSNS (11)}, /* other */ 385*38fd1498Szrj 0, /* cost of multiply per each bit set */ 386*38fd1498Szrj {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 387*38fd1498Szrj COSTS_N_INSNS (25), /* HI */ 388*38fd1498Szrj COSTS_N_INSNS (25), /* SI */ 389*38fd1498Szrj COSTS_N_INSNS (25), /* DI */ 390*38fd1498Szrj COSTS_N_INSNS (25)}, /* other */ 391*38fd1498Szrj COSTS_N_INSNS (3), /* cost of movsx */ 392*38fd1498Szrj COSTS_N_INSNS (2), /* cost of movzx */ 393*38fd1498Szrj 8, /* "large" insn */ 394*38fd1498Szrj 17, /* MOVE_RATIO */ 395*38fd1498Szrj 396*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 397*38fd1498Szrj they are latency*2. */ 398*38fd1498Szrj 6, /* cost for loading QImode using movzbl */ 399*38fd1498Szrj {2, 4, 2}, /* cost of loading integer registers 400*38fd1498Szrj in QImode, HImode and SImode. 401*38fd1498Szrj Relative to reg-reg move (2). 
*/ 402*38fd1498Szrj {2, 4, 2}, /* cost of storing integer registers */ 403*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 404*38fd1498Szrj {2, 2, 6}, /* cost of loading fp registers 405*38fd1498Szrj in SFmode, DFmode and XFmode */ 406*38fd1498Szrj {4, 4, 6}, /* cost of storing fp registers 407*38fd1498Szrj in SFmode, DFmode and XFmode */ 408*38fd1498Szrj 8, /* cost of moving MMX register */ 409*38fd1498Szrj {8, 8}, /* cost of loading MMX registers 410*38fd1498Szrj in SImode and DImode */ 411*38fd1498Szrj {8, 8}, /* cost of storing MMX registers 412*38fd1498Szrj in SImode and DImode */ 413*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 414*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of loading SSE registers 415*38fd1498Szrj in 32,64,128,256 and 512-bit */ 416*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 417*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of storing SSE registers 418*38fd1498Szrj in 32,64,128,256 and 512-bit */ 419*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 420*38fd1498Szrj 3, 3, /* SSE->integer and integer->SSE moves */ 421*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 422*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 423*38fd1498Szrj 8, /* size of l1 cache. */ 424*38fd1498Szrj 8, /* size of l2 cache */ 425*38fd1498Szrj 0, /* size of prefetch block */ 426*38fd1498Szrj 0, /* number of parallel prefetches */ 427*38fd1498Szrj 2, /* Branch cost */ 428*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 429*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 430*38fd1498Szrj COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 431*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FABS instruction. */ 432*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 433*38fd1498Szrj COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 434*38fd1498Szrj 435*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. 
*/ 436*38fd1498Szrj COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 437*38fd1498Szrj COSTS_N_INSNS (5), /* cost of MULSS instruction. */ 438*38fd1498Szrj COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 439*38fd1498Szrj COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ 440*38fd1498Szrj COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ 441*38fd1498Szrj COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 442*38fd1498Szrj COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 443*38fd1498Szrj COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 444*38fd1498Szrj COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 445*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 446*38fd1498Szrj pentium_memcpy, 447*38fd1498Szrj pentium_memset, 448*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 449*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 450*38fd1498Szrj }; 451*38fd1498Szrj 452*38fd1498Szrj /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes 453*38fd1498Szrj (we ensure the alignment). For small blocks inline loop is still a 454*38fd1498Szrj noticeable win, for bigger blocks either rep movsl or rep movsb is 455*38fd1498Szrj way to go. Rep movsb has apparently more expensive startup time in CPU, 456*38fd1498Szrj but after 4K the difference is down in the noise. 
*/ 457*38fd1498Szrj static stringop_algs pentiumpro_memcpy[2] = { 458*38fd1498Szrj {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, 459*38fd1498Szrj {8192, rep_prefix_4_byte, false}, 460*38fd1498Szrj {-1, rep_prefix_1_byte, false}}}, 461*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 462*38fd1498Szrj static stringop_algs pentiumpro_memset[2] = { 463*38fd1498Szrj {rep_prefix_4_byte, {{1024, unrolled_loop, false}, 464*38fd1498Szrj {8192, rep_prefix_4_byte, false}, 465*38fd1498Szrj {-1, libcall, false}}}, 466*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 467*38fd1498Szrj static const 468*38fd1498Szrj struct processor_costs pentiumpro_cost = { 469*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 470*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 471*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 472*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 473*38fd1498Szrj {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 474*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 475*38fd1498Szrj COSTS_N_INSNS (4), /* SI */ 476*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 477*38fd1498Szrj COSTS_N_INSNS (4)}, /* other */ 478*38fd1498Szrj 0, /* cost of multiply per each bit set */ 479*38fd1498Szrj {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ 480*38fd1498Szrj COSTS_N_INSNS (17), /* HI */ 481*38fd1498Szrj COSTS_N_INSNS (17), /* SI */ 482*38fd1498Szrj COSTS_N_INSNS (17), /* DI */ 483*38fd1498Szrj COSTS_N_INSNS (17)}, /* other */ 484*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 485*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 486*38fd1498Szrj 8, /* "large" insn */ 487*38fd1498Szrj 6, /* MOVE_RATIO */ 488*38fd1498Szrj 489*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 490*38fd1498Szrj they are latency*2. */ 491*38fd1498Szrj 2, /* cost for loading QImode using movzbl */ 492*38fd1498Szrj {4, 4, 4}, /* cost of loading integer registers 493*38fd1498Szrj in QImode, HImode and SImode. 
494*38fd1498Szrj Relative to reg-reg move (2). */ 495*38fd1498Szrj {2, 2, 2}, /* cost of storing integer registers */ 496*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 497*38fd1498Szrj {2, 2, 6}, /* cost of loading fp registers 498*38fd1498Szrj in SFmode, DFmode and XFmode */ 499*38fd1498Szrj {4, 4, 6}, /* cost of storing fp registers 500*38fd1498Szrj in SFmode, DFmode and XFmode */ 501*38fd1498Szrj 2, /* cost of moving MMX register */ 502*38fd1498Szrj {2, 2}, /* cost of loading MMX registers 503*38fd1498Szrj in SImode and DImode */ 504*38fd1498Szrj {2, 2}, /* cost of storing MMX registers 505*38fd1498Szrj in SImode and DImode */ 506*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 507*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of loading SSE registers 508*38fd1498Szrj in 32,64,128,256 and 512-bit */ 509*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 510*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of storing SSE registers 511*38fd1498Szrj in 32,64,128,256 and 512-bit */ 512*38fd1498Szrj {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 513*38fd1498Szrj 3, 3, /* SSE->integer and integer->SSE moves */ 514*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 515*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 516*38fd1498Szrj 8, /* size of l1 cache. */ 517*38fd1498Szrj 256, /* size of l2 cache */ 518*38fd1498Szrj 32, /* size of prefetch block */ 519*38fd1498Szrj 6, /* number of parallel prefetches */ 520*38fd1498Szrj 2, /* Branch cost */ 521*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 522*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 523*38fd1498Szrj COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 524*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 525*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 526*38fd1498Szrj COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 527*38fd1498Szrj 528*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. 
*/ 529*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 530*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 531*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 532*38fd1498Szrj COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 533*38fd1498Szrj COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 534*38fd1498Szrj COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 535*38fd1498Szrj COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ 536*38fd1498Szrj COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 537*38fd1498Szrj COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ 538*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 539*38fd1498Szrj pentiumpro_memcpy, 540*38fd1498Szrj pentiumpro_memset, 541*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 542*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 543*38fd1498Szrj }; 544*38fd1498Szrj 545*38fd1498Szrj static stringop_algs geode_memcpy[2] = { 546*38fd1498Szrj {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 547*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 548*38fd1498Szrj static stringop_algs geode_memset[2] = { 549*38fd1498Szrj {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 550*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 551*38fd1498Szrj static const 552*38fd1498Szrj struct processor_costs geode_cost = { 553*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 554*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 555*38fd1498Szrj COSTS_N_INSNS (2), /* variable shift costs */ 556*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 557*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 558*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 559*38fd1498Szrj COSTS_N_INSNS (7), /* SI */ 560*38fd1498Szrj COSTS_N_INSNS (7), /* DI */ 561*38fd1498Szrj COSTS_N_INSNS (7)}, /* other */ 562*38fd1498Szrj 0, /* cost of multiply per each bit set */ 
563*38fd1498Szrj {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ 564*38fd1498Szrj COSTS_N_INSNS (23), /* HI */ 565*38fd1498Szrj COSTS_N_INSNS (39), /* SI */ 566*38fd1498Szrj COSTS_N_INSNS (39), /* DI */ 567*38fd1498Szrj COSTS_N_INSNS (39)}, /* other */ 568*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 569*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 570*38fd1498Szrj 8, /* "large" insn */ 571*38fd1498Szrj 4, /* MOVE_RATIO */ 572*38fd1498Szrj 573*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 574*38fd1498Szrj they are latency*2. */ 575*38fd1498Szrj 2, /* cost for loading QImode using movzbl */ 576*38fd1498Szrj {2, 2, 2}, /* cost of loading integer registers 577*38fd1498Szrj in QImode, HImode and SImode. 578*38fd1498Szrj Relative to reg-reg move (2). */ 579*38fd1498Szrj {2, 2, 2}, /* cost of storing integer registers */ 580*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 581*38fd1498Szrj {2, 2, 2}, /* cost of loading fp registers 582*38fd1498Szrj in SFmode, DFmode and XFmode */ 583*38fd1498Szrj {4, 6, 6}, /* cost of storing fp registers 584*38fd1498Szrj in SFmode, DFmode and XFmode */ 585*38fd1498Szrj 586*38fd1498Szrj 2, /* cost of moving MMX register */ 587*38fd1498Szrj {2, 2}, /* cost of loading MMX registers 588*38fd1498Szrj in SImode and DImode */ 589*38fd1498Szrj {2, 2}, /* cost of storing MMX registers 590*38fd1498Szrj in SImode and DImode */ 591*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 592*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of loading SSE registers 593*38fd1498Szrj in 32,64,128,256 and 512-bit */ 594*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 595*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of storing SSE registers 596*38fd1498Szrj in 32,64,128,256 and 512-bit */ 597*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 598*38fd1498Szrj 6, 6, /* SSE->integer and integer->SSE moves */ 599*38fd1498Szrj 2, 2, /* Gather load static, per_elt. 
*/ 600*38fd1498Szrj 2, 2, /* Gather store static, per_elt. */ 601*38fd1498Szrj 64, /* size of l1 cache. */ 602*38fd1498Szrj 128, /* size of l2 cache. */ 603*38fd1498Szrj 32, /* size of prefetch block */ 604*38fd1498Szrj 1, /* number of parallel prefetches */ 605*38fd1498Szrj 1, /* Branch cost */ 606*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 607*38fd1498Szrj COSTS_N_INSNS (11), /* cost of FMUL instruction. */ 608*38fd1498Szrj COSTS_N_INSNS (47), /* cost of FDIV instruction. */ 609*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FABS instruction. */ 610*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 611*38fd1498Szrj COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ 612*38fd1498Szrj 613*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 614*38fd1498Szrj COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 615*38fd1498Szrj COSTS_N_INSNS (11), /* cost of MULSS instruction. */ 616*38fd1498Szrj COSTS_N_INSNS (11), /* cost of MULSD instruction. */ 617*38fd1498Szrj COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ 618*38fd1498Szrj COSTS_N_INSNS (17), /* cost of FMA SD instruction. */ 619*38fd1498Szrj COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ 620*38fd1498Szrj COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ 621*38fd1498Szrj COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */ 622*38fd1498Szrj COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ 623*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 624*38fd1498Szrj geode_memcpy, 625*38fd1498Szrj geode_memset, 626*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 627*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 628*38fd1498Szrj }; 629*38fd1498Szrj 630*38fd1498Szrj static stringop_algs k6_memcpy[2] = { 631*38fd1498Szrj {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 632*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 633*38fd1498Szrj static stringop_algs k6_memset[2] = { 634*38fd1498Szrj {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 635*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 636*38fd1498Szrj static const 637*38fd1498Szrj struct processor_costs k6_cost = { 638*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 639*38fd1498Szrj COSTS_N_INSNS (2), /* cost of a lea instruction */ 640*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 641*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 642*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 643*38fd1498Szrj COSTS_N_INSNS (3), /* HI */ 644*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 645*38fd1498Szrj COSTS_N_INSNS (3), /* DI */ 646*38fd1498Szrj COSTS_N_INSNS (3)}, /* other */ 647*38fd1498Szrj 0, /* cost of multiply per each bit set */ 648*38fd1498Szrj {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 649*38fd1498Szrj COSTS_N_INSNS (18), /* HI */ 650*38fd1498Szrj COSTS_N_INSNS (18), /* SI */ 651*38fd1498Szrj COSTS_N_INSNS (18), /* DI */ 652*38fd1498Szrj COSTS_N_INSNS (18)}, /* other */ 653*38fd1498Szrj COSTS_N_INSNS (2), /* cost of movsx */ 654*38fd1498Szrj COSTS_N_INSNS (2), /* cost of movzx */ 655*38fd1498Szrj 8, /* "large" insn */ 656*38fd1498Szrj 4, /* MOVE_RATIO */ 657*38fd1498Szrj 658*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 659*38fd1498Szrj they are latency*2. */ 660*38fd1498Szrj 3, /* cost for loading QImode using movzbl */ 661*38fd1498Szrj {4, 5, 4}, /* cost of loading integer registers 662*38fd1498Szrj in QImode, HImode and SImode. 663*38fd1498Szrj Relative to reg-reg move (2). 
*/ 664*38fd1498Szrj {2, 3, 2}, /* cost of storing integer registers */ 665*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 666*38fd1498Szrj {6, 6, 6}, /* cost of loading fp registers 667*38fd1498Szrj in SFmode, DFmode and XFmode */ 668*38fd1498Szrj {4, 4, 4}, /* cost of storing fp registers 669*38fd1498Szrj in SFmode, DFmode and XFmode */ 670*38fd1498Szrj 2, /* cost of moving MMX register */ 671*38fd1498Szrj {2, 2}, /* cost of loading MMX registers 672*38fd1498Szrj in SImode and DImode */ 673*38fd1498Szrj {2, 2}, /* cost of storing MMX registers 674*38fd1498Szrj in SImode and DImode */ 675*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 676*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of loading SSE registers 677*38fd1498Szrj in 32,64,128,256 and 512-bit */ 678*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 679*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of storing SSE registers 680*38fd1498Szrj in 32,64,128,256 and 512-bit */ 681*38fd1498Szrj {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 682*38fd1498Szrj 6, 6, /* SSE->integer and integer->SSE moves */ 683*38fd1498Szrj 2, 2, /* Gather load static, per_elt. */ 684*38fd1498Szrj 2, 2, /* Gather store static, per_elt. */ 685*38fd1498Szrj 32, /* size of l1 cache. */ 686*38fd1498Szrj 32, /* size of l2 cache. Some models 687*38fd1498Szrj have integrated l2 cache, but 688*38fd1498Szrj optimizing for k6 is not important 689*38fd1498Szrj enough to worry about that. */ 690*38fd1498Szrj 32, /* size of prefetch block */ 691*38fd1498Szrj 1, /* number of parallel prefetches */ 692*38fd1498Szrj 1, /* Branch cost */ 693*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ 694*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FMUL instruction. */ 695*38fd1498Szrj COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 696*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 697*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. 
*/ 698*38fd1498Szrj COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 699*38fd1498Szrj 700*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 701*38fd1498Szrj COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 702*38fd1498Szrj COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 703*38fd1498Szrj COSTS_N_INSNS (2), /* cost of MULSD instruction. */ 704*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 705*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 706*38fd1498Szrj COSTS_N_INSNS (56), /* cost of DIVSS instruction. */ 707*38fd1498Szrj COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ 708*38fd1498Szrj COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ 709*38fd1498Szrj COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ 710*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 711*38fd1498Szrj k6_memcpy, 712*38fd1498Szrj k6_memset, 713*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 714*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 715*38fd1498Szrj }; 716*38fd1498Szrj 717*38fd1498Szrj /* For some reason, Athlon deals better with REP prefix (relative to loops) 718*38fd1498Szrj compared to K8. Alignment becomes important after 8 bytes for memcpy and 719*38fd1498Szrj 128 bytes for memset. 
*/ 720*38fd1498Szrj static stringop_algs athlon_memcpy[2] = { 721*38fd1498Szrj {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 722*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 723*38fd1498Szrj static stringop_algs athlon_memset[2] = { 724*38fd1498Szrj {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 725*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 726*38fd1498Szrj static const 727*38fd1498Szrj struct processor_costs athlon_cost = { 728*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 729*38fd1498Szrj COSTS_N_INSNS (2), /* cost of a lea instruction */ 730*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 731*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 732*38fd1498Szrj {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ 733*38fd1498Szrj COSTS_N_INSNS (5), /* HI */ 734*38fd1498Szrj COSTS_N_INSNS (5), /* SI */ 735*38fd1498Szrj COSTS_N_INSNS (5), /* DI */ 736*38fd1498Szrj COSTS_N_INSNS (5)}, /* other */ 737*38fd1498Szrj 0, /* cost of multiply per each bit set */ 738*38fd1498Szrj {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 739*38fd1498Szrj COSTS_N_INSNS (26), /* HI */ 740*38fd1498Szrj COSTS_N_INSNS (42), /* SI */ 741*38fd1498Szrj COSTS_N_INSNS (74), /* DI */ 742*38fd1498Szrj COSTS_N_INSNS (74)}, /* other */ 743*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 744*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 745*38fd1498Szrj 8, /* "large" insn */ 746*38fd1498Szrj 9, /* MOVE_RATIO */ 747*38fd1498Szrj 748*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 749*38fd1498Szrj they are latency*2. */ 750*38fd1498Szrj 4, /* cost for loading QImode using movzbl */ 751*38fd1498Szrj {3, 4, 3}, /* cost of loading integer registers 752*38fd1498Szrj in QImode, HImode and SImode. 753*38fd1498Szrj Relative to reg-reg move (2). 
*/ 754*38fd1498Szrj {3, 4, 3}, /* cost of storing integer registers */ 755*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 756*38fd1498Szrj {4, 4, 12}, /* cost of loading fp registers 757*38fd1498Szrj in SFmode, DFmode and XFmode */ 758*38fd1498Szrj {6, 6, 8}, /* cost of storing fp registers 759*38fd1498Szrj in SFmode, DFmode and XFmode */ 760*38fd1498Szrj 2, /* cost of moving MMX register */ 761*38fd1498Szrj {4, 4}, /* cost of loading MMX registers 762*38fd1498Szrj in SImode and DImode */ 763*38fd1498Szrj {4, 4}, /* cost of storing MMX registers 764*38fd1498Szrj in SImode and DImode */ 765*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 766*38fd1498Szrj {4, 4, 6, 12, 24}, /* cost of loading SSE registers 767*38fd1498Szrj in 32,64,128,256 and 512-bit */ 768*38fd1498Szrj {4, 4, 6, 12, 24}, /* cost of unaligned loads. */ 769*38fd1498Szrj {4, 4, 5, 10, 20}, /* cost of storing SSE registers 770*38fd1498Szrj in 32,64,128,256 and 512-bit */ 771*38fd1498Szrj {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 772*38fd1498Szrj 5, 5, /* SSE->integer and integer->SSE moves */ 773*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 774*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 775*38fd1498Szrj 64, /* size of l1 cache. */ 776*38fd1498Szrj 256, /* size of l2 cache. */ 777*38fd1498Szrj 64, /* size of prefetch block */ 778*38fd1498Szrj 6, /* number of parallel prefetches */ 779*38fd1498Szrj 5, /* Branch cost */ 780*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 781*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 782*38fd1498Szrj COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 783*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 784*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 785*38fd1498Szrj COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 786*38fd1498Szrj 787*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. 
*/ 788*38fd1498Szrj COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 789*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 790*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 791*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 792*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 793*38fd1498Szrj /* 11-16 */ 794*38fd1498Szrj COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 795*38fd1498Szrj COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ 796*38fd1498Szrj COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 797*38fd1498Szrj COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ 798*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 799*38fd1498Szrj athlon_memcpy, 800*38fd1498Szrj athlon_memset, 801*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 802*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 803*38fd1498Szrj }; 804*38fd1498Szrj 805*38fd1498Szrj /* K8 has optimized REP instruction for medium sized blocks, but for very 806*38fd1498Szrj small blocks it is better to use loop. For large blocks, libcall can 807*38fd1498Szrj do nontemporary accesses and beat inline considerably. 
*/ 808*38fd1498Szrj static stringop_algs k8_memcpy[2] = { 809*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 810*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 811*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 812*38fd1498Szrj {-1, libcall, false}}}}; 813*38fd1498Szrj static stringop_algs k8_memset[2] = { 814*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 815*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 816*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, 817*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 818*38fd1498Szrj static const 819*38fd1498Szrj struct processor_costs k8_cost = { 820*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 821*38fd1498Szrj COSTS_N_INSNS (2), /* cost of a lea instruction */ 822*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 823*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 824*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 825*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 826*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 827*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 828*38fd1498Szrj COSTS_N_INSNS (5)}, /* other */ 829*38fd1498Szrj 0, /* cost of multiply per each bit set */ 830*38fd1498Szrj {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 831*38fd1498Szrj COSTS_N_INSNS (26), /* HI */ 832*38fd1498Szrj COSTS_N_INSNS (42), /* SI */ 833*38fd1498Szrj COSTS_N_INSNS (74), /* DI */ 834*38fd1498Szrj COSTS_N_INSNS (74)}, /* other */ 835*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 836*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 837*38fd1498Szrj 8, /* "large" insn */ 838*38fd1498Szrj 9, /* MOVE_RATIO */ 839*38fd1498Szrj 840*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 841*38fd1498Szrj they are latency*2. 
*/ 842*38fd1498Szrj 4, /* cost for loading QImode using movzbl */ 843*38fd1498Szrj {3, 4, 3}, /* cost of loading integer registers 844*38fd1498Szrj in QImode, HImode and SImode. 845*38fd1498Szrj Relative to reg-reg move (2). */ 846*38fd1498Szrj {3, 4, 3}, /* cost of storing integer registers */ 847*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 848*38fd1498Szrj {4, 4, 12}, /* cost of loading fp registers 849*38fd1498Szrj in SFmode, DFmode and XFmode */ 850*38fd1498Szrj {6, 6, 8}, /* cost of storing fp registers 851*38fd1498Szrj in SFmode, DFmode and XFmode */ 852*38fd1498Szrj 2, /* cost of moving MMX register */ 853*38fd1498Szrj {3, 3}, /* cost of loading MMX registers 854*38fd1498Szrj in SImode and DImode */ 855*38fd1498Szrj {4, 4}, /* cost of storing MMX registers 856*38fd1498Szrj in SImode and DImode */ 857*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 858*38fd1498Szrj {4, 3, 6, 12, 24}, /* cost of loading SSE registers 859*38fd1498Szrj in 32,64,128,256 and 512-bit */ 860*38fd1498Szrj {4, 3, 6, 12, 24}, /* cost of unaligned loads. */ 861*38fd1498Szrj {4, 4, 5, 10, 20}, /* cost of storing SSE registers 862*38fd1498Szrj in 32,64,128,256 and 512-bit */ 863*38fd1498Szrj {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 864*38fd1498Szrj 5, 5, /* SSE->integer and integer->SSE moves */ 865*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 866*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 867*38fd1498Szrj 64, /* size of l1 cache. */ 868*38fd1498Szrj 512, /* size of l2 cache. */ 869*38fd1498Szrj 64, /* size of prefetch block */ 870*38fd1498Szrj /* New AMD processors never drop prefetches; if they cannot be performed 871*38fd1498Szrj immediately, they are queued. We set number of simultaneous prefetches 872*38fd1498Szrj to a large constant to reflect this (it probably is not a good idea not 873*38fd1498Szrj to limit number of prefetches at all, as their execution also takes some 874*38fd1498Szrj time). 
*/ 875*38fd1498Szrj 100, /* number of parallel prefetches */ 876*38fd1498Szrj 3, /* Branch cost */ 877*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 878*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 879*38fd1498Szrj COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 880*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 881*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 882*38fd1498Szrj COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 883*38fd1498Szrj 884*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 885*38fd1498Szrj COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 886*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 887*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 888*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 889*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 890*38fd1498Szrj /* 11-16 */ 891*38fd1498Szrj COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 892*38fd1498Szrj COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 893*38fd1498Szrj COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 894*38fd1498Szrj COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 895*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 896*38fd1498Szrj k8_memcpy, 897*38fd1498Szrj k8_memset, 898*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 899*38fd1498Szrj COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 900*38fd1498Szrj }; 901*38fd1498Szrj 902*38fd1498Szrj /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for 903*38fd1498Szrj very small blocks it is better to use loop. For large blocks, libcall can 904*38fd1498Szrj do nontemporary accesses and beat inline considerably. 
*/ 905*38fd1498Szrj static stringop_algs amdfam10_memcpy[2] = { 906*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 907*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 908*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 909*38fd1498Szrj {-1, libcall, false}}}}; 910*38fd1498Szrj static stringop_algs amdfam10_memset[2] = { 911*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 912*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 913*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 914*38fd1498Szrj {-1, libcall, false}}}}; 915*38fd1498Szrj struct processor_costs amdfam10_cost = { 916*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 917*38fd1498Szrj COSTS_N_INSNS (2), /* cost of a lea instruction */ 918*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 919*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 920*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 921*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 922*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 923*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 924*38fd1498Szrj COSTS_N_INSNS (5)}, /* other */ 925*38fd1498Szrj 0, /* cost of multiply per each bit set */ 926*38fd1498Szrj {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 927*38fd1498Szrj COSTS_N_INSNS (35), /* HI */ 928*38fd1498Szrj COSTS_N_INSNS (51), /* SI */ 929*38fd1498Szrj COSTS_N_INSNS (83), /* DI */ 930*38fd1498Szrj COSTS_N_INSNS (83)}, /* other */ 931*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 932*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 933*38fd1498Szrj 8, /* "large" insn */ 934*38fd1498Szrj 9, /* MOVE_RATIO */ 935*38fd1498Szrj 936*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 937*38fd1498Szrj they are latency*2. 
*/ 938*38fd1498Szrj 4, /* cost for loading QImode using movzbl */ 939*38fd1498Szrj {3, 4, 3}, /* cost of loading integer registers 940*38fd1498Szrj in QImode, HImode and SImode. 941*38fd1498Szrj Relative to reg-reg move (2). */ 942*38fd1498Szrj {3, 4, 3}, /* cost of storing integer registers */ 943*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 944*38fd1498Szrj {4, 4, 12}, /* cost of loading fp registers 945*38fd1498Szrj in SFmode, DFmode and XFmode */ 946*38fd1498Szrj {6, 6, 8}, /* cost of storing fp registers 947*38fd1498Szrj in SFmode, DFmode and XFmode */ 948*38fd1498Szrj 2, /* cost of moving MMX register */ 949*38fd1498Szrj {3, 3}, /* cost of loading MMX registers 950*38fd1498Szrj in SImode and DImode */ 951*38fd1498Szrj {4, 4}, /* cost of storing MMX registers 952*38fd1498Szrj in SImode and DImode */ 953*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 954*38fd1498Szrj {4, 4, 3, 6, 12}, /* cost of loading SSE registers 955*38fd1498Szrj in 32,64,128,256 and 512-bit */ 956*38fd1498Szrj {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ 957*38fd1498Szrj {4, 4, 5, 10, 20}, /* cost of storing SSE registers 958*38fd1498Szrj in 32,64,128,256 and 512-bit */ 959*38fd1498Szrj {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 960*38fd1498Szrj 3, 3, /* SSE->integer and integer->SSE moves */ 961*38fd1498Szrj /* On K8: 962*38fd1498Szrj MOVD reg64, xmmreg Double FSTORE 4 963*38fd1498Szrj MOVD reg32, xmmreg Double FSTORE 4 964*38fd1498Szrj On AMDFAM10: 965*38fd1498Szrj MOVD reg64, xmmreg Double FADD 3 966*38fd1498Szrj 1/1 1/1 967*38fd1498Szrj MOVD reg32, xmmreg Double FADD 3 968*38fd1498Szrj 1/1 1/1 */ 969*38fd1498Szrj 4, 4, /* Gather load static, per_elt. */ 970*38fd1498Szrj 4, 4, /* Gather store static, per_elt. */ 971*38fd1498Szrj 64, /* size of l1 cache. */ 972*38fd1498Szrj 512, /* size of l2 cache. 
*/ 973*38fd1498Szrj 64, /* size of prefetch block */ 974*38fd1498Szrj /* New AMD processors never drop prefetches; if they cannot be performed 975*38fd1498Szrj immediately, they are queued. We set number of simultaneous prefetches 976*38fd1498Szrj to a large constant to reflect this (it probably is not a good idea not 977*38fd1498Szrj to limit number of prefetches at all, as their execution also takes some 978*38fd1498Szrj time). */ 979*38fd1498Szrj 100, /* number of parallel prefetches */ 980*38fd1498Szrj 2, /* Branch cost */ 981*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 982*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 983*38fd1498Szrj COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 984*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 985*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 986*38fd1498Szrj COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 987*38fd1498Szrj 988*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 989*38fd1498Szrj COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 990*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 991*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 992*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 993*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 994*38fd1498Szrj /* 11-16 */ 995*38fd1498Szrj COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 996*38fd1498Szrj COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 997*38fd1498Szrj COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 998*38fd1498Szrj COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 999*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1000*38fd1498Szrj amdfam10_memcpy, 1001*38fd1498Szrj amdfam10_memset, 1002*38fd1498Szrj COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1003*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 1004*38fd1498Szrj }; 1005*38fd1498Szrj 1006*38fd1498Szrj /* BDVER1 has optimized REP instruction for medium sized blocks, but for 1007*38fd1498Szrj very small blocks it is better to use loop. For large blocks, libcall 1008*38fd1498Szrj can do nontemporary accesses and beat inline considerably. */ 1009*38fd1498Szrj static stringop_algs bdver1_memcpy[2] = { 1010*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1011*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 1012*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1013*38fd1498Szrj {-1, libcall, false}}}}; 1014*38fd1498Szrj static stringop_algs bdver1_memset[2] = { 1015*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1016*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1017*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1018*38fd1498Szrj {-1, libcall, false}}}}; 1019*38fd1498Szrj 1020*38fd1498Szrj const struct processor_costs bdver1_cost = { 1021*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1022*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 1023*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1024*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1025*38fd1498Szrj {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1026*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1027*38fd1498Szrj COSTS_N_INSNS (4), /* SI */ 1028*38fd1498Szrj COSTS_N_INSNS (6), /* DI */ 1029*38fd1498Szrj COSTS_N_INSNS (6)}, /* other */ 1030*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1031*38fd1498Szrj {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1032*38fd1498Szrj COSTS_N_INSNS (35), /* HI */ 1033*38fd1498Szrj COSTS_N_INSNS (51), /* SI */ 1034*38fd1498Szrj COSTS_N_INSNS (83), /* DI */ 1035*38fd1498Szrj COSTS_N_INSNS (83)}, /* other */ 1036*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1037*38fd1498Szrj COSTS_N_INSNS (1), /* cost 
of movzx */ 1038*38fd1498Szrj 8, /* "large" insn */ 1039*38fd1498Szrj 9, /* MOVE_RATIO */ 1040*38fd1498Szrj 1041*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1042*38fd1498Szrj they are latency*2. */ 1043*38fd1498Szrj 8, /* cost for loading QImode using movzbl */ 1044*38fd1498Szrj {8, 8, 8}, /* cost of loading integer registers 1045*38fd1498Szrj in QImode, HImode and SImode. 1046*38fd1498Szrj Relative to reg-reg move (2). */ 1047*38fd1498Szrj {8, 8, 8}, /* cost of storing integer registers */ 1048*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 1049*38fd1498Szrj {12, 12, 28}, /* cost of loading fp registers 1050*38fd1498Szrj in SFmode, DFmode and XFmode */ 1051*38fd1498Szrj {10, 10, 18}, /* cost of storing fp registers 1052*38fd1498Szrj in SFmode, DFmode and XFmode */ 1053*38fd1498Szrj 4, /* cost of moving MMX register */ 1054*38fd1498Szrj {12, 12}, /* cost of loading MMX registers 1055*38fd1498Szrj in SImode and DImode */ 1056*38fd1498Szrj {10, 10}, /* cost of storing MMX registers 1057*38fd1498Szrj in SImode and DImode */ 1058*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1059*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1060*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1061*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1062*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1063*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1064*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1065*38fd1498Szrj 16, 20, /* SSE->integer and integer->SSE moves */ 1066*38fd1498Szrj 12, 12, /* Gather load static, per_elt. */ 1067*38fd1498Szrj 10, 10, /* Gather store static, per_elt. */ 1068*38fd1498Szrj 16, /* size of l1 cache. */ 1069*38fd1498Szrj 2048, /* size of l2 cache. 
*/ 1070*38fd1498Szrj 64, /* size of prefetch block */ 1071*38fd1498Szrj /* New AMD processors never drop prefetches; if they cannot be performed 1072*38fd1498Szrj immediately, they are queued. We set number of simultaneous prefetches 1073*38fd1498Szrj to a large constant to reflect this (it probably is not a good idea not 1074*38fd1498Szrj to limit number of prefetches at all, as their execution also takes some 1075*38fd1498Szrj time). */ 1076*38fd1498Szrj 100, /* number of parallel prefetches */ 1077*38fd1498Szrj 2, /* Branch cost */ 1078*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1079*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1080*38fd1498Szrj COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1081*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1082*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1083*38fd1498Szrj COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1084*38fd1498Szrj 1085*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1086*38fd1498Szrj COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1087*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1088*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1089*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1090*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1091*38fd1498Szrj /* 9-24 */ 1092*38fd1498Szrj COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1093*38fd1498Szrj /* 9-27 */ 1094*38fd1498Szrj COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1095*38fd1498Szrj COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1096*38fd1498Szrj COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1097*38fd1498Szrj 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1098*38fd1498Szrj bdver1_memcpy, 1099*38fd1498Szrj bdver1_memset, 1100*38fd1498Szrj COSTS_N_INSNS (4), /* cond_taken_branch_cost. 
*/ 1101*38fd1498Szrj COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1102*38fd1498Szrj }; 1103*38fd1498Szrj 1104*38fd1498Szrj /* BDVER2 has optimized REP instruction for medium sized blocks, but for 1105*38fd1498Szrj very small blocks it is better to use loop. For large blocks, libcall 1106*38fd1498Szrj can do nontemporary accesses and beat inline considerably. */ 1107*38fd1498Szrj 1108*38fd1498Szrj static stringop_algs bdver2_memcpy[2] = { 1109*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1110*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 1111*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1112*38fd1498Szrj {-1, libcall, false}}}}; 1113*38fd1498Szrj static stringop_algs bdver2_memset[2] = { 1114*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1115*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1116*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1117*38fd1498Szrj {-1, libcall, false}}}}; 1118*38fd1498Szrj 1119*38fd1498Szrj const struct processor_costs bdver2_cost = { 1120*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1121*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 1122*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1123*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1124*38fd1498Szrj {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1125*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1126*38fd1498Szrj COSTS_N_INSNS (4), /* SI */ 1127*38fd1498Szrj COSTS_N_INSNS (6), /* DI */ 1128*38fd1498Szrj COSTS_N_INSNS (6)}, /* other */ 1129*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1130*38fd1498Szrj {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1131*38fd1498Szrj COSTS_N_INSNS (35), /* HI */ 1132*38fd1498Szrj COSTS_N_INSNS (51), /* SI */ 1133*38fd1498Szrj COSTS_N_INSNS (83), /* DI */ 1134*38fd1498Szrj COSTS_N_INSNS (83)}, /* other */ 
1135*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1136*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1137*38fd1498Szrj 8, /* "large" insn */ 1138*38fd1498Szrj 9, /* MOVE_RATIO */ 1139*38fd1498Szrj 1140*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1141*38fd1498Szrj they are latency*2. */ 1142*38fd1498Szrj 8, /* cost for loading QImode using movzbl */ 1143*38fd1498Szrj {8, 8, 8}, /* cost of loading integer registers 1144*38fd1498Szrj in QImode, HImode and SImode. 1145*38fd1498Szrj Relative to reg-reg move (2). */ 1146*38fd1498Szrj {8, 8, 8}, /* cost of storing integer registers */ 1147*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 1148*38fd1498Szrj {12, 12, 28}, /* cost of loading fp registers 1149*38fd1498Szrj in SFmode, DFmode and XFmode */ 1150*38fd1498Szrj {10, 10, 18}, /* cost of storing fp registers 1151*38fd1498Szrj in SFmode, DFmode and XFmode */ 1152*38fd1498Szrj 4, /* cost of moving MMX register */ 1153*38fd1498Szrj {12, 12}, /* cost of loading MMX registers 1154*38fd1498Szrj in SImode and DImode */ 1155*38fd1498Szrj {10, 10}, /* cost of storing MMX registers 1156*38fd1498Szrj in SImode and DImode */ 1157*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1158*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1159*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1160*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1161*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1162*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1163*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1164*38fd1498Szrj 16, 20, /* SSE->integer and integer->SSE moves */ 1165*38fd1498Szrj 12, 12, /* Gather load static, per_elt. */ 1166*38fd1498Szrj 10, 10, /* Gather store static, per_elt. */ 1167*38fd1498Szrj 16, /* size of l1 cache. */ 1168*38fd1498Szrj 2048, /* size of l2 cache. 
*/ 1169*38fd1498Szrj 64, /* size of prefetch block */ 1170*38fd1498Szrj /* New AMD processors never drop prefetches; if they cannot be performed 1171*38fd1498Szrj immediately, they are queued. We set number of simultaneous prefetches 1172*38fd1498Szrj to a large constant to reflect this (it probably is not a good idea not 1173*38fd1498Szrj to limit number of prefetches at all, as their execution also takes some 1174*38fd1498Szrj time). */ 1175*38fd1498Szrj 100, /* number of parallel prefetches */ 1176*38fd1498Szrj 2, /* Branch cost */ 1177*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1178*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1179*38fd1498Szrj COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1180*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1181*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1182*38fd1498Szrj COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1183*38fd1498Szrj 1184*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1185*38fd1498Szrj COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1186*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1187*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1188*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1189*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1190*38fd1498Szrj /* 9-24 */ 1191*38fd1498Szrj COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1192*38fd1498Szrj /* 9-27 */ 1193*38fd1498Szrj COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1194*38fd1498Szrj COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1195*38fd1498Szrj COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1196*38fd1498Szrj 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1197*38fd1498Szrj bdver2_memcpy, 1198*38fd1498Szrj bdver2_memset, 1199*38fd1498Szrj COSTS_N_INSNS (4), /* cond_taken_branch_cost. 
*/ 1200*38fd1498Szrj COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1201*38fd1498Szrj }; 1202*38fd1498Szrj 1203*38fd1498Szrj 1204*38fd1498Szrj /* BDVER3 has optimized REP instruction for medium sized blocks, but for 1205*38fd1498Szrj very small blocks it is better to use loop. For large blocks, libcall 1206*38fd1498Szrj can do non-temporal accesses and beat inline considerably. */ 1207*38fd1498Szrj static stringop_algs bdver3_memcpy[2] = { 1208*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1209*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 1210*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1211*38fd1498Szrj {-1, libcall, false}}}}; 1212*38fd1498Szrj static stringop_algs bdver3_memset[2] = { 1213*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1214*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1215*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1216*38fd1498Szrj {-1, libcall, false}}}}; 1217*38fd1498Szrj struct processor_costs bdver3_cost = { 1218*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1219*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 1220*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1221*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1222*38fd1498Szrj {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1223*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1224*38fd1498Szrj COSTS_N_INSNS (4), /* SI */ 1225*38fd1498Szrj COSTS_N_INSNS (6), /* DI */ 1226*38fd1498Szrj COSTS_N_INSNS (6)}, /* other */ 1227*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1228*38fd1498Szrj {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1229*38fd1498Szrj COSTS_N_INSNS (35), /* HI */ 1230*38fd1498Szrj COSTS_N_INSNS (51), /* SI */ 1231*38fd1498Szrj COSTS_N_INSNS (83), /* DI */ 1232*38fd1498Szrj COSTS_N_INSNS (83)}, /* other */ 1233*38fd1498Szrj COSTS_N_INSNS (1), 
/* cost of movsx */ 1234*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1235*38fd1498Szrj 8, /* "large" insn */ 1236*38fd1498Szrj 9, /* MOVE_RATIO */ 1237*38fd1498Szrj 1238*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1239*38fd1498Szrj they are latency*2. */ 1240*38fd1498Szrj 8, /* cost for loading QImode using movzbl */ 1241*38fd1498Szrj {8, 8, 8}, /* cost of loading integer registers 1242*38fd1498Szrj in QImode, HImode and SImode. 1243*38fd1498Szrj Relative to reg-reg move (2). */ 1244*38fd1498Szrj {8, 8, 8}, /* cost of storing integer registers */ 1245*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 1246*38fd1498Szrj {12, 12, 28}, /* cost of loading fp registers 1247*38fd1498Szrj in SFmode, DFmode and XFmode */ 1248*38fd1498Szrj {10, 10, 18}, /* cost of storing fp registers 1249*38fd1498Szrj in SFmode, DFmode and XFmode */ 1250*38fd1498Szrj 4, /* cost of moving MMX register */ 1251*38fd1498Szrj {12, 12}, /* cost of loading MMX registers 1252*38fd1498Szrj in SImode and DImode */ 1253*38fd1498Szrj {10, 10}, /* cost of storing MMX registers 1254*38fd1498Szrj in SImode and DImode */ 1255*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1256*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1257*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1258*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1259*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1260*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1261*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1262*38fd1498Szrj 16, 20, /* SSE->integer and integer->SSE moves */ 1263*38fd1498Szrj 12, 12, /* Gather load static, per_elt. */ 1264*38fd1498Szrj 10, 10, /* Gather store static, per_elt. */ 1265*38fd1498Szrj 16, /* size of l1 cache. */ 1266*38fd1498Szrj 2048, /* size of l2 cache. 
*/ 1267*38fd1498Szrj 64, /* size of prefetch block */ 1268*38fd1498Szrj /* New AMD processors never drop prefetches; if they cannot be performed 1269*38fd1498Szrj immediately, they are queued. We set number of simultaneous prefetches 1270*38fd1498Szrj to a large constant to reflect this (it probably is not a good idea not 1271*38fd1498Szrj to limit number of prefetches at all, as their execution also takes some 1272*38fd1498Szrj time). */ 1273*38fd1498Szrj 100, /* number of parallel prefetches */ 1274*38fd1498Szrj 2, /* Branch cost */ 1275*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1276*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1277*38fd1498Szrj COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1278*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1279*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1280*38fd1498Szrj COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1281*38fd1498Szrj 1282*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1283*38fd1498Szrj COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1284*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1285*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1286*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1287*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1288*38fd1498Szrj /* 9-24 */ 1289*38fd1498Szrj COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1290*38fd1498Szrj /* 9-27 */ 1291*38fd1498Szrj COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1292*38fd1498Szrj COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1293*38fd1498Szrj COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1294*38fd1498Szrj 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1295*38fd1498Szrj bdver3_memcpy, 1296*38fd1498Szrj bdver3_memset, 1297*38fd1498Szrj COSTS_N_INSNS (4), /* cond_taken_branch_cost. 
*/ 1298*38fd1498Szrj COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1299*38fd1498Szrj }; 1300*38fd1498Szrj 1301*38fd1498Szrj /* BDVER4 has optimized REP instruction for medium sized blocks, but for 1302*38fd1498Szrj very small blocks it is better to use loop. For large blocks, libcall 1303*38fd1498Szrj can do non-temporal accesses and beat inline considerably. */ 1304*38fd1498Szrj static stringop_algs bdver4_memcpy[2] = { 1305*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1306*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 1307*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1308*38fd1498Szrj {-1, libcall, false}}}}; 1309*38fd1498Szrj static stringop_algs bdver4_memset[2] = { 1310*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1311*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1312*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1313*38fd1498Szrj {-1, libcall, false}}}}; 1314*38fd1498Szrj struct processor_costs bdver4_cost = { 1315*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1316*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 1317*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1318*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1319*38fd1498Szrj {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1320*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1321*38fd1498Szrj COSTS_N_INSNS (4), /* SI */ 1322*38fd1498Szrj COSTS_N_INSNS (6), /* DI */ 1323*38fd1498Szrj COSTS_N_INSNS (6)}, /* other */ 1324*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1325*38fd1498Szrj {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1326*38fd1498Szrj COSTS_N_INSNS (35), /* HI */ 1327*38fd1498Szrj COSTS_N_INSNS (51), /* SI */ 1328*38fd1498Szrj COSTS_N_INSNS (83), /* DI */ 1329*38fd1498Szrj COSTS_N_INSNS (83)}, /* other */ 1330*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx 
*/ 1331*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1332*38fd1498Szrj 8, /* "large" insn */ 1333*38fd1498Szrj 9, /* MOVE_RATIO */ 1334*38fd1498Szrj 1335*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1336*38fd1498Szrj they are latency*2. */ 1337*38fd1498Szrj 8, /* cost for loading QImode using movzbl */ 1338*38fd1498Szrj {8, 8, 8}, /* cost of loading integer registers 1339*38fd1498Szrj in QImode, HImode and SImode. 1340*38fd1498Szrj Relative to reg-reg move (2). */ 1341*38fd1498Szrj {8, 8, 8}, /* cost of storing integer registers */ 1342*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 1343*38fd1498Szrj {12, 12, 28}, /* cost of loading fp registers 1344*38fd1498Szrj in SFmode, DFmode and XFmode */ 1345*38fd1498Szrj {10, 10, 18}, /* cost of storing fp registers 1346*38fd1498Szrj in SFmode, DFmode and XFmode */ 1347*38fd1498Szrj 4, /* cost of moving MMX register */ 1348*38fd1498Szrj {12, 12}, /* cost of loading MMX registers 1349*38fd1498Szrj in SImode and DImode */ 1350*38fd1498Szrj {10, 10}, /* cost of storing MMX registers 1351*38fd1498Szrj in SImode and DImode */ 1352*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1353*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1354*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1355*38fd1498Szrj {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1356*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1357*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1358*38fd1498Szrj {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1359*38fd1498Szrj 16, 20, /* SSE->integer and integer->SSE moves */ 1360*38fd1498Szrj 12, 12, /* Gather load static, per_elt. */ 1361*38fd1498Szrj 10, 10, /* Gather store static, per_elt. */ 1362*38fd1498Szrj 16, /* size of l1 cache. */ 1363*38fd1498Szrj 2048, /* size of l2 cache. 
*/ 1364*38fd1498Szrj 64, /* size of prefetch block */ 1365*38fd1498Szrj /* New AMD processors never drop prefetches; if they cannot be performed 1366*38fd1498Szrj immediately, they are queued. We set number of simultaneous prefetches 1367*38fd1498Szrj to a large constant to reflect this (it probably is not a good idea not 1368*38fd1498Szrj to limit number of prefetches at all, as their execution also takes some 1369*38fd1498Szrj time). */ 1370*38fd1498Szrj 100, /* number of parallel prefetches */ 1371*38fd1498Szrj 2, /* Branch cost */ 1372*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1373*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1374*38fd1498Szrj COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1375*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1376*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1377*38fd1498Szrj COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1378*38fd1498Szrj 1379*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1380*38fd1498Szrj COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1381*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1382*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1383*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1384*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1385*38fd1498Szrj /* 9-24 */ 1386*38fd1498Szrj COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1387*38fd1498Szrj /* 9-27 */ 1388*38fd1498Szrj COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1389*38fd1498Szrj COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1390*38fd1498Szrj COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1391*38fd1498Szrj 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1392*38fd1498Szrj bdver4_memcpy, 1393*38fd1498Szrj bdver4_memset, 1394*38fd1498Szrj COSTS_N_INSNS (4), /* cond_taken_branch_cost. 
*/ 1395*38fd1498Szrj COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1396*38fd1498Szrj }; 1397*38fd1498Szrj 1398*38fd1498Szrj 1399*38fd1498Szrj /* ZNVER1 has optimized REP instruction for medium sized blocks, but for 1400*38fd1498Szrj very small blocks it is better to use loop. For large blocks, libcall 1401*38fd1498Szrj can do non-temporal accesses and beat inline considerably. */ 1402*38fd1498Szrj static stringop_algs znver1_memcpy[2] = { 1403*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1404*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 1405*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1406*38fd1498Szrj {-1, libcall, false}}}}; 1407*38fd1498Szrj static stringop_algs znver1_memset[2] = { 1408*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1409*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1410*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1411*38fd1498Szrj {-1, libcall, false}}}}; 1412*38fd1498Szrj struct processor_costs znver1_cost = { 1413*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction. */ 1414*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1415*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs. */ 1416*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs. */ 1417*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1418*38fd1498Szrj COSTS_N_INSNS (3), /* HI. */ 1419*38fd1498Szrj COSTS_N_INSNS (3), /* SI. */ 1420*38fd1498Szrj COSTS_N_INSNS (3), /* DI. */ 1421*38fd1498Szrj COSTS_N_INSNS (3)}, /* other. */ 1422*38fd1498Szrj 0, /* cost of multiply per each bit 1423*38fd1498Szrj set. */ 1424*38fd1498Szrj /* Depending on parameters, idiv can get faster on ryzen. This is upper 1425*38fd1498Szrj bound. */ 1426*38fd1498Szrj {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1427*38fd1498Szrj COSTS_N_INSNS (22), /* HI. 
*/ 1428*38fd1498Szrj COSTS_N_INSNS (30), /* SI. */ 1429*38fd1498Szrj COSTS_N_INSNS (45), /* DI. */ 1430*38fd1498Szrj COSTS_N_INSNS (45)}, /* other. */ 1431*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx. */ 1432*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx. */ 1433*38fd1498Szrj 8, /* "large" insn. */ 1434*38fd1498Szrj 9, /* MOVE_RATIO. */ 1435*38fd1498Szrj 1436*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1437*38fd1498Szrj they are latency*2. */ 1438*38fd1498Szrj 1439*38fd1498Szrj /* reg-reg moves are done by renaming and thus they are even cheaper than 1440*38fd1498Szrj 1 cycle. Because reg-reg move cost is 2 and the following tables correspond 1441*38fd1498Szrj to doubles of latencies, we do not model this correctly. It does not 1442*38fd1498Szrj seem to make practical difference to bump prices up even more. */ 1443*38fd1498Szrj 6, /* cost for loading QImode using 1444*38fd1498Szrj movzbl. */ 1445*38fd1498Szrj {6, 6, 6}, /* cost of loading integer registers 1446*38fd1498Szrj in QImode, HImode and SImode. 1447*38fd1498Szrj Relative to reg-reg move (2). */ 1448*38fd1498Szrj {8, 8, 8}, /* cost of storing integer 1449*38fd1498Szrj registers. */ 1450*38fd1498Szrj 2, /* cost of reg,reg fld/fst. */ 1451*38fd1498Szrj {6, 6, 16}, /* cost of loading fp registers 1452*38fd1498Szrj in SFmode, DFmode and XFmode. */ 1453*38fd1498Szrj {8, 8, 16}, /* cost of storing fp registers 1454*38fd1498Szrj in SFmode, DFmode and XFmode. */ 1455*38fd1498Szrj 2, /* cost of moving MMX register. */ 1456*38fd1498Szrj {6, 6}, /* cost of loading MMX registers 1457*38fd1498Szrj in SImode and DImode. */ 1458*38fd1498Szrj {8, 8}, /* cost of storing MMX registers 1459*38fd1498Szrj in SImode and DImode. */ 1460*38fd1498Szrj 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 1461*38fd1498Szrj {6, 6, 6, 10, 20}, /* cost of loading SSE registers 1462*38fd1498Szrj in 32,64,128,256 and 512-bit. 
*/ 1463*38fd1498Szrj {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 1464*38fd1498Szrj {8, 8, 8, 8, 16}, /* cost of storing SSE registers 1465*38fd1498Szrj in 32,64,128,256 and 512-bit. */ 1466*38fd1498Szrj {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1467*38fd1498Szrj 6, 6, /* SSE->integer and integer->SSE moves. */ 1468*38fd1498Szrj /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1469*38fd1498Szrj throughput 12. Approx 9 uops do not depend on vector size and every load 1470*38fd1498Szrj is 7 uops. */ 1471*38fd1498Szrj 18, 8, /* Gather load static, per_elt. */ 1472*38fd1498Szrj 18, 10, /* Gather store static, per_elt. */ 1473*38fd1498Szrj 32, /* size of l1 cache. */ 1474*38fd1498Szrj 512, /* size of l2 cache. */ 1475*38fd1498Szrj 64, /* size of prefetch block. */ 1476*38fd1498Szrj /* New AMD processors never drop prefetches; if they cannot be performed 1477*38fd1498Szrj immediately, they are queued. We set number of simultaneous prefetches 1478*38fd1498Szrj to a large constant to reflect this (it probably is not a good idea not 1479*38fd1498Szrj to limit number of prefetches at all, as their execution also takes some 1480*38fd1498Szrj time). */ 1481*38fd1498Szrj 100, /* number of parallel prefetches. */ 1482*38fd1498Szrj 3, /* Branch cost. */ 1483*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1484*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1485*38fd1498Szrj /* Latency of fdiv is 8-15. */ 1486*38fd1498Szrj COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1487*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1488*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1489*38fd1498Szrj /* Latency of fsqrt is 4-10. */ 1490*38fd1498Szrj COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1491*38fd1498Szrj 1492*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1493*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. 
*/ 1494*38fd1498Szrj COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1495*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1496*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1497*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1498*38fd1498Szrj COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1499*38fd1498Szrj /* 9-13 */ 1500*38fd1498Szrj COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1501*38fd1498Szrj COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1502*38fd1498Szrj COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1503*38fd1498Szrj /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles 1504*38fd1498Szrj and it can execute 2 integer additions and 2 multiplications thus 1505*38fd1498Szrj reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest 1506*38fd1498Szrj that 4 works better than 6 probably due to register pressure. 1507*38fd1498Szrj 1508*38fd1498Szrj Integer vector operations are taken by FP unit and execute 3 vector 1509*38fd1498Szrj plus/minus operations per cycle but only one multiply. This is adjusted 1510*38fd1498Szrj in ix86_reassociation_width. */ 1511*38fd1498Szrj 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1512*38fd1498Szrj znver1_memcpy, 1513*38fd1498Szrj znver1_memset, 1514*38fd1498Szrj COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1515*38fd1498Szrj COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1516*38fd1498Szrj }; 1517*38fd1498Szrj 1518*38fd1498Szrj /* skylake_cost should produce code tuned for Skylake family of CPUs. 
*/ 1519*38fd1498Szrj /* memcpy expansion strategy table referenced by skylake_cost below. NOTE(review): the two array entries are presumably the 32-bit and 64-bit variants, and each inner triple presumably {max size, algorithm, noalign} -- confirm against the stringop_algs declaration, which is outside this chunk. */ static stringop_algs skylake_memcpy[2] = { 1520*38fd1498Szrj {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 1521*38fd1498Szrj {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false}, 1522*38fd1498Szrj {-1, libcall, false}}}}; 1523*38fd1498Szrj 1524*38fd1498Szrj /* memset counterpart of skylake_memcpy; same layout, also referenced by skylake_cost below. */ static stringop_algs skylake_memset[2] = { 1525*38fd1498Szrj {libcall, {{6, loop_1_byte, true}, 1526*38fd1498Szrj {24, loop, true}, 1527*38fd1498Szrj {8192, rep_prefix_4_byte, true}, 1528*38fd1498Szrj {-1, libcall, false}}}, 1529*38fd1498Szrj {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false}, 1530*38fd1498Szrj {-1, libcall, false}}}}; 1531*38fd1498Szrj 1532*38fd1498Szrj /* Skylake cost table. The initializer is positional: every entry must stay in the field order of struct processor_costs (declared outside this chunk); the trailing comment on each entry names the field it fills. */ static const 1533*38fd1498Szrj struct processor_costs skylake_cost = { 1534*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1535*38fd1498Szrj COSTS_N_INSNS (1)+1, /* cost of a lea instruction (one unit above the add cost) */ 1536*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1537*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1538*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1539*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1540*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 1541*38fd1498Szrj COSTS_N_INSNS (3), /* DI */ 1542*38fd1498Szrj COSTS_N_INSNS (3)}, /* other */ 1543*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1544*38fd1498Szrj /* Expanding div/mod currently doesn't consider parallelism. So the cost 1545*38fd1498Szrj model is not realistic. We compensate by increasing the latencies a bit. 
*/ 1546*38fd1498Szrj {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 1547*38fd1498Szrj COSTS_N_INSNS (11), /* HI */ 1548*38fd1498Szrj COSTS_N_INSNS (14), /* SI */ 1549*38fd1498Szrj COSTS_N_INSNS (76), /* DI */ 1550*38fd1498Szrj COSTS_N_INSNS (76)}, /* other */ 1551*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1552*38fd1498Szrj COSTS_N_INSNS (0), /* cost of movzx */ 1553*38fd1498Szrj 8, /* "large" insn */ 1554*38fd1498Szrj 17, /* MOVE_RATIO */ 1555*38fd1498Szrj 1556*38fd1498Szrj 6, /* cost for loading QImode using movzbl */ 1557*38fd1498Szrj {4, 4, 4}, /* cost of loading integer registers 1558*38fd1498Szrj in QImode, HImode and SImode. 1559*38fd1498Szrj Relative to reg-reg move (2). */ 1560*38fd1498Szrj {6, 6, 3}, /* cost of storing integer registers */ 1561*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 1562*38fd1498Szrj {6, 6, 8}, /* cost of loading fp registers 1563*38fd1498Szrj in SFmode, DFmode and XFmode */ 1564*38fd1498Szrj {6, 6, 10}, /* cost of storing fp registers 1565*38fd1498Szrj in SFmode, DFmode and XFmode */ 1566*38fd1498Szrj 2, /* cost of moving MMX register */ 1567*38fd1498Szrj {6, 6}, /* cost of loading MMX registers 1568*38fd1498Szrj in SImode and DImode */ 1569*38fd1498Szrj {6, 6}, /* cost of storing MMX registers 1570*38fd1498Szrj in SImode and DImode */ 1571*38fd1498Szrj 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 1572*38fd1498Szrj {6, 6, 6, 10, 20}, /* cost of loading SSE registers 1573*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1574*38fd1498Szrj {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 1575*38fd1498Szrj {8, 8, 8, 12, 24}, /* cost of storing SSE registers 1576*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1577*38fd1498Szrj {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1578*38fd1498Szrj 2, 2, /* SSE->integer and integer->SSE moves */ 1579*38fd1498Szrj 20, 8, /* Gather load static, per_elt. */ 1580*38fd1498Szrj 22, 10, /* Gather store static, per_elt. */ 1581*38fd1498Szrj 64, /* size of l1 cache. 
*/ 1582*38fd1498Szrj 512, /* size of l2 cache. */ 1583*38fd1498Szrj 64, /* size of prefetch block */ 1584*38fd1498Szrj 6, /* number of parallel prefetches */ 1585*38fd1498Szrj 3, /* Branch cost */ 1586*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 1587*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1588*38fd1498Szrj COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 1589*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1590*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1591*38fd1498Szrj COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ 1592*38fd1498Szrj 1593*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1594*38fd1498Szrj COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1595*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1596*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1597*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 1598*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 1599*38fd1498Szrj COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ 1600*38fd1498Szrj COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ 1601*38fd1498Szrj COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ 1602*38fd1498Szrj COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 1603*38fd1498Szrj 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 1604*38fd1498Szrj skylake_memcpy, 1605*38fd1498Szrj skylake_memset, 1606*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1607*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1608*38fd1498Szrj }; 1609*38fd1498Szrj /* BTVER1 has optimized REP instruction for medium sized blocks, but for 1610*38fd1498Szrj very small blocks it is better to use loop. For large blocks, libcall can 1611*38fd1498Szrj do nontemporary accesses and beat inline considerably. 
*/ 1612*38fd1498Szrj static stringop_algs btver1_memcpy[2] = { 1613*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1614*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 1615*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1616*38fd1498Szrj {-1, libcall, false}}}}; 1617*38fd1498Szrj static stringop_algs btver1_memset[2] = { 1618*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1619*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1620*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1621*38fd1498Szrj {-1, libcall, false}}}}; 1622*38fd1498Szrj const struct processor_costs btver1_cost = { 1623*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1624*38fd1498Szrj COSTS_N_INSNS (2), /* cost of a lea instruction */ 1625*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1626*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1627*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1628*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1629*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 1630*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 1631*38fd1498Szrj COSTS_N_INSNS (5)}, /* other */ 1632*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1633*38fd1498Szrj {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1634*38fd1498Szrj COSTS_N_INSNS (35), /* HI */ 1635*38fd1498Szrj COSTS_N_INSNS (51), /* SI */ 1636*38fd1498Szrj COSTS_N_INSNS (83), /* DI */ 1637*38fd1498Szrj COSTS_N_INSNS (83)}, /* other */ 1638*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1639*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1640*38fd1498Szrj 8, /* "large" insn */ 1641*38fd1498Szrj 9, /* MOVE_RATIO */ 1642*38fd1498Szrj 1643*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1644*38fd1498Szrj they are latency*2. 
*/ 1645*38fd1498Szrj 8, /* cost for loading QImode using movzbl */ 1646*38fd1498Szrj {6, 8, 6}, /* cost of loading integer registers 1647*38fd1498Szrj in QImode, HImode and SImode. 1648*38fd1498Szrj Relative to reg-reg move (2). */ 1649*38fd1498Szrj {6, 8, 6}, /* cost of storing integer registers */ 1650*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 1651*38fd1498Szrj {12, 12, 28}, /* cost of loading fp registers 1652*38fd1498Szrj in SFmode, DFmode and XFmode */ 1653*38fd1498Szrj {12, 12, 38}, /* cost of storing fp registers 1654*38fd1498Szrj in SFmode, DFmode and XFmode */ 1655*38fd1498Szrj 4, /* cost of moving MMX register */ 1656*38fd1498Szrj {10, 10}, /* cost of loading MMX registers 1657*38fd1498Szrj in SImode and DImode */ 1658*38fd1498Szrj {12, 12}, /* cost of storing MMX registers 1659*38fd1498Szrj in SImode and DImode */ 1660*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1661*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of loading SSE registers 1662*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1663*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ 1664*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of storing SSE registers 1665*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1666*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ 1667*38fd1498Szrj 14, 14, /* SSE->integer and integer->SSE moves */ 1668*38fd1498Szrj 10, 10, /* Gather load static, per_elt. */ 1669*38fd1498Szrj 10, 10, /* Gather store static, per_elt. */ 1670*38fd1498Szrj 32, /* size of l1 cache. */ 1671*38fd1498Szrj 512, /* size of l2 cache. */ 1672*38fd1498Szrj 64, /* size of prefetch block */ 1673*38fd1498Szrj 100, /* number of parallel prefetches */ 1674*38fd1498Szrj 2, /* Branch cost */ 1675*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1676*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1677*38fd1498Szrj COSTS_N_INSNS (19), /* cost of FDIV instruction. 
*/ 1678*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1679*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1680*38fd1498Szrj COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1681*38fd1498Szrj 1682*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1683*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1684*38fd1498Szrj COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 1685*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1686*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1687*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1688*38fd1498Szrj COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1689*38fd1498Szrj COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 1690*38fd1498Szrj COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 1691*38fd1498Szrj COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ 1692*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1693*38fd1498Szrj btver1_memcpy, 1694*38fd1498Szrj btver1_memset, 1695*38fd1498Szrj COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1696*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 1697*38fd1498Szrj }; 1698*38fd1498Szrj 1699*38fd1498Szrj static stringop_algs btver2_memcpy[2] = { 1700*38fd1498Szrj {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1701*38fd1498Szrj {-1, rep_prefix_4_byte, false}}}, 1702*38fd1498Szrj {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1703*38fd1498Szrj {-1, libcall, false}}}}; 1704*38fd1498Szrj static stringop_algs btver2_memset[2] = { 1705*38fd1498Szrj {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1706*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1707*38fd1498Szrj {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1708*38fd1498Szrj {-1, libcall, false}}}}; 1709*38fd1498Szrj const struct processor_costs btver2_cost = { 1710*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1711*38fd1498Szrj COSTS_N_INSNS (2), /* cost of a lea instruction */ 1712*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1713*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1714*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1715*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1716*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 1717*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 1718*38fd1498Szrj COSTS_N_INSNS (5)}, /* other */ 1719*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1720*38fd1498Szrj {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1721*38fd1498Szrj COSTS_N_INSNS (35), /* HI */ 1722*38fd1498Szrj COSTS_N_INSNS (51), /* SI */ 1723*38fd1498Szrj COSTS_N_INSNS (83), /* DI */ 1724*38fd1498Szrj COSTS_N_INSNS (83)}, /* other */ 1725*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1726*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1727*38fd1498Szrj 8, /* "large" insn */ 1728*38fd1498Szrj 9, /* MOVE_RATIO */ 1729*38fd1498Szrj 1730*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1731*38fd1498Szrj they are latency*2. 
*/ 1732*38fd1498Szrj 8, /* cost for loading QImode using movzbl */ 1733*38fd1498Szrj {8, 8, 6}, /* cost of loading integer registers 1734*38fd1498Szrj in QImode, HImode and SImode. 1735*38fd1498Szrj Relative to reg-reg move (2). */ 1736*38fd1498Szrj {8, 8, 6}, /* cost of storing integer registers */ 1737*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 1738*38fd1498Szrj {12, 12, 28}, /* cost of loading fp registers 1739*38fd1498Szrj in SFmode, DFmode and XFmode */ 1740*38fd1498Szrj {12, 12, 38}, /* cost of storing fp registers 1741*38fd1498Szrj in SFmode, DFmode and XFmode */ 1742*38fd1498Szrj 4, /* cost of moving MMX register */ 1743*38fd1498Szrj {10, 10}, /* cost of loading MMX registers 1744*38fd1498Szrj in SImode and DImode */ 1745*38fd1498Szrj {12, 12}, /* cost of storing MMX registers 1746*38fd1498Szrj in SImode and DImode */ 1747*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1748*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of loading SSE registers 1749*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1750*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ 1751*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of storing SSE registers 1752*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1753*38fd1498Szrj {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ 1754*38fd1498Szrj 14, 14, /* SSE->integer and integer->SSE moves */ 1755*38fd1498Szrj 10, 10, /* Gather load static, per_elt. */ 1756*38fd1498Szrj 10, 10, /* Gather store static, per_elt. */ 1757*38fd1498Szrj 32, /* size of l1 cache. */ 1758*38fd1498Szrj 2048, /* size of l2 cache. */ 1759*38fd1498Szrj 64, /* size of prefetch block */ 1760*38fd1498Szrj 100, /* number of parallel prefetches */ 1761*38fd1498Szrj 2, /* Branch cost */ 1762*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1763*38fd1498Szrj COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1764*38fd1498Szrj COSTS_N_INSNS (19), /* cost of FDIV instruction. 
*/ 1765*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1766*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1767*38fd1498Szrj COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1768*38fd1498Szrj 1769*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1770*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1771*38fd1498Szrj COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 1772*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1773*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1774*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1775*38fd1498Szrj COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1776*38fd1498Szrj COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ 1777*38fd1498Szrj COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ 1778*38fd1498Szrj COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ 1779*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1780*38fd1498Szrj btver2_memcpy, 1781*38fd1498Szrj btver2_memset, 1782*38fd1498Szrj COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1783*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 1784*38fd1498Szrj }; 1785*38fd1498Szrj 1786*38fd1498Szrj static stringop_algs pentium4_memcpy[2] = { 1787*38fd1498Szrj {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 1788*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 1789*38fd1498Szrj static stringop_algs pentium4_memset[2] = { 1790*38fd1498Szrj {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 1791*38fd1498Szrj {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1792*38fd1498Szrj DUMMY_STRINGOP_ALGS}; 1793*38fd1498Szrj 1794*38fd1498Szrj static const 1795*38fd1498Szrj struct processor_costs pentium4_cost = { 1796*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1797*38fd1498Szrj COSTS_N_INSNS (3), /* cost of a lea instruction */ 1798*38fd1498Szrj COSTS_N_INSNS (4), /* variable shift costs */ 1799*38fd1498Szrj COSTS_N_INSNS (4), /* constant shift costs */ 1800*38fd1498Szrj {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ 1801*38fd1498Szrj COSTS_N_INSNS (15), /* HI */ 1802*38fd1498Szrj COSTS_N_INSNS (15), /* SI */ 1803*38fd1498Szrj COSTS_N_INSNS (15), /* DI */ 1804*38fd1498Szrj COSTS_N_INSNS (15)}, /* other */ 1805*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1806*38fd1498Szrj {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ 1807*38fd1498Szrj COSTS_N_INSNS (56), /* HI */ 1808*38fd1498Szrj COSTS_N_INSNS (56), /* SI */ 1809*38fd1498Szrj COSTS_N_INSNS (56), /* DI */ 1810*38fd1498Szrj COSTS_N_INSNS (56)}, /* other */ 1811*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1812*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1813*38fd1498Szrj 16, /* "large" insn */ 1814*38fd1498Szrj 6, /* MOVE_RATIO */ 1815*38fd1498Szrj 1816*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1817*38fd1498Szrj they are latency*2. */ 1818*38fd1498Szrj 5, /* cost for loading QImode using movzbl */ 1819*38fd1498Szrj {4, 5, 4}, /* cost of loading integer registers 1820*38fd1498Szrj in QImode, HImode and SImode. 
1821*38fd1498Szrj Relative to reg-reg move (2). */ 1822*38fd1498Szrj {2, 3, 2}, /* cost of storing integer registers */ 1823*38fd1498Szrj 12, /* cost of reg,reg fld/fst */ 1824*38fd1498Szrj {14, 14, 14}, /* cost of loading fp registers 1825*38fd1498Szrj in SFmode, DFmode and XFmode */ 1826*38fd1498Szrj {14, 14, 14}, /* cost of storing fp registers 1827*38fd1498Szrj in SFmode, DFmode and XFmode */ 1828*38fd1498Szrj 12, /* cost of moving MMX register */ 1829*38fd1498Szrj {16, 16}, /* cost of loading MMX registers 1830*38fd1498Szrj in SImode and DImode */ 1831*38fd1498Szrj {16, 16}, /* cost of storing MMX registers 1832*38fd1498Szrj in SImode and DImode */ 1833*38fd1498Szrj 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 1834*38fd1498Szrj {16, 16, 16, 32, 64}, /* cost of loading SSE registers 1835*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1836*38fd1498Szrj {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ 1837*38fd1498Szrj {16, 16, 16, 32, 64}, /* cost of storing SSE registers 1838*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1839*38fd1498Szrj {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 1840*38fd1498Szrj 20, 12, /* SSE->integer and integer->SSE moves */ 1841*38fd1498Szrj 16, 16, /* Gather load static, per_elt. */ 1842*38fd1498Szrj 16, 16, /* Gather store static, per_elt. */ 1843*38fd1498Szrj 8, /* size of l1 cache. */ 1844*38fd1498Szrj 256, /* size of l2 cache. */ 1845*38fd1498Szrj 64, /* size of prefetch block */ 1846*38fd1498Szrj 6, /* number of parallel prefetches */ 1847*38fd1498Szrj 2, /* Branch cost */ 1848*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1849*38fd1498Szrj COSTS_N_INSNS (7), /* cost of FMUL instruction. */ 1850*38fd1498Szrj COSTS_N_INSNS (43), /* cost of FDIV instruction. */ 1851*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1852*38fd1498Szrj COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1853*38fd1498Szrj COSTS_N_INSNS (43), /* cost of FSQRT instruction. 
*/ 1854*38fd1498Szrj 1855*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1856*38fd1498Szrj COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1857*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1858*38fd1498Szrj COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1859*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1860*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1861*38fd1498Szrj COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ 1862*38fd1498Szrj COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ 1863*38fd1498Szrj COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ 1864*38fd1498Szrj COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ 1865*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1866*38fd1498Szrj pentium4_memcpy, 1867*38fd1498Szrj pentium4_memset, 1868*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1869*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 1870*38fd1498Szrj }; 1871*38fd1498Szrj 1872*38fd1498Szrj static stringop_algs nocona_memcpy[2] = { 1873*38fd1498Szrj {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 1874*38fd1498Szrj {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, 1875*38fd1498Szrj {100000, unrolled_loop, false}, {-1, libcall, false}}}}; 1876*38fd1498Szrj 1877*38fd1498Szrj static stringop_algs nocona_memset[2] = { 1878*38fd1498Szrj {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 1879*38fd1498Szrj {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1880*38fd1498Szrj {libcall, {{24, loop, false}, {64, unrolled_loop, false}, 1881*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1882*38fd1498Szrj 1883*38fd1498Szrj static const 1884*38fd1498Szrj struct processor_costs nocona_cost = { 1885*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1886*38fd1498Szrj COSTS_N_INSNS (1), /* cost of a lea instruction */ 1887*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1888*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1889*38fd1498Szrj {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ 1890*38fd1498Szrj COSTS_N_INSNS (10), /* HI */ 1891*38fd1498Szrj COSTS_N_INSNS (10), /* SI */ 1892*38fd1498Szrj COSTS_N_INSNS (10), /* DI */ 1893*38fd1498Szrj COSTS_N_INSNS (10)}, /* other */ 1894*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1895*38fd1498Szrj {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ 1896*38fd1498Szrj COSTS_N_INSNS (66), /* HI */ 1897*38fd1498Szrj COSTS_N_INSNS (66), /* SI */ 1898*38fd1498Szrj COSTS_N_INSNS (66), /* DI */ 1899*38fd1498Szrj COSTS_N_INSNS (66)}, /* other */ 1900*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1901*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1902*38fd1498Szrj 16, /* "large" insn */ 1903*38fd1498Szrj 17, /* MOVE_RATIO */ 1904*38fd1498Szrj 1905*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and 
thus 1906*38fd1498Szrj they are latency*2. */ 1907*38fd1498Szrj 4, /* cost for loading QImode using movzbl */ 1908*38fd1498Szrj {4, 4, 4}, /* cost of loading integer registers 1909*38fd1498Szrj in QImode, HImode and SImode. 1910*38fd1498Szrj Relative to reg-reg move (2). */ 1911*38fd1498Szrj {4, 4, 4}, /* cost of storing integer registers */ 1912*38fd1498Szrj 12, /* cost of reg,reg fld/fst */ 1913*38fd1498Szrj {14, 14, 14}, /* cost of loading fp registers 1914*38fd1498Szrj in SFmode, DFmode and XFmode */ 1915*38fd1498Szrj {14, 14, 14}, /* cost of storing fp registers 1916*38fd1498Szrj in SFmode, DFmode and XFmode */ 1917*38fd1498Szrj 14, /* cost of moving MMX register */ 1918*38fd1498Szrj {12, 12}, /* cost of loading MMX registers 1919*38fd1498Szrj in SImode and DImode */ 1920*38fd1498Szrj {12, 12}, /* cost of storing MMX registers 1921*38fd1498Szrj in SImode and DImode */ 1922*38fd1498Szrj 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 1923*38fd1498Szrj {12, 12, 12, 24, 48}, /* cost of loading SSE registers 1924*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1925*38fd1498Szrj {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ 1926*38fd1498Szrj {12, 12, 12, 24, 48}, /* cost of storing SSE registers 1927*38fd1498Szrj in 32,64,128,256 and 512-bit */ 1928*38fd1498Szrj {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 1929*38fd1498Szrj 20, 12, /* SSE->integer and integer->SSE moves */ 1930*38fd1498Szrj 12, 12, /* Gather load static, per_elt. */ 1931*38fd1498Szrj 12, 12, /* Gather store static, per_elt. */ 1932*38fd1498Szrj 8, /* size of l1 cache. */ 1933*38fd1498Szrj 1024, /* size of l2 cache. */ 1934*38fd1498Szrj 64, /* size of prefetch block */ 1935*38fd1498Szrj 8, /* number of parallel prefetches */ 1936*38fd1498Szrj 1, /* Branch cost */ 1937*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1938*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 1939*38fd1498Szrj COSTS_N_INSNS (40), /* cost of FDIV instruction. 
*/ 1940*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FABS instruction. */ 1941*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 1942*38fd1498Szrj COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ 1943*38fd1498Szrj 1944*38fd1498Szrj COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1945*38fd1498Szrj COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 1946*38fd1498Szrj COSTS_N_INSNS (7), /* cost of MULSS instruction. */ 1947*38fd1498Szrj COSTS_N_INSNS (7), /* cost of MULSD instruction. */ 1948*38fd1498Szrj COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 1949*38fd1498Szrj COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 1950*38fd1498Szrj COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ 1951*38fd1498Szrj COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ 1952*38fd1498Szrj COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ 1953*38fd1498Szrj COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ 1954*38fd1498Szrj 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1955*38fd1498Szrj nocona_memcpy, 1956*38fd1498Szrj nocona_memset, 1957*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1958*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 1959*38fd1498Szrj }; 1960*38fd1498Szrj 1961*38fd1498Szrj static stringop_algs atom_memcpy[2] = { 1962*38fd1498Szrj {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 1963*38fd1498Szrj {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 1964*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1965*38fd1498Szrj static stringop_algs atom_memset[2] = { 1966*38fd1498Szrj {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 1967*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1968*38fd1498Szrj {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 1969*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1970*38fd1498Szrj static const 1971*38fd1498Szrj struct processor_costs atom_cost = { 1972*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 1973*38fd1498Szrj COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 1974*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 1975*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 1976*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1977*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 1978*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 1979*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 1980*38fd1498Szrj COSTS_N_INSNS (2)}, /* other */ 1981*38fd1498Szrj 0, /* cost of multiply per each bit set */ 1982*38fd1498Szrj {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 1983*38fd1498Szrj COSTS_N_INSNS (26), /* HI */ 1984*38fd1498Szrj COSTS_N_INSNS (42), /* SI */ 1985*38fd1498Szrj COSTS_N_INSNS (74), /* DI */ 1986*38fd1498Szrj COSTS_N_INSNS (74)}, /* other */ 1987*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 1988*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 1989*38fd1498Szrj 8, /* "large" insn */ 1990*38fd1498Szrj 17, /* MOVE_RATIO */ 1991*38fd1498Szrj 1992*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 1993*38fd1498Szrj they are latency*2. 
*/ 1994*38fd1498Szrj 6, /* cost for loading QImode using movzbl */ 1995*38fd1498Szrj {6, 6, 6}, /* cost of loading integer registers 1996*38fd1498Szrj in QImode, HImode and SImode. 1997*38fd1498Szrj Relative to reg-reg move (2). */ 1998*38fd1498Szrj {6, 6, 6}, /* cost of storing integer registers */ 1999*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 2000*38fd1498Szrj {6, 6, 18}, /* cost of loading fp registers 2001*38fd1498Szrj in SFmode, DFmode and XFmode */ 2002*38fd1498Szrj {14, 14, 24}, /* cost of storing fp registers 2003*38fd1498Szrj in SFmode, DFmode and XFmode */ 2004*38fd1498Szrj 2, /* cost of moving MMX register */ 2005*38fd1498Szrj {8, 8}, /* cost of loading MMX registers 2006*38fd1498Szrj in SImode and DImode */ 2007*38fd1498Szrj {10, 10}, /* cost of storing MMX registers 2008*38fd1498Szrj in SImode and DImode */ 2009*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2010*38fd1498Szrj {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2011*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2012*38fd1498Szrj {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2013*38fd1498Szrj {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2014*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2015*38fd1498Szrj {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2016*38fd1498Szrj 8, 6, /* SSE->integer and integer->SSE moves */ 2017*38fd1498Szrj 8, 8, /* Gather load static, per_elt. */ 2018*38fd1498Szrj 8, 8, /* Gather store static, per_elt. */ 2019*38fd1498Szrj 32, /* size of l1 cache. */ 2020*38fd1498Szrj 256, /* size of l2 cache. */ 2021*38fd1498Szrj 64, /* size of prefetch block */ 2022*38fd1498Szrj 6, /* number of parallel prefetches */ 2023*38fd1498Szrj 3, /* Branch cost */ 2024*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2025*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2026*38fd1498Szrj COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2027*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 2028*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2029*38fd1498Szrj COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2030*38fd1498Szrj 2031*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2032*38fd1498Szrj COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 2033*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2034*38fd1498Szrj COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2035*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2036*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2037*38fd1498Szrj COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 2038*38fd1498Szrj COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 2039*38fd1498Szrj COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 2040*38fd1498Szrj COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 2041*38fd1498Szrj 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2042*38fd1498Szrj atom_memcpy, 2043*38fd1498Szrj atom_memset, 2044*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2045*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 2046*38fd1498Szrj }; 2047*38fd1498Szrj 2048*38fd1498Szrj static stringop_algs slm_memcpy[2] = { 2049*38fd1498Szrj {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2050*38fd1498Szrj {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2051*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2052*38fd1498Szrj static stringop_algs slm_memset[2] = { 2053*38fd1498Szrj {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2054*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2055*38fd1498Szrj {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2056*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2057*38fd1498Szrj static const 2058*38fd1498Szrj struct processor_costs slm_cost = { 2059*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 2060*38fd1498Szrj COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2061*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 2062*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 2063*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2064*38fd1498Szrj COSTS_N_INSNS (3), /* HI */ 2065*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 2066*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 2067*38fd1498Szrj COSTS_N_INSNS (2)}, /* other */ 2068*38fd1498Szrj 0, /* cost of multiply per each bit set */ 2069*38fd1498Szrj {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2070*38fd1498Szrj COSTS_N_INSNS (26), /* HI */ 2071*38fd1498Szrj COSTS_N_INSNS (42), /* SI */ 2072*38fd1498Szrj COSTS_N_INSNS (74), /* DI */ 2073*38fd1498Szrj COSTS_N_INSNS (74)}, /* other */ 2074*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 2075*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 2076*38fd1498Szrj 8, /* "large" insn */ 2077*38fd1498Szrj 17, /* MOVE_RATIO */ 2078*38fd1498Szrj 2079*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 2080*38fd1498Szrj they are latency*2. 
*/ 2081*38fd1498Szrj 8, /* cost for loading QImode using movzbl */ 2082*38fd1498Szrj {8, 8, 8}, /* cost of loading integer registers 2083*38fd1498Szrj in QImode, HImode and SImode. 2084*38fd1498Szrj Relative to reg-reg move (2). */ 2085*38fd1498Szrj {6, 6, 6}, /* cost of storing integer registers */ 2086*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 2087*38fd1498Szrj {8, 8, 18}, /* cost of loading fp registers 2088*38fd1498Szrj in SFmode, DFmode and XFmode */ 2089*38fd1498Szrj {6, 6, 18}, /* cost of storing fp registers 2090*38fd1498Szrj in SFmode, DFmode and XFmode */ 2091*38fd1498Szrj 2, /* cost of moving MMX register */ 2092*38fd1498Szrj {8, 8}, /* cost of loading MMX registers 2093*38fd1498Szrj in SImode and DImode */ 2094*38fd1498Szrj {6, 6}, /* cost of storing MMX registers 2095*38fd1498Szrj in SImode and DImode */ 2096*38fd1498Szrj 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2097*38fd1498Szrj {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2098*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2099*38fd1498Szrj {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2100*38fd1498Szrj {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2101*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2102*38fd1498Szrj {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2103*38fd1498Szrj 8, 6, /* SSE->integer and integer->SSE moves */ 2104*38fd1498Szrj 8, 8, /* Gather load static, per_elt. */ 2105*38fd1498Szrj 8, 8, /* Gather store static, per_elt. */ 2106*38fd1498Szrj 32, /* size of l1 cache. */ 2107*38fd1498Szrj 256, /* size of l2 cache. */ 2108*38fd1498Szrj 64, /* size of prefetch block */ 2109*38fd1498Szrj 6, /* number of parallel prefetches */ 2110*38fd1498Szrj 3, /* Branch cost */ 2111*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2112*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2113*38fd1498Szrj COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2114*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 2115*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2116*38fd1498Szrj COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2117*38fd1498Szrj 2118*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2119*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2120*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2121*38fd1498Szrj COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2122*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2123*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2124*38fd1498Szrj COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 2125*38fd1498Szrj COSTS_N_INSNS (69), /* cost of DIVSD instruction. */ 2126*38fd1498Szrj COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ 2127*38fd1498Szrj COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ 2128*38fd1498Szrj 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2129*38fd1498Szrj slm_memcpy, 2130*38fd1498Szrj slm_memset, 2131*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2132*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 2133*38fd1498Szrj }; 2134*38fd1498Szrj 2135*38fd1498Szrj static stringop_algs intel_memcpy[2] = { 2136*38fd1498Szrj {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2137*38fd1498Szrj {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2138*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2139*38fd1498Szrj static stringop_algs intel_memset[2] = { 2140*38fd1498Szrj {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2141*38fd1498Szrj {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2142*38fd1498Szrj {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2143*38fd1498Szrj {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2144*38fd1498Szrj static const 2145*38fd1498Szrj struct processor_costs intel_cost = { 2146*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 2147*38fd1498Szrj COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2148*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 2149*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 2150*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2151*38fd1498Szrj COSTS_N_INSNS (3), /* HI */ 2152*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 2153*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 2154*38fd1498Szrj COSTS_N_INSNS (2)}, /* other */ 2155*38fd1498Szrj 0, /* cost of multiply per each bit set */ 2156*38fd1498Szrj {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2157*38fd1498Szrj COSTS_N_INSNS (26), /* HI */ 2158*38fd1498Szrj COSTS_N_INSNS (42), /* SI */ 2159*38fd1498Szrj COSTS_N_INSNS (74), /* DI */ 2160*38fd1498Szrj COSTS_N_INSNS (74)}, /* other */ 2161*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 2162*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 2163*38fd1498Szrj 8, /* "large" insn */ 2164*38fd1498Szrj 17, /* MOVE_RATIO */ 2165*38fd1498Szrj 2166*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 2167*38fd1498Szrj they are latency*2. 
*/ 2168*38fd1498Szrj 6, /* cost for loading QImode using movzbl */ 2169*38fd1498Szrj {4, 4, 4}, /* cost of loading integer registers 2170*38fd1498Szrj in QImode, HImode and SImode. 2171*38fd1498Szrj Relative to reg-reg move (2). */ 2172*38fd1498Szrj {6, 6, 6}, /* cost of storing integer registers */ 2173*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 2174*38fd1498Szrj {6, 6, 8}, /* cost of loading fp registers 2175*38fd1498Szrj in SFmode, DFmode and XFmode */ 2176*38fd1498Szrj {6, 6, 10}, /* cost of storing fp registers 2177*38fd1498Szrj in SFmode, DFmode and XFmode */ 2178*38fd1498Szrj 2, /* cost of moving MMX register */ 2179*38fd1498Szrj {6, 6}, /* cost of loading MMX registers 2180*38fd1498Szrj in SImode and DImode */ 2181*38fd1498Szrj {6, 6}, /* cost of storing MMX registers 2182*38fd1498Szrj in SImode and DImode */ 2183*38fd1498Szrj 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 2184*38fd1498Szrj {6, 6, 6, 6, 6}, /* cost of loading SSE registers 2185*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2186*38fd1498Szrj {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2187*38fd1498Szrj {6, 6, 6, 6, 6}, /* cost of storing SSE registers 2188*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2189*38fd1498Szrj {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ 2190*38fd1498Szrj 4, 4, /* SSE->integer and integer->SSE moves */ 2191*38fd1498Szrj 6, 6, /* Gather load static, per_elt. */ 2192*38fd1498Szrj 6, 6, /* Gather store static, per_elt. */ 2193*38fd1498Szrj 32, /* size of l1 cache. */ 2194*38fd1498Szrj 256, /* size of l2 cache. */ 2195*38fd1498Szrj 64, /* size of prefetch block */ 2196*38fd1498Szrj 6, /* number of parallel prefetches */ 2197*38fd1498Szrj 3, /* Branch cost */ 2198*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2199*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2200*38fd1498Szrj COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2201*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 2202*38fd1498Szrj COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2203*38fd1498Szrj COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2204*38fd1498Szrj 2205*38fd1498Szrj COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */ 2206*38fd1498Szrj COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 2207*38fd1498Szrj COSTS_N_INSNS (8), /* cost of MULSS instruction. */ 2208*38fd1498Szrj COSTS_N_INSNS (8), /* cost of MULSD instruction. */ 2209*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2210*38fd1498Szrj COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2211*38fd1498Szrj COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ 2212*38fd1498Szrj COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 2213*38fd1498Szrj COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ 2214*38fd1498Szrj COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ 2215*38fd1498Szrj 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2216*38fd1498Szrj intel_memcpy, 2217*38fd1498Szrj intel_memset, 2218*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2219*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2220*38fd1498Szrj }; 2221*38fd1498Szrj 2222*38fd1498Szrj /* Generic should produce code tuned for Core-i7 (and newer chips) 2223*38fd1498Szrj and btver1 (and newer chips). 
*/ 2224*38fd1498Szrj 2225*38fd1498Szrj static stringop_algs generic_memcpy[2] = { 2226*38fd1498Szrj {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2227*38fd1498Szrj {-1, libcall, false}}}, 2228*38fd1498Szrj {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2229*38fd1498Szrj {-1, libcall, false}}}}; 2230*38fd1498Szrj static stringop_algs generic_memset[2] = { 2231*38fd1498Szrj {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2232*38fd1498Szrj {-1, libcall, false}}}, 2233*38fd1498Szrj {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2234*38fd1498Szrj {-1, libcall, false}}}}; 2235*38fd1498Szrj static const 2236*38fd1498Szrj struct processor_costs generic_cost = { 2237*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 2238*38fd1498Szrj /* Setting cost to 2 makes our current implementation of synth_mult result in 2239*38fd1498Szrj use of unnecessary temporary registers causing regression on several 2240*38fd1498Szrj SPECfp benchmarks. 
*/ 2241*38fd1498Szrj COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2242*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 2243*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 2244*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2245*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 2246*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 2247*38fd1498Szrj COSTS_N_INSNS (4), /* DI */ 2248*38fd1498Szrj COSTS_N_INSNS (4)}, /* other */ 2249*38fd1498Szrj 0, /* cost of multiply per each bit set */ 2250*38fd1498Szrj {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ 2251*38fd1498Szrj COSTS_N_INSNS (22), /* HI */ 2252*38fd1498Szrj COSTS_N_INSNS (30), /* SI */ 2253*38fd1498Szrj COSTS_N_INSNS (74), /* DI */ 2254*38fd1498Szrj COSTS_N_INSNS (74)}, /* other */ 2255*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 2256*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 2257*38fd1498Szrj 8, /* "large" insn */ 2258*38fd1498Szrj 17, /* MOVE_RATIO */ 2259*38fd1498Szrj 2260*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 2261*38fd1498Szrj they are latency*2. */ 2262*38fd1498Szrj 6, /* cost for loading QImode using movzbl */ 2263*38fd1498Szrj {6, 6, 6}, /* cost of loading integer registers 2264*38fd1498Szrj in QImode, HImode and SImode. 2265*38fd1498Szrj Relative to reg-reg move (2). 
*/ 2266*38fd1498Szrj {6, 6, 6}, /* cost of storing integer registers */ 2267*38fd1498Szrj 4, /* cost of reg,reg fld/fst */ 2268*38fd1498Szrj {6, 6, 12}, /* cost of loading fp registers 2269*38fd1498Szrj in SFmode, DFmode and XFmode */ 2270*38fd1498Szrj {6, 6, 12}, /* cost of storing fp registers 2271*38fd1498Szrj in SFmode, DFmode and XFmode */ 2272*38fd1498Szrj 2, /* cost of moving MMX register */ 2273*38fd1498Szrj {6, 6}, /* cost of loading MMX registers 2274*38fd1498Szrj in SImode and DImode */ 2275*38fd1498Szrj {6, 6}, /* cost of storing MMX registers 2276*38fd1498Szrj in SImode and DImode */ 2277*38fd1498Szrj 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 2278*38fd1498Szrj {6, 6, 6, 10, 15}, /* cost of loading SSE registers 2279*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2280*38fd1498Szrj {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ 2281*38fd1498Szrj {6, 6, 6, 10, 15}, /* cost of storing SSE registers 2282*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2283*38fd1498Szrj {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ 2284*38fd1498Szrj 6, 6, /* SSE->integer and integer->SSE moves */ 2285*38fd1498Szrj 18, 6, /* Gather load static, per_elt. */ 2286*38fd1498Szrj 18, 6, /* Gather store static, per_elt. */ 2287*38fd1498Szrj 32, /* size of l1 cache. */ 2288*38fd1498Szrj 512, /* size of l2 cache. */ 2289*38fd1498Szrj 64, /* size of prefetch block */ 2290*38fd1498Szrj 6, /* number of parallel prefetches */ 2291*38fd1498Szrj /* Benchmarks show large regressions on K8 sixtrack benchmark when this 2292*38fd1498Szrj value is increased to the perhaps more appropriate value of 5. */ 2293*38fd1498Szrj 3, /* Branch cost */ 2294*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2295*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2296*38fd1498Szrj COSTS_N_INSNS (17), /* cost of FDIV instruction. */ 2297*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2298*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FCHS instruction. 
*/ 2299*38fd1498Szrj COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ 2300*38fd1498Szrj 2301*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2302*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2303*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2304*38fd1498Szrj COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2305*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2306*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2307*38fd1498Szrj COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2308*38fd1498Szrj COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 2309*38fd1498Szrj COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 2310*38fd1498Szrj COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 2311*38fd1498Szrj 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ 2312*38fd1498Szrj generic_memcpy, 2313*38fd1498Szrj generic_memset, 2314*38fd1498Szrj COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 2315*38fd1498Szrj COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 2316*38fd1498Szrj }; 2317*38fd1498Szrj 2318*38fd1498Szrj /* core_cost should produce code tuned for Core family of CPUs. 
*/ 2319*38fd1498Szrj static stringop_algs core_memcpy[2] = { 2320*38fd1498Szrj {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 2321*38fd1498Szrj {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, 2322*38fd1498Szrj {-1, libcall, false}}}}; 2323*38fd1498Szrj static stringop_algs core_memset[2] = { 2324*38fd1498Szrj {libcall, {{6, loop_1_byte, true}, 2325*38fd1498Szrj {24, loop, true}, 2326*38fd1498Szrj {8192, rep_prefix_4_byte, true}, 2327*38fd1498Szrj {-1, libcall, false}}}, 2328*38fd1498Szrj {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, 2329*38fd1498Szrj {-1, libcall, false}}}}; 2330*38fd1498Szrj 2331*38fd1498Szrj static const 2332*38fd1498Szrj struct processor_costs core_cost = { 2333*38fd1498Szrj COSTS_N_INSNS (1), /* cost of an add instruction */ 2334*38fd1498Szrj /* On all chips taken into consideration lea is 2 cycles and more. With 2335*38fd1498Szrj this cost however our current implementation of synth_mult results in 2336*38fd1498Szrj use of unnecessary temporary registers causing regression on several 2337*38fd1498Szrj SPECfp benchmarks. */ 2338*38fd1498Szrj COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2339*38fd1498Szrj COSTS_N_INSNS (1), /* variable shift costs */ 2340*38fd1498Szrj COSTS_N_INSNS (1), /* constant shift costs */ 2341*38fd1498Szrj {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2342*38fd1498Szrj COSTS_N_INSNS (4), /* HI */ 2343*38fd1498Szrj COSTS_N_INSNS (3), /* SI */ 2344*38fd1498Szrj /* Here we tune for Sandybridge or newer. */ 2345*38fd1498Szrj COSTS_N_INSNS (3), /* DI */ 2346*38fd1498Szrj COSTS_N_INSNS (3)}, /* other */ 2347*38fd1498Szrj 0, /* cost of multiply per each bit set */ 2348*38fd1498Szrj /* Expanding div/mod currently doesn't consider parallelism. So the cost 2349*38fd1498Szrj model is not realistic. We compensate by increasing the latencies a bit. 
*/ 2350*38fd1498Szrj {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 2351*38fd1498Szrj COSTS_N_INSNS (11), /* HI */ 2352*38fd1498Szrj COSTS_N_INSNS (14), /* SI */ 2353*38fd1498Szrj COSTS_N_INSNS (81), /* DI */ 2354*38fd1498Szrj COSTS_N_INSNS (81)}, /* other */ 2355*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movsx */ 2356*38fd1498Szrj COSTS_N_INSNS (1), /* cost of movzx */ 2357*38fd1498Szrj 8, /* "large" insn */ 2358*38fd1498Szrj 17, /* MOVE_RATIO */ 2359*38fd1498Szrj 2360*38fd1498Szrj /* All move costs are relative to integer->integer move times 2 and thus 2361*38fd1498Szrj they are latency*2. */ 2362*38fd1498Szrj 6, /* cost for loading QImode using movzbl */ 2363*38fd1498Szrj {4, 4, 4}, /* cost of loading integer registers 2364*38fd1498Szrj in QImode, HImode and SImode. 2365*38fd1498Szrj Relative to reg-reg move (2). */ 2366*38fd1498Szrj {6, 6, 6}, /* cost of storing integer registers */ 2367*38fd1498Szrj 2, /* cost of reg,reg fld/fst */ 2368*38fd1498Szrj {6, 6, 8}, /* cost of loading fp registers 2369*38fd1498Szrj in SFmode, DFmode and XFmode */ 2370*38fd1498Szrj {6, 6, 10}, /* cost of storing fp registers 2371*38fd1498Szrj in SFmode, DFmode and XFmode */ 2372*38fd1498Szrj 2, /* cost of moving MMX register */ 2373*38fd1498Szrj {6, 6}, /* cost of loading MMX registers 2374*38fd1498Szrj in SImode and DImode */ 2375*38fd1498Szrj {6, 6}, /* cost of storing MMX registers 2376*38fd1498Szrj in SImode and DImode */ 2377*38fd1498Szrj 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2378*38fd1498Szrj {6, 6, 6, 6, 12}, /* cost of loading SSE registers 2379*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2380*38fd1498Szrj {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 2381*38fd1498Szrj {6, 6, 6, 6, 12}, /* cost of storing SSE registers 2382*38fd1498Szrj in 32,64,128,256 and 512-bit */ 2383*38fd1498Szrj {6, 6, 6, 6, 12}, /* cost of unaligned stores. 
*/ 2384*38fd1498Szrj 2, 2, /* SSE->integer and integer->SSE moves */ 2385*38fd1498Szrj /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, 2386*38fd1498Szrj rec. throughput 6. 2387*38fd1498Szrj So 5 uops statically and one uops per load. */ 2388*38fd1498Szrj 10, 6, /* Gather load static, per_elt. */ 2389*38fd1498Szrj 10, 6, /* Gather store static, per_elt. */ 2390*38fd1498Szrj 64, /* size of l1 cache. */ 2391*38fd1498Szrj 512, /* size of l2 cache. */ 2392*38fd1498Szrj 64, /* size of prefetch block */ 2393*38fd1498Szrj 6, /* number of parallel prefetches */ 2394*38fd1498Szrj /* FIXME perhaps more appropriate value is 5. */ 2395*38fd1498Szrj 3, /* Branch cost */ 2396*38fd1498Szrj COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2397*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2398*38fd1498Szrj /* 10-24 */ 2399*38fd1498Szrj COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 2400*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2401*38fd1498Szrj COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2402*38fd1498Szrj COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ 2403*38fd1498Szrj 2404*38fd1498Szrj COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2405*38fd1498Szrj COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2406*38fd1498Szrj COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2407*38fd1498Szrj COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2408*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2409*38fd1498Szrj COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2410*38fd1498Szrj COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 2411*38fd1498Szrj COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ 2412*38fd1498Szrj COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ 2413*38fd1498Szrj COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ 2414*38fd1498Szrj 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. 
*/ 2415*38fd1498Szrj core_memcpy, 2416*38fd1498Szrj core_memset, 2417*38fd1498Szrj COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2418*38fd1498Szrj COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2419*38fd1498Szrj }; 2420*38fd1498Szrj 2421