1 /* Costs of operations of individual x86 CPUs. 2 Copyright (C) 1988-2022 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GCC is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 Under Section 7 of GPL version 3, you are granted additional 17 permissions described in the GCC Runtime Library Exception, version 18 3.1, as published by the Free Software Foundation. 19 20 You should have received a copy of the GNU General Public License and 21 a copy of the GCC Runtime Library Exception along with this program; 22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 <http://www.gnu.org/licenses/>. */ 24 /* Processor costs (relative to an add) */ 25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ 26 #define COSTS_N_BYTES(N) ((N) * 2) 27 28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} 29 30 static stringop_algs ix86_size_memcpy[2] = { 31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 33 static stringop_algs ix86_size_memset[2] = { 34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 36 37 const 38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */ 39 { 40 /* Start of register allocator costs. integer->integer move cost is 2. */ 41 2, /* cost for loading QImode using movzbl */ 42 {2, 2, 2}, /* cost of loading integer registers 43 in QImode, HImode and SImode. 44 Relative to reg-reg move (2). 
*/ 45 {2, 2, 2}, /* cost of storing integer registers */ 46 2, /* cost of reg,reg fld/fst */ 47 {2, 2, 2}, /* cost of loading fp registers 48 in SFmode, DFmode and XFmode */ 49 {2, 2, 2}, /* cost of storing fp registers 50 in SFmode, DFmode and XFmode */ 51 3, /* cost of moving MMX register */ 52 {3, 3}, /* cost of loading MMX registers 53 in SImode and DImode */ 54 {3, 3}, /* cost of storing MMX registers 55 in SImode and DImode */ 56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers 58 in 32,64,128,256 and 512-bit */ 59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers 60 in 32,64,128,256 and 512-bit */ 61 3, 3, /* SSE->integer and integer->SSE moves */ 62 3, 3, /* mask->integer and integer->mask moves */ 63 {2, 2, 2}, /* cost of loading mask register 64 in QImode, HImode, SImode. */ 65 {2, 2, 2}, /* cost if storing mask register 66 in QImode, HImode, SImode. */ 67 2, /* cost of moving mask register. */ 68 /* End of register allocator costs. */ 69 }, 70 71 COSTS_N_BYTES (2), /* cost of an add instruction */ 72 COSTS_N_BYTES (3), /* cost of a lea instruction */ 73 COSTS_N_BYTES (2), /* variable shift costs */ 74 COSTS_N_BYTES (3), /* constant shift costs */ 75 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ 76 COSTS_N_BYTES (3), /* HI */ 77 COSTS_N_BYTES (3), /* SI */ 78 COSTS_N_BYTES (3), /* DI */ 79 COSTS_N_BYTES (5)}, /* other */ 80 0, /* cost of multiply per each bit set */ 81 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ 82 COSTS_N_BYTES (3), /* HI */ 83 COSTS_N_BYTES (3), /* SI */ 84 COSTS_N_BYTES (3), /* DI */ 85 COSTS_N_BYTES (5)}, /* other */ 86 COSTS_N_BYTES (3), /* cost of movsx */ 87 COSTS_N_BYTES (3), /* cost of movzx */ 88 0, /* "large" insn */ 89 2, /* MOVE_RATIO */ 90 2, /* CLEAR_RATIO */ 91 {2, 2, 2}, /* cost of loading integer registers 92 in QImode, HImode and SImode. 93 Relative to reg-reg move (2). 
*/ 94 {2, 2, 2}, /* cost of storing integer registers */ 95 {3, 3, 3, 3, 3}, /* cost of loading SSE register 96 in 32bit, 64bit, 128bit, 256bit and 512bit */ 97 {3, 3, 3, 3, 3}, /* cost of storing SSE register 98 in 32bit, 64bit, 128bit, 256bit and 512bit */ 99 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load 100 in 128bit, 256bit and 512bit */ 101 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store 102 in 128bit, 256bit and 512bit */ 103 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 104 3, /* cost of moving SSE register to integer. */ 105 5, 0, /* Gather load static, per_elt. */ 106 5, 0, /* Gather store static, per_elt. */ 107 0, /* size of l1 cache */ 108 0, /* size of l2 cache */ 109 0, /* size of prefetch block */ 110 0, /* number of parallel prefetches */ 111 2, /* Branch cost */ 112 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ 113 COSTS_N_BYTES (2), /* cost of FMUL instruction. */ 114 COSTS_N_BYTES (2), /* cost of FDIV instruction. */ 115 COSTS_N_BYTES (2), /* cost of FABS instruction. */ 116 COSTS_N_BYTES (2), /* cost of FCHS instruction. */ 117 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ 118 119 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ 120 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 121 COSTS_N_BYTES (2), /* cost of MULSS instruction. */ 122 COSTS_N_BYTES (2), /* cost of MULSD instruction. */ 123 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ 124 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ 125 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ 126 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ 127 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */ 128 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ 129 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 130 ix86_size_memcpy, 131 ix86_size_memset, 132 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ 133 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ 134 NULL, /* Loop alignment. */ 135 NULL, /* Jump alignment. 
*/ 136 NULL, /* Label alignment. */ 137 NULL, /* Func alignment. */ 138 }; 139 140 /* Processor costs (relative to an add) */ 141 static stringop_algs i386_memcpy[2] = { 142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 143 DUMMY_STRINGOP_ALGS}; 144 static stringop_algs i386_memset[2] = { 145 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 146 DUMMY_STRINGOP_ALGS}; 147 148 static const 149 struct processor_costs i386_cost = { /* 386 specific costs */ 150 { 151 /* Start of register allocator costs. integer->integer move cost is 2. */ 152 4, /* cost for loading QImode using movzbl */ 153 {2, 4, 2}, /* cost of loading integer registers 154 in QImode, HImode and SImode. 155 Relative to reg-reg move (2). */ 156 {2, 4, 2}, /* cost of storing integer registers */ 157 2, /* cost of reg,reg fld/fst */ 158 {8, 8, 8}, /* cost of loading fp registers 159 in SFmode, DFmode and XFmode */ 160 {8, 8, 8}, /* cost of storing fp registers 161 in SFmode, DFmode and XFmode */ 162 2, /* cost of moving MMX register */ 163 {4, 8}, /* cost of loading MMX registers 164 in SImode and DImode */ 165 {4, 8}, /* cost of storing MMX registers 166 in SImode and DImode */ 167 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 168 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 169 in 32,64,128,256 and 512-bit */ 170 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 171 in 32,64,128,256 and 512-bit */ 172 3, 3, /* SSE->integer and integer->SSE moves */ 173 3, 3, /* mask->integer and integer->mask moves */ 174 {2, 4, 2}, /* cost of loading mask register 175 in QImode, HImode, SImode. */ 176 {2, 4, 2}, /* cost if storing mask register 177 in QImode, HImode, SImode. */ 178 2, /* cost of moving mask register. */ 179 /* End of register allocator costs. 
*/ 180 }, 181 182 COSTS_N_INSNS (1), /* cost of an add instruction */ 183 COSTS_N_INSNS (1), /* cost of a lea instruction */ 184 COSTS_N_INSNS (3), /* variable shift costs */ 185 COSTS_N_INSNS (2), /* constant shift costs */ 186 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ 187 COSTS_N_INSNS (6), /* HI */ 188 COSTS_N_INSNS (6), /* SI */ 189 COSTS_N_INSNS (6), /* DI */ 190 COSTS_N_INSNS (6)}, /* other */ 191 COSTS_N_INSNS (1), /* cost of multiply per each bit set */ 192 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ 193 COSTS_N_INSNS (23), /* HI */ 194 COSTS_N_INSNS (23), /* SI */ 195 COSTS_N_INSNS (23), /* DI */ 196 COSTS_N_INSNS (23)}, /* other */ 197 COSTS_N_INSNS (3), /* cost of movsx */ 198 COSTS_N_INSNS (2), /* cost of movzx */ 199 15, /* "large" insn */ 200 3, /* MOVE_RATIO */ 201 3, /* CLEAR_RATIO */ 202 {2, 4, 2}, /* cost of loading integer registers 203 in QImode, HImode and SImode. 204 Relative to reg-reg move (2). */ 205 {2, 4, 2}, /* cost of storing integer registers */ 206 {4, 8, 16, 32, 64}, /* cost of loading SSE register 207 in 32bit, 64bit, 128bit, 256bit and 512bit */ 208 {4, 8, 16, 32, 64}, /* cost of storing SSE register 209 in 32bit, 64bit, 128bit, 256bit and 512bit */ 210 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 211 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 212 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 213 3, /* cost of moving SSE register to integer. */ 214 4, 4, /* Gather load static, per_elt. */ 215 4, 4, /* Gather store static, per_elt. */ 216 0, /* size of l1 cache */ 217 0, /* size of l2 cache */ 218 0, /* size of prefetch block */ 219 0, /* number of parallel prefetches */ 220 1, /* Branch cost */ 221 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ 222 COSTS_N_INSNS (27), /* cost of FMUL instruction. */ 223 COSTS_N_INSNS (88), /* cost of FDIV instruction. */ 224 COSTS_N_INSNS (22), /* cost of FABS instruction. */ 225 COSTS_N_INSNS (24), /* cost of FCHS instruction. 
*/ 226 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ 227 228 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 229 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ 230 COSTS_N_INSNS (27), /* cost of MULSS instruction. */ 231 COSTS_N_INSNS (27), /* cost of MULSD instruction. */ 232 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ 233 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ 234 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ 235 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ 236 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */ 237 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ 238 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 239 i386_memcpy, 240 i386_memset, 241 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 242 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 243 "4", /* Loop alignment. */ 244 "4", /* Jump alignment. */ 245 NULL, /* Label alignment. */ 246 "4", /* Func alignment. */ 247 }; 248 249 static stringop_algs i486_memcpy[2] = { 250 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 251 DUMMY_STRINGOP_ALGS}; 252 static stringop_algs i486_memset[2] = { 253 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 254 DUMMY_STRINGOP_ALGS}; 255 256 static const 257 struct processor_costs i486_cost = { /* 486 specific costs */ 258 { 259 /* Start of register allocator costs. integer->integer move cost is 2. */ 260 4, /* cost for loading QImode using movzbl */ 261 {2, 4, 2}, /* cost of loading integer registers 262 in QImode, HImode and SImode. 263 Relative to reg-reg move (2). 
*/ 264 {2, 4, 2}, /* cost of storing integer registers */ 265 2, /* cost of reg,reg fld/fst */ 266 {8, 8, 8}, /* cost of loading fp registers 267 in SFmode, DFmode and XFmode */ 268 {8, 8, 8}, /* cost of storing fp registers 269 in SFmode, DFmode and XFmode */ 270 2, /* cost of moving MMX register */ 271 {4, 8}, /* cost of loading MMX registers 272 in SImode and DImode */ 273 {4, 8}, /* cost of storing MMX registers 274 in SImode and DImode */ 275 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 276 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 277 in 32,64,128,256 and 512-bit */ 278 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 279 in 32,64,128,256 and 512-bit */ 280 3, 3, /* SSE->integer and integer->SSE moves */ 281 3, 3, /* mask->integer and integer->mask moves */ 282 {2, 4, 2}, /* cost of loading mask register 283 in QImode, HImode, SImode. */ 284 {2, 4, 2}, /* cost if storing mask register 285 in QImode, HImode, SImode. */ 286 2, /* cost of moving mask register. */ 287 /* End of register allocator costs. */ 288 }, 289 290 COSTS_N_INSNS (1), /* cost of an add instruction */ 291 COSTS_N_INSNS (1), /* cost of a lea instruction */ 292 COSTS_N_INSNS (3), /* variable shift costs */ 293 COSTS_N_INSNS (2), /* constant shift costs */ 294 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ 295 COSTS_N_INSNS (12), /* HI */ 296 COSTS_N_INSNS (12), /* SI */ 297 COSTS_N_INSNS (12), /* DI */ 298 COSTS_N_INSNS (12)}, /* other */ 299 1, /* cost of multiply per each bit set */ 300 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ 301 COSTS_N_INSNS (40), /* HI */ 302 COSTS_N_INSNS (40), /* SI */ 303 COSTS_N_INSNS (40), /* DI */ 304 COSTS_N_INSNS (40)}, /* other */ 305 COSTS_N_INSNS (3), /* cost of movsx */ 306 COSTS_N_INSNS (2), /* cost of movzx */ 307 15, /* "large" insn */ 308 3, /* MOVE_RATIO */ 309 3, /* CLEAR_RATIO */ 310 {2, 4, 2}, /* cost of loading integer registers 311 in QImode, HImode and SImode. 312 Relative to reg-reg move (2). 
*/ 313 {2, 4, 2}, /* cost of storing integer registers */ 314 {4, 8, 16, 32, 64}, /* cost of loading SSE register 315 in 32bit, 64bit, 128bit, 256bit and 512bit */ 316 {4, 8, 16, 32, 64}, /* cost of storing SSE register 317 in 32bit, 64bit, 128bit, 256bit and 512bit */ 318 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 319 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 320 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 321 3, /* cost of moving SSE register to integer. */ 322 4, 4, /* Gather load static, per_elt. */ 323 4, 4, /* Gather store static, per_elt. */ 324 4, /* size of l1 cache. 486 has 8kB cache 325 shared for code and data, so 4kB is 326 not really precise. */ 327 4, /* size of l2 cache */ 328 0, /* size of prefetch block */ 329 0, /* number of parallel prefetches */ 330 1, /* Branch cost */ 331 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 332 COSTS_N_INSNS (16), /* cost of FMUL instruction. */ 333 COSTS_N_INSNS (73), /* cost of FDIV instruction. */ 334 COSTS_N_INSNS (3), /* cost of FABS instruction. */ 335 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 336 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ 337 338 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 339 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 340 COSTS_N_INSNS (16), /* cost of MULSS instruction. */ 341 COSTS_N_INSNS (16), /* cost of MULSD instruction. */ 342 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ 343 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ 344 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */ 345 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */ 346 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ 347 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ 348 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 349 i486_memcpy, 350 i486_memset, 351 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 352 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 353 "16", /* Loop alignment. 
*/ 354 "16", /* Jump alignment. */ 355 "0:0:8", /* Label alignment. */ 356 "16", /* Func alignment. */ 357 }; 358 359 static stringop_algs pentium_memcpy[2] = { 360 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 361 DUMMY_STRINGOP_ALGS}; 362 static stringop_algs pentium_memset[2] = { 363 {libcall, {{-1, rep_prefix_4_byte, false}}}, 364 DUMMY_STRINGOP_ALGS}; 365 366 static const 367 struct processor_costs pentium_cost = { 368 { 369 /* Start of register allocator costs. integer->integer move cost is 2. */ 370 6, /* cost for loading QImode using movzbl */ 371 {2, 4, 2}, /* cost of loading integer registers 372 in QImode, HImode and SImode. 373 Relative to reg-reg move (2). */ 374 {2, 4, 2}, /* cost of storing integer registers */ 375 2, /* cost of reg,reg fld/fst */ 376 {2, 2, 6}, /* cost of loading fp registers 377 in SFmode, DFmode and XFmode */ 378 {4, 4, 6}, /* cost of storing fp registers 379 in SFmode, DFmode and XFmode */ 380 8, /* cost of moving MMX register */ 381 {8, 8}, /* cost of loading MMX registers 382 in SImode and DImode */ 383 {8, 8}, /* cost of storing MMX registers 384 in SImode and DImode */ 385 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 386 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 387 in 32,64,128,256 and 512-bit */ 388 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 389 in 32,64,128,256 and 512-bit */ 390 3, 3, /* SSE->integer and integer->SSE moves */ 391 3, 3, /* mask->integer and integer->mask moves */ 392 {2, 4, 2}, /* cost of loading mask register 393 in QImode, HImode, SImode. */ 394 {2, 4, 2}, /* cost if storing mask register 395 in QImode, HImode, SImode. */ 396 2, /* cost of moving mask register. */ 397 /* End of register allocator costs. 
*/ 398 }, 399 400 COSTS_N_INSNS (1), /* cost of an add instruction */ 401 COSTS_N_INSNS (1), /* cost of a lea instruction */ 402 COSTS_N_INSNS (4), /* variable shift costs */ 403 COSTS_N_INSNS (1), /* constant shift costs */ 404 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 405 COSTS_N_INSNS (11), /* HI */ 406 COSTS_N_INSNS (11), /* SI */ 407 COSTS_N_INSNS (11), /* DI */ 408 COSTS_N_INSNS (11)}, /* other */ 409 0, /* cost of multiply per each bit set */ 410 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 411 COSTS_N_INSNS (25), /* HI */ 412 COSTS_N_INSNS (25), /* SI */ 413 COSTS_N_INSNS (25), /* DI */ 414 COSTS_N_INSNS (25)}, /* other */ 415 COSTS_N_INSNS (3), /* cost of movsx */ 416 COSTS_N_INSNS (2), /* cost of movzx */ 417 8, /* "large" insn */ 418 6, /* MOVE_RATIO */ 419 6, /* CLEAR_RATIO */ 420 {2, 4, 2}, /* cost of loading integer registers 421 in QImode, HImode and SImode. 422 Relative to reg-reg move (2). */ 423 {2, 4, 2}, /* cost of storing integer registers */ 424 {4, 8, 16, 32, 64}, /* cost of loading SSE register 425 in 32bit, 64bit, 128bit, 256bit and 512bit */ 426 {4, 8, 16, 32, 64}, /* cost of storing SSE register 427 in 32bit, 64bit, 128bit, 256bit and 512bit */ 428 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 429 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 430 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 431 3, /* cost of moving SSE register to integer. */ 432 4, 4, /* Gather load static, per_elt. */ 433 4, 4, /* Gather store static, per_elt. */ 434 8, /* size of l1 cache. */ 435 8, /* size of l2 cache */ 436 0, /* size of prefetch block */ 437 0, /* number of parallel prefetches */ 438 2, /* Branch cost */ 439 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 440 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 441 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 442 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 443 COSTS_N_INSNS (1), /* cost of FCHS instruction. 
*/ 444 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 445 446 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 447 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 448 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 449 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 450 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 451 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 452 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 453 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ 454 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */ 455 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ 456 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 457 pentium_memcpy, 458 pentium_memset, 459 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 460 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 461 "16:8:8", /* Loop alignment. */ 462 "16:8:8", /* Jump alignment. */ 463 "0:0:8", /* Label alignment. */ 464 "16", /* Func alignment. */ 465 }; 466 467 static const 468 struct processor_costs lakemont_cost = { 469 { 470 /* Start of register allocator costs. integer->integer move cost is 2. */ 471 6, /* cost for loading QImode using movzbl */ 472 {2, 4, 2}, /* cost of loading integer registers 473 in QImode, HImode and SImode. 474 Relative to reg-reg move (2). 
*/ 475 {2, 4, 2}, /* cost of storing integer registers */ 476 2, /* cost of reg,reg fld/fst */ 477 {2, 2, 6}, /* cost of loading fp registers 478 in SFmode, DFmode and XFmode */ 479 {4, 4, 6}, /* cost of storing fp registers 480 in SFmode, DFmode and XFmode */ 481 8, /* cost of moving MMX register */ 482 {8, 8}, /* cost of loading MMX registers 483 in SImode and DImode */ 484 {8, 8}, /* cost of storing MMX registers 485 in SImode and DImode */ 486 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 487 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 488 in 32,64,128,256 and 512-bit */ 489 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 490 in 32,64,128,256 and 512-bit */ 491 3, 3, /* SSE->integer and integer->SSE moves */ 492 3, 3, /* mask->integer and integer->mask moves */ 493 {2, 4, 2}, /* cost of loading mask register 494 in QImode, HImode, SImode. */ 495 {2, 4, 2}, /* cost if storing mask register 496 in QImode, HImode, SImode. */ 497 2, /* cost of moving mask register. */ 498 /* End of register allocator costs. */ 499 }, 500 501 COSTS_N_INSNS (1), /* cost of an add instruction */ 502 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 503 COSTS_N_INSNS (1), /* variable shift costs */ 504 COSTS_N_INSNS (1), /* constant shift costs */ 505 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 506 COSTS_N_INSNS (11), /* HI */ 507 COSTS_N_INSNS (11), /* SI */ 508 COSTS_N_INSNS (11), /* DI */ 509 COSTS_N_INSNS (11)}, /* other */ 510 0, /* cost of multiply per each bit set */ 511 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 512 COSTS_N_INSNS (25), /* HI */ 513 COSTS_N_INSNS (25), /* SI */ 514 COSTS_N_INSNS (25), /* DI */ 515 COSTS_N_INSNS (25)}, /* other */ 516 COSTS_N_INSNS (3), /* cost of movsx */ 517 COSTS_N_INSNS (2), /* cost of movzx */ 518 8, /* "large" insn */ 519 17, /* MOVE_RATIO */ 520 6, /* CLEAR_RATIO */ 521 {2, 4, 2}, /* cost of loading integer registers 522 in QImode, HImode and SImode. 523 Relative to reg-reg move (2). 
*/ 524 {2, 4, 2}, /* cost of storing integer registers */ 525 {4, 8, 16, 32, 64}, /* cost of loading SSE register 526 in 32bit, 64bit, 128bit, 256bit and 512bit */ 527 {4, 8, 16, 32, 64}, /* cost of storing SSE register 528 in 32bit, 64bit, 128bit, 256bit and 512bit */ 529 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 530 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 531 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 532 3, /* cost of moving SSE register to integer. */ 533 4, 4, /* Gather load static, per_elt. */ 534 4, 4, /* Gather store static, per_elt. */ 535 8, /* size of l1 cache. */ 536 8, /* size of l2 cache */ 537 0, /* size of prefetch block */ 538 0, /* number of parallel prefetches */ 539 2, /* Branch cost */ 540 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 541 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 542 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 543 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 544 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 545 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 546 547 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 548 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 549 COSTS_N_INSNS (5), /* cost of MULSS instruction. */ 550 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 551 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ 552 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ 553 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 554 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 555 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 556 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 557 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 558 pentium_memcpy, 559 pentium_memset, 560 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 561 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 562 "16:8:8", /* Loop alignment. */ 563 "16:8:8", /* Jump alignment. */ 564 "0:0:8", /* Label alignment. 
*/ 565 "16", /* Func alignment. */ 566 }; 567 568 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes 569 (we ensure the alignment). For small blocks inline loop is still a 570 noticeable win, for bigger blocks either rep movsl or rep movsb is 571 way to go. Rep movsb has apparently more expensive startup time in CPU, 572 but after 4K the difference is down in the noise. */ 573 static stringop_algs pentiumpro_memcpy[2] = { 574 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, 575 {8192, rep_prefix_4_byte, false}, 576 {-1, rep_prefix_1_byte, false}}}, 577 DUMMY_STRINGOP_ALGS}; 578 static stringop_algs pentiumpro_memset[2] = { 579 {rep_prefix_4_byte, {{1024, unrolled_loop, false}, 580 {8192, rep_prefix_4_byte, false}, 581 {-1, libcall, false}}}, 582 DUMMY_STRINGOP_ALGS}; 583 static const 584 struct processor_costs pentiumpro_cost = { 585 { 586 /* Start of register allocator costs. integer->integer move cost is 2. */ 587 2, /* cost for loading QImode using movzbl */ 588 {4, 4, 4}, /* cost of loading integer registers 589 in QImode, HImode and SImode. 590 Relative to reg-reg move (2). 
*/ 591 {2, 2, 2}, /* cost of storing integer registers */ 592 2, /* cost of reg,reg fld/fst */ 593 {2, 2, 6}, /* cost of loading fp registers 594 in SFmode, DFmode and XFmode */ 595 {4, 4, 6}, /* cost of storing fp registers 596 in SFmode, DFmode and XFmode */ 597 2, /* cost of moving MMX register */ 598 {2, 2}, /* cost of loading MMX registers 599 in SImode and DImode */ 600 {2, 2}, /* cost of storing MMX registers 601 in SImode and DImode */ 602 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 603 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 604 in 32,64,128,256 and 512-bit */ 605 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 606 in 32,64,128,256 and 512-bit */ 607 3, 3, /* SSE->integer and integer->SSE moves */ 608 3, 3, /* mask->integer and integer->mask moves */ 609 {4, 4, 4}, /* cost of loading mask register 610 in QImode, HImode, SImode. */ 611 {2, 2, 2}, /* cost if storing mask register 612 in QImode, HImode, SImode. */ 613 2, /* cost of moving mask register. */ 614 /* End of register allocator costs. */ 615 }, 616 617 COSTS_N_INSNS (1), /* cost of an add instruction */ 618 COSTS_N_INSNS (1), /* cost of a lea instruction */ 619 COSTS_N_INSNS (1), /* variable shift costs */ 620 COSTS_N_INSNS (1), /* constant shift costs */ 621 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 622 COSTS_N_INSNS (4), /* HI */ 623 COSTS_N_INSNS (4), /* SI */ 624 COSTS_N_INSNS (4), /* DI */ 625 COSTS_N_INSNS (4)}, /* other */ 626 0, /* cost of multiply per each bit set */ 627 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ 628 COSTS_N_INSNS (17), /* HI */ 629 COSTS_N_INSNS (17), /* SI */ 630 COSTS_N_INSNS (17), /* DI */ 631 COSTS_N_INSNS (17)}, /* other */ 632 COSTS_N_INSNS (1), /* cost of movsx */ 633 COSTS_N_INSNS (1), /* cost of movzx */ 634 8, /* "large" insn */ 635 6, /* MOVE_RATIO */ 636 6, /* CLEAR_RATIO */ 637 {4, 4, 4}, /* cost of loading integer registers 638 in QImode, HImode and SImode. 639 Relative to reg-reg move (2). 
*/ 640 {2, 2, 2}, /* cost of storing integer registers */ 641 {4, 8, 16, 32, 64}, /* cost of loading SSE register 642 in 32bit, 64bit, 128bit, 256bit and 512bit */ 643 {4, 8, 16, 32, 64}, /* cost of storing SSE register 644 in 32bit, 64bit, 128bit, 256bit and 512bit */ 645 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 646 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 647 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 648 3, /* cost of moving SSE register to integer. */ 649 4, 4, /* Gather load static, per_elt. */ 650 4, 4, /* Gather store static, per_elt. */ 651 8, /* size of l1 cache. */ 652 256, /* size of l2 cache */ 653 32, /* size of prefetch block */ 654 6, /* number of parallel prefetches */ 655 2, /* Branch cost */ 656 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 657 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 658 COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 659 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 660 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 661 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 662 663 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 664 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 665 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 666 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 667 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 668 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 669 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 670 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ 671 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 672 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ 673 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 674 pentiumpro_memcpy, 675 pentiumpro_memset, 676 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 677 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 678 "16", /* Loop alignment. */ 679 "16:11:8", /* Jump alignment. */ 680 "0:0:8", /* Label alignment. 
*/ 681 "16", /* Func alignment. */ 682 }; 683 684 static stringop_algs geode_memcpy[2] = { 685 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 686 DUMMY_STRINGOP_ALGS}; 687 static stringop_algs geode_memset[2] = { 688 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 689 DUMMY_STRINGOP_ALGS}; 690 static const 691 struct processor_costs geode_cost = { 692 { 693 /* Start of register allocator costs. integer->integer move cost is 2. */ 694 2, /* cost for loading QImode using movzbl */ 695 {2, 2, 2}, /* cost of loading integer registers 696 in QImode, HImode and SImode. 697 Relative to reg-reg move (2). */ 698 {2, 2, 2}, /* cost of storing integer registers */ 699 2, /* cost of reg,reg fld/fst */ 700 {2, 2, 2}, /* cost of loading fp registers 701 in SFmode, DFmode and XFmode */ 702 {4, 6, 6}, /* cost of storing fp registers 703 in SFmode, DFmode and XFmode */ 704 2, /* cost of moving MMX register */ 705 {2, 2}, /* cost of loading MMX registers 706 in SImode and DImode */ 707 {2, 2}, /* cost of storing MMX registers 708 in SImode and DImode */ 709 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 710 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 711 in 32,64,128,256 and 512-bit */ 712 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 713 in 32,64,128,256 and 512-bit */ 714 6, 6, /* SSE->integer and integer->SSE moves */ 715 6, 6, /* mask->integer and integer->mask moves */ 716 {2, 2, 2}, /* cost of loading mask register 717 in QImode, HImode, SImode. */ 718 {2, 2, 2}, /* cost if storing mask register 719 in QImode, HImode, SImode. */ 720 2, /* cost of moving mask register. */ 721 /* End of register allocator costs. 
*/ 722 }, 723 724 COSTS_N_INSNS (1), /* cost of an add instruction */ 725 COSTS_N_INSNS (1), /* cost of a lea instruction */ 726 COSTS_N_INSNS (2), /* variable shift costs */ 727 COSTS_N_INSNS (1), /* constant shift costs */ 728 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 729 COSTS_N_INSNS (4), /* HI */ 730 COSTS_N_INSNS (7), /* SI */ 731 COSTS_N_INSNS (7), /* DI */ 732 COSTS_N_INSNS (7)}, /* other */ 733 0, /* cost of multiply per each bit set */ 734 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ 735 COSTS_N_INSNS (23), /* HI */ 736 COSTS_N_INSNS (39), /* SI */ 737 COSTS_N_INSNS (39), /* DI */ 738 COSTS_N_INSNS (39)}, /* other */ 739 COSTS_N_INSNS (1), /* cost of movsx */ 740 COSTS_N_INSNS (1), /* cost of movzx */ 741 8, /* "large" insn */ 742 4, /* MOVE_RATIO */ 743 4, /* CLEAR_RATIO */ 744 {2, 2, 2}, /* cost of loading integer registers 745 in QImode, HImode and SImode. 746 Relative to reg-reg move (2). */ 747 {2, 2, 2}, /* cost of storing integer registers */ 748 {2, 2, 8, 16, 32}, /* cost of loading SSE register 749 in 32bit, 64bit, 128bit, 256bit and 512bit */ 750 {2, 2, 8, 16, 32}, /* cost of storing SSE register 751 in 32bit, 64bit, 128bit, 256bit and 512bit */ 752 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 753 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 754 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 755 6, /* cost of moving SSE register to integer. */ 756 2, 2, /* Gather load static, per_elt. */ 757 2, 2, /* Gather store static, per_elt. */ 758 64, /* size of l1 cache. */ 759 128, /* size of l2 cache. */ 760 32, /* size of prefetch block */ 761 1, /* number of parallel prefetches */ 762 1, /* Branch cost */ 763 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 764 COSTS_N_INSNS (11), /* cost of FMUL instruction. */ 765 COSTS_N_INSNS (47), /* cost of FDIV instruction. */ 766 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 767 COSTS_N_INSNS (1), /* cost of FCHS instruction. 
*/ 768 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ 769 770 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 771 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 772 COSTS_N_INSNS (11), /* cost of MULSS instruction. */ 773 COSTS_N_INSNS (11), /* cost of MULSD instruction. */ 774 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ 775 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */ 776 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ 777 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ 778 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */ 779 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ 780 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 781 geode_memcpy, 782 geode_memset, 783 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 784 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 785 NULL, /* Loop alignment. */ 786 NULL, /* Jump alignment. */ 787 NULL, /* Label alignment. */ 788 NULL, /* Func alignment. */ 789 }; 790 791 static stringop_algs k6_memcpy[2] = { 792 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 793 DUMMY_STRINGOP_ALGS}; 794 static stringop_algs k6_memset[2] = { 795 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 796 DUMMY_STRINGOP_ALGS}; 797 static const 798 struct processor_costs k6_cost = { 799 { 800 /* Start of register allocator costs. integer->integer move cost is 2. */ 801 3, /* cost for loading QImode using movzbl */ 802 {4, 5, 4}, /* cost of loading integer registers 803 in QImode, HImode and SImode. 804 Relative to reg-reg move (2). 
*/ 805 {2, 3, 2}, /* cost of storing integer registers */ 806 4, /* cost of reg,reg fld/fst */ 807 {6, 6, 6}, /* cost of loading fp registers 808 in SFmode, DFmode and XFmode */ 809 {4, 4, 4}, /* cost of storing fp registers 810 in SFmode, DFmode and XFmode */ 811 2, /* cost of moving MMX register */ 812 {2, 2}, /* cost of loading MMX registers 813 in SImode and DImode */ 814 {2, 2}, /* cost of storing MMX registers 815 in SImode and DImode */ 816 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 817 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 818 in 32,64,128,256 and 512-bit */ 819 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 820 in 32,64,128,256 and 512-bit */ 821 6, 6, /* SSE->integer and integer->SSE moves */ 822 6, 6, /* mask->integer and integer->mask moves */ 823 {4, 5, 4}, /* cost of loading mask register 824 in QImode, HImode, SImode. */ 825 {2, 3, 2}, /* cost of storing mask register 826 in QImode, HImode, SImode. */ 827 2, /* cost of moving mask register. */ 828 /* End of register allocator costs. */ 829 }, 830 831 COSTS_N_INSNS (1), /* cost of an add instruction */ 832 COSTS_N_INSNS (2), /* cost of a lea instruction */ 833 COSTS_N_INSNS (1), /* variable shift costs */ 834 COSTS_N_INSNS (1), /* constant shift costs */ 835 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 836 COSTS_N_INSNS (3), /* HI */ 837 COSTS_N_INSNS (3), /* SI */ 838 COSTS_N_INSNS (3), /* DI */ 839 COSTS_N_INSNS (3)}, /* other */ 840 0, /* cost of multiply per each bit set */ 841 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 842 COSTS_N_INSNS (18), /* HI */ 843 COSTS_N_INSNS (18), /* SI */ 844 COSTS_N_INSNS (18), /* DI */ 845 COSTS_N_INSNS (18)}, /* other */ 846 COSTS_N_INSNS (2), /* cost of movsx */ 847 COSTS_N_INSNS (2), /* cost of movzx */ 848 8, /* "large" insn */ 849 4, /* MOVE_RATIO */ 850 4, /* CLEAR_RATIO */ 851 {4, 5, 4}, /* cost of loading integer registers 852 in QImode, HImode and SImode. 853 Relative to reg-reg move (2). 
*/ 854 {2, 3, 2}, /* cost of storing integer registers */ 855 {2, 2, 8, 16, 32}, /* cost of loading SSE register 856 in 32bit, 64bit, 128bit, 256bit and 512bit */ 857 {2, 2, 8, 16, 32}, /* cost of storing SSE register 858 in 32bit, 64bit, 128bit, 256bit and 512bit */ 859 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 860 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 861 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 862 6, /* cost of moving SSE register to integer. */ 863 2, 2, /* Gather load static, per_elt. */ 864 2, 2, /* Gather store static, per_elt. */ 865 32, /* size of l1 cache. */ 866 32, /* size of l2 cache. Some models 867 have integrated l2 cache, but 868 optimizing for k6 is not important 869 enough to worry about that. */ 870 32, /* size of prefetch block */ 871 1, /* number of parallel prefetches */ 872 1, /* Branch cost */ 873 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ 874 COSTS_N_INSNS (2), /* cost of FMUL instruction. */ 875 COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 876 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 877 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 878 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 879 880 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 881 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 882 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 883 COSTS_N_INSNS (2), /* cost of MULSD instruction. */ 884 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 885 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 886 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */ 887 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ 888 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ 889 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ 890 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 891 k6_memcpy, 892 k6_memset, 893 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 894 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ 895 "32:8:8", /* Loop alignment. */ 896 "32:8:8", /* Jump alignment. */ 897 "0:0:8", /* Label alignment. */ 898 "32", /* Func alignment. */ 899 }; 900 901 /* For some reason, Athlon deals better with REP prefix (relative to loops) 902 compared to K8. Alignment becomes important after 8 bytes for memcpy and 903 128 bytes for memset. */ 904 static stringop_algs athlon_memcpy[2] = { 905 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 906 DUMMY_STRINGOP_ALGS}; 907 static stringop_algs athlon_memset[2] = { 908 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 909 DUMMY_STRINGOP_ALGS}; 910 static const 911 struct processor_costs athlon_cost = { 912 { 913 /* Start of register allocator costs. integer->integer move cost is 2. */ 914 4, /* cost for loading QImode using movzbl */ 915 {3, 4, 3}, /* cost of loading integer registers 916 in QImode, HImode and SImode. 917 Relative to reg-reg move (2). */ 918 {3, 4, 3}, /* cost of storing integer registers */ 919 4, /* cost of reg,reg fld/fst */ 920 {4, 4, 12}, /* cost of loading fp registers 921 in SFmode, DFmode and XFmode */ 922 {6, 6, 8}, /* cost of storing fp registers 923 in SFmode, DFmode and XFmode */ 924 2, /* cost of moving MMX register */ 925 {4, 4}, /* cost of loading MMX registers 926 in SImode and DImode */ 927 {4, 4}, /* cost of storing MMX registers 928 in SImode and DImode */ 929 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 930 {4, 4, 12, 12, 24}, /* cost of loading SSE registers 931 in 32,64,128,256 and 512-bit */ 932 {4, 4, 10, 10, 20}, /* cost of storing SSE registers 933 in 32,64,128,256 and 512-bit */ 934 5, 5, /* SSE->integer and integer->SSE moves */ 935 5, 5, /* mask->integer and integer->mask moves */ 936 {3, 4, 3}, /* cost of loading mask register 937 in QImode, HImode, SImode. */ 938 {3, 4, 3}, /* cost of storing mask register 939 in QImode, HImode, SImode. */ 940 2, /* cost of moving mask register. */ 941 /* End of register allocator costs. 
*/ 942 }, 943 944 COSTS_N_INSNS (1), /* cost of an add instruction */ 945 COSTS_N_INSNS (2), /* cost of a lea instruction */ 946 COSTS_N_INSNS (1), /* variable shift costs */ 947 COSTS_N_INSNS (1), /* constant shift costs */ 948 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ 949 COSTS_N_INSNS (5), /* HI */ 950 COSTS_N_INSNS (5), /* SI */ 951 COSTS_N_INSNS (5), /* DI */ 952 COSTS_N_INSNS (5)}, /* other */ 953 0, /* cost of multiply per each bit set */ 954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 955 COSTS_N_INSNS (26), /* HI */ 956 COSTS_N_INSNS (42), /* SI */ 957 COSTS_N_INSNS (74), /* DI */ 958 COSTS_N_INSNS (74)}, /* other */ 959 COSTS_N_INSNS (1), /* cost of movsx */ 960 COSTS_N_INSNS (1), /* cost of movzx */ 961 8, /* "large" insn */ 962 9, /* MOVE_RATIO */ 963 6, /* CLEAR_RATIO */ 964 {3, 4, 3}, /* cost of loading integer registers 965 in QImode, HImode and SImode. 966 Relative to reg-reg move (2). */ 967 {3, 4, 3}, /* cost of storing integer registers */ 968 {4, 4, 12, 12, 24}, /* cost of loading SSE register 969 in 32bit, 64bit, 128bit, 256bit and 512bit */ 970 {4, 4, 10, 10, 20}, /* cost of storing SSE register 971 in 32bit, 64bit, 128bit, 256bit and 512bit */ 972 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */ 973 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 974 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 975 5, /* cost of moving SSE register to integer. */ 976 4, 4, /* Gather load static, per_elt. */ 977 4, 4, /* Gather store static, per_elt. */ 978 64, /* size of l1 cache. */ 979 256, /* size of l2 cache. */ 980 64, /* size of prefetch block */ 981 6, /* number of parallel prefetches */ 982 5, /* Branch cost */ 983 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 984 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 985 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 986 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 987 COSTS_N_INSNS (2), /* cost of FCHS instruction. 
*/ 988 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 989 990 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 991 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 992 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 993 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 994 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 995 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 996 /* 11-16 */ 997 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 998 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ 999 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 1000 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ 1001 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1002 athlon_memcpy, 1003 athlon_memset, 1004 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1005 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1006 "16:8:8", /* Loop alignment. */ 1007 "16:8:8", /* Jump alignment. */ 1008 "0:0:8", /* Label alignment. */ 1009 "16", /* Func alignment. */ 1010 }; 1011 1012 /* K8 has optimized REP instruction for medium sized blocks, but for very 1013 small blocks it is better to use loop. For large blocks, libcall can 1014 do nontemporary accesses and beat inline considerably. */ 1015 static stringop_algs k8_memcpy[2] = { 1016 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1017 {-1, rep_prefix_4_byte, false}}}, 1018 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1019 {-1, libcall, false}}}}; 1020 static stringop_algs k8_memset[2] = { 1021 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1022 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1023 {libcall, {{48, unrolled_loop, false}, 1024 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1025 static const 1026 struct processor_costs k8_cost = { 1027 { 1028 /* Start of register allocator costs. integer->integer move cost is 2. 
*/ 1029 4, /* cost for loading QImode using movzbl */ 1030 {3, 4, 3}, /* cost of loading integer registers 1031 in QImode, HImode and SImode. 1032 Relative to reg-reg move (2). */ 1033 {3, 4, 3}, /* cost of storing integer registers */ 1034 4, /* cost of reg,reg fld/fst */ 1035 {4, 4, 12}, /* cost of loading fp registers 1036 in SFmode, DFmode and XFmode */ 1037 {6, 6, 8}, /* cost of storing fp registers 1038 in SFmode, DFmode and XFmode */ 1039 2, /* cost of moving MMX register */ 1040 {3, 3}, /* cost of loading MMX registers 1041 in SImode and DImode */ 1042 {4, 4}, /* cost of storing MMX registers 1043 in SImode and DImode */ 1044 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1045 {4, 3, 12, 12, 24}, /* cost of loading SSE registers 1046 in 32,64,128,256 and 512-bit */ 1047 {4, 4, 10, 10, 20}, /* cost of storing SSE registers 1048 in 32,64,128,256 and 512-bit */ 1049 5, 5, /* SSE->integer and integer->SSE moves */ 1050 5, 5, /* mask->integer and integer->mask moves */ 1051 {3, 4, 3}, /* cost of loading mask register 1052 in QImode, HImode, SImode. */ 1053 {3, 4, 3}, /* cost of storing mask register 1054 in QImode, HImode, SImode. */ 1055 2, /* cost of moving mask register. */ 1056 /* End of register allocator costs. 
*/ 1057 }, 1058 1059 COSTS_N_INSNS (1), /* cost of an add instruction */ 1060 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1061 COSTS_N_INSNS (1), /* variable shift costs */ 1062 COSTS_N_INSNS (1), /* constant shift costs */ 1063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1064 COSTS_N_INSNS (4), /* HI */ 1065 COSTS_N_INSNS (3), /* SI */ 1066 COSTS_N_INSNS (4), /* DI */ 1067 COSTS_N_INSNS (5)}, /* other */ 1068 0, /* cost of multiply per each bit set */ 1069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 1070 COSTS_N_INSNS (26), /* HI */ 1071 COSTS_N_INSNS (42), /* SI */ 1072 COSTS_N_INSNS (74), /* DI */ 1073 COSTS_N_INSNS (74)}, /* other */ 1074 COSTS_N_INSNS (1), /* cost of movsx */ 1075 COSTS_N_INSNS (1), /* cost of movzx */ 1076 8, /* "large" insn */ 1077 9, /* MOVE_RATIO */ 1078 6, /* CLEAR_RATIO */ 1079 {3, 4, 3}, /* cost of loading integer registers 1080 in QImode, HImode and SImode. 1081 Relative to reg-reg move (2). */ 1082 {3, 4, 3}, /* cost of storing integer registers */ 1083 {4, 3, 12, 12, 24}, /* cost of loading SSE register 1084 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1085 {4, 4, 10, 10, 20}, /* cost of storing SSE register 1086 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1087 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */ 1088 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 1089 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1090 5, /* cost of moving SSE register to integer. */ 1091 4, 4, /* Gather load static, per_elt. */ 1092 4, 4, /* Gather store static, per_elt. */ 1093 64, /* size of l1 cache. */ 1094 512, /* size of l2 cache. */ 1095 64, /* size of prefetch block */ 1096 /* New AMD processors never drop prefetches; if they cannot be performed 1097 immediately, they are queued. We set number of simultaneous prefetches 1098 to a large constant to reflect this (it probably is not a good idea not 1099 to limit number of prefetches at all, as their execution also takes some 1100 time). 
*/ 1101 100, /* number of parallel prefetches */ 1102 3, /* Branch cost */ 1103 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1104 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1105 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1106 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1107 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1108 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1109 1110 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1111 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1112 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1113 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1114 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 1115 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 1116 /* 11-16 */ 1117 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 1118 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 1119 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 1120 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 1121 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1122 k8_memcpy, 1123 k8_memset, 1124 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1125 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1126 "16:8:8", /* Loop alignment. */ 1127 "16:8:8", /* Jump alignment. */ 1128 "0:0:8", /* Label alignment. */ 1129 "16", /* Func alignment. */ 1130 }; 1131 1132 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for 1133 very small blocks it is better to use loop. For large blocks, libcall can 1134 do nontemporary accesses and beat inline considerably. 
*/ 1135 static stringop_algs amdfam10_memcpy[2] = { 1136 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1137 {-1, rep_prefix_4_byte, false}}}, 1138 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1139 {-1, libcall, false}}}}; 1140 static stringop_algs amdfam10_memset[2] = { 1141 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1142 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1143 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1144 {-1, libcall, false}}}}; 1145 struct processor_costs amdfam10_cost = { 1146 { 1147 /* Start of register allocator costs. integer->integer move cost is 2. */ 1148 4, /* cost for loading QImode using movzbl */ 1149 {3, 4, 3}, /* cost of loading integer registers 1150 in QImode, HImode and SImode. 1151 Relative to reg-reg move (2). */ 1152 {3, 4, 3}, /* cost of storing integer registers */ 1153 4, /* cost of reg,reg fld/fst */ 1154 {4, 4, 12}, /* cost of loading fp registers 1155 in SFmode, DFmode and XFmode */ 1156 {6, 6, 8}, /* cost of storing fp registers 1157 in SFmode, DFmode and XFmode */ 1158 2, /* cost of moving MMX register */ 1159 {3, 3}, /* cost of loading MMX registers 1160 in SImode and DImode */ 1161 {4, 4}, /* cost of storing MMX registers 1162 in SImode and DImode */ 1163 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1164 {4, 4, 3, 6, 12}, /* cost of loading SSE registers 1165 in 32,64,128,256 and 512-bit */ 1166 {4, 4, 5, 10, 20}, /* cost of storing SSE registers 1167 in 32,64,128,256 and 512-bit */ 1168 3, 3, /* SSE->integer and integer->SSE moves */ 1169 3, 3, /* mask->integer and integer->mask moves */ 1170 {3, 4, 3}, /* cost of loading mask register 1171 in QImode, HImode, SImode. */ 1172 {3, 4, 3}, /* cost of storing mask register 1173 in QImode, HImode, SImode. */ 1174 2, /* cost of moving mask register. 
*/ 1175 1176 /* On K8: 1177 MOVD reg64, xmmreg Double FSTORE 4 1178 MOVD reg32, xmmreg Double FSTORE 4 1179 On AMDFAM10: 1180 MOVD reg64, xmmreg Double FADD 3 1181 1/1 1/1 1182 MOVD reg32, xmmreg Double FADD 3 1183 1/1 1/1 */ 1184 /* End of register allocator costs. */ 1185 }, 1186 1187 COSTS_N_INSNS (1), /* cost of an add instruction */ 1188 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1189 COSTS_N_INSNS (1), /* variable shift costs */ 1190 COSTS_N_INSNS (1), /* constant shift costs */ 1191 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1192 COSTS_N_INSNS (4), /* HI */ 1193 COSTS_N_INSNS (3), /* SI */ 1194 COSTS_N_INSNS (4), /* DI */ 1195 COSTS_N_INSNS (5)}, /* other */ 1196 0, /* cost of multiply per each bit set */ 1197 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1198 COSTS_N_INSNS (35), /* HI */ 1199 COSTS_N_INSNS (51), /* SI */ 1200 COSTS_N_INSNS (83), /* DI */ 1201 COSTS_N_INSNS (83)}, /* other */ 1202 COSTS_N_INSNS (1), /* cost of movsx */ 1203 COSTS_N_INSNS (1), /* cost of movzx */ 1204 8, /* "large" insn */ 1205 9, /* MOVE_RATIO */ 1206 6, /* CLEAR_RATIO */ 1207 {3, 4, 3}, /* cost of loading integer registers 1208 in QImode, HImode and SImode. 1209 Relative to reg-reg move (2). */ 1210 {3, 4, 3}, /* cost of storing integer registers */ 1211 {4, 4, 3, 6, 12}, /* cost of loading SSE register 1212 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1213 {4, 4, 5, 10, 20}, /* cost of storing SSE register 1214 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1215 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ 1216 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 1217 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1218 3, /* cost of moving SSE register to integer. */ 1219 4, 4, /* Gather load static, per_elt. */ 1220 4, 4, /* Gather store static, per_elt. */ 1221 64, /* size of l1 cache. */ 1222 512, /* size of l2 cache. 
*/ 1223 64, /* size of prefetch block */ 1224 /* New AMD processors never drop prefetches; if they cannot be performed 1225 immediately, they are queued. We set number of simultaneous prefetches 1226 to a large constant to reflect this (it probably is not a good idea not 1227 to limit number of prefetches at all, as their execution also takes some 1228 time). */ 1229 100, /* number of parallel prefetches */ 1230 2, /* Branch cost */ 1231 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1232 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1233 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1234 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1235 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1236 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1237 1238 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1239 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1240 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1241 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1242 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 1243 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 1244 /* 11-16 */ 1245 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 1246 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 1247 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 1248 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 1249 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1250 amdfam10_memcpy, 1251 amdfam10_memset, 1252 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1253 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1254 "32:25:8", /* Loop alignment. */ 1255 "32:8:8", /* Jump alignment. */ 1256 "0:0:8", /* Label alignment. */ 1257 "32", /* Func alignment. */ 1258 }; 1259 1260 /* BDVER has optimized REP instruction for medium sized blocks, but for 1261 very small blocks it is better to use loop. For large blocks, libcall 1262 can do nontemporary accesses and beat inline considerably. 
*/ 1263 static stringop_algs bdver_memcpy[2] = { 1264 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1265 {-1, rep_prefix_4_byte, false}}}, 1266 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1267 {-1, libcall, false}}}}; 1268 static stringop_algs bdver_memset[2] = { 1269 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1270 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1271 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1272 {-1, libcall, false}}}}; 1273 1274 const struct processor_costs bdver_cost = { 1275 { 1276 /* Start of register allocator costs. integer->integer move cost is 2. */ 1277 8, /* cost for loading QImode using movzbl */ 1278 {8, 8, 8}, /* cost of loading integer registers 1279 in QImode, HImode and SImode. 1280 Relative to reg-reg move (2). */ 1281 {8, 8, 8}, /* cost of storing integer registers */ 1282 4, /* cost of reg,reg fld/fst */ 1283 {12, 12, 28}, /* cost of loading fp registers 1284 in SFmode, DFmode and XFmode */ 1285 {10, 10, 18}, /* cost of storing fp registers 1286 in SFmode, DFmode and XFmode */ 1287 4, /* cost of moving MMX register */ 1288 {12, 12}, /* cost of loading MMX registers 1289 in SImode and DImode */ 1290 {10, 10}, /* cost of storing MMX registers 1291 in SImode and DImode */ 1292 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1293 {12, 12, 10, 40, 60}, /* cost of loading SSE registers 1294 in 32,64,128,256 and 512-bit */ 1295 {10, 10, 10, 40, 60}, /* cost of storing SSE registers 1296 in 32,64,128,256 and 512-bit */ 1297 16, 20, /* SSE->integer and integer->SSE moves */ 1298 16, 20, /* mask->integer and integer->mask moves */ 1299 {8, 8, 8}, /* cost of loading mask register 1300 in QImode, HImode, SImode. */ 1301 {8, 8, 8}, /* cost of storing mask register 1302 in QImode, HImode, SImode. */ 1303 2, /* cost of moving mask register. */ 1304 /* End of register allocator costs. 
*/ 1305 }, 1306 1307 COSTS_N_INSNS (1), /* cost of an add instruction */ 1308 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1309 COSTS_N_INSNS (1), /* variable shift costs */ 1310 COSTS_N_INSNS (1), /* constant shift costs */ 1311 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1312 COSTS_N_INSNS (4), /* HI */ 1313 COSTS_N_INSNS (4), /* SI */ 1314 COSTS_N_INSNS (6), /* DI */ 1315 COSTS_N_INSNS (6)}, /* other */ 1316 0, /* cost of multiply per each bit set */ 1317 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1318 COSTS_N_INSNS (35), /* HI */ 1319 COSTS_N_INSNS (51), /* SI */ 1320 COSTS_N_INSNS (83), /* DI */ 1321 COSTS_N_INSNS (83)}, /* other */ 1322 COSTS_N_INSNS (1), /* cost of movsx */ 1323 COSTS_N_INSNS (1), /* cost of movzx */ 1324 8, /* "large" insn */ 1325 9, /* MOVE_RATIO */ 1326 6, /* CLEAR_RATIO */ 1327 {8, 8, 8}, /* cost of loading integer registers 1328 in QImode, HImode and SImode. 1329 Relative to reg-reg move (2). */ 1330 {8, 8, 8}, /* cost of storing integer registers */ 1331 {12, 12, 10, 40, 60}, /* cost of loading SSE register 1332 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1333 {10, 10, 10, 40, 60}, /* cost of storing SSE register 1334 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1335 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ 1336 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 1337 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1338 16, /* cost of moving SSE register to integer. */ 1339 12, 12, /* Gather load static, per_elt. */ 1340 10, 10, /* Gather store static, per_elt. */ 1341 16, /* size of l1 cache. */ 1342 2048, /* size of l2 cache. */ 1343 64, /* size of prefetch block */ 1344 /* New AMD processors never drop prefetches; if they cannot be performed 1345 immediately, they are queued. We set number of simultaneous prefetches 1346 to a large constant to reflect this (it probably is not a good idea not 1347 to limit number of prefetches at all, as their execution also takes some 1348 time). 
*/ 1349 100, /* number of parallel prefetches */ 1350 2, /* Branch cost */ 1351 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1352 COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1353 COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1354 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1355 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1356 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1357 1358 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1359 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1360 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1361 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1362 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1363 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1364 /* 9-24 */ 1365 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1366 /* 9-27 */ 1367 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1368 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1369 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1370 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1371 bdver_memcpy, 1372 bdver_memset, 1373 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1374 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1375 "16:11:8", /* Loop alignment. */ 1376 "16:8:8", /* Jump alignment. */ 1377 "0:0:8", /* Label alignment. */ 1378 "11", /* Func alignment. */ 1379 }; 1380 1381 1382 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for 1383 very small blocks it is better to use loop. For large blocks, libcall 1384 can do nontemporary accesses and beat inline considerably. */ 1385 static stringop_algs znver1_memcpy[2] = { 1386 /* 32-bit tuning. */ 1387 {libcall, {{6, loop, false}, 1388 {14, unrolled_loop, false}, 1389 {-1, libcall, false}}}, 1390 /* 64-bit tuning. */ 1391 {libcall, {{16, loop, false}, 1392 {128, rep_prefix_8_byte, false}, 1393 {-1, libcall, false}}}}; 1394 static stringop_algs znver1_memset[2] = { 1395 /* 32-bit tuning. 
*/ 1396 {libcall, {{8, loop, false}, 1397 {24, unrolled_loop, false}, 1398 {128, rep_prefix_4_byte, false}, 1399 {-1, libcall, false}}}, 1400 /* 64-bit tuning. */ 1401 {libcall, {{48, unrolled_loop, false}, 1402 {128, rep_prefix_8_byte, false}, 1403 {-1, libcall, false}}}}; 1404 struct processor_costs znver1_cost = { 1405 { 1406 /* Start of register allocator costs. integer->integer move cost is 2. */ 1407 1408 /* reg-reg moves are done by renaming and thus they are even cheaper than 1409 1 cycle. Because reg-reg move cost is 2 and the following tables correspond 1410 to doubles of latencies, we do not model this correctly. It does not 1411 seem to make practical difference to bump prices up even more. */ 1412 6, /* cost for loading QImode using 1413 movzbl. */ 1414 {6, 6, 6}, /* cost of loading integer registers 1415 in QImode, HImode and SImode. 1416 Relative to reg-reg move (2). */ 1417 {8, 8, 8}, /* cost of storing integer 1418 registers. */ 1419 2, /* cost of reg,reg fld/fst. */ 1420 {6, 6, 16}, /* cost of loading fp registers 1421 in SFmode, DFmode and XFmode. */ 1422 {8, 8, 16}, /* cost of storing fp registers 1423 in SFmode, DFmode and XFmode. */ 1424 2, /* cost of moving MMX register. */ 1425 {6, 6}, /* cost of loading MMX registers 1426 in SImode and DImode. */ 1427 {8, 8}, /* cost of storing MMX registers 1428 in SImode and DImode. */ 1429 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 1430 {6, 6, 6, 12, 24}, /* cost of loading SSE registers 1431 in 32,64,128,256 and 512-bit. */ 1432 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 1433 in 32,64,128,256 and 512-bit. */ 1434 6, 6, /* SSE->integer and integer->SSE moves. */ 1435 8, 8, /* mask->integer and integer->mask moves */ 1436 {6, 6, 6}, /* cost of loading mask register 1437 in QImode, HImode, SImode. */ 1438 {8, 8, 8}, /* cost of storing mask register 1439 in QImode, HImode, SImode. */ 1440 2, /* cost of moving mask register. */ 1441 /* End of register allocator costs. 
*/ 1442 }, 1443 1444 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1445 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1446 COSTS_N_INSNS (1), /* variable shift costs. */ 1447 COSTS_N_INSNS (1), /* constant shift costs. */ 1448 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1449 COSTS_N_INSNS (3), /* HI. */ 1450 COSTS_N_INSNS (3), /* SI. */ 1451 COSTS_N_INSNS (3), /* DI. */ 1452 COSTS_N_INSNS (3)}, /* other. */ 1453 0, /* cost of multiply per each bit 1454 set. */ 1455 /* Depending on parameters, idiv can get faster on ryzen. This is upper 1456 bound. */ 1457 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1458 COSTS_N_INSNS (22), /* HI. */ 1459 COSTS_N_INSNS (30), /* SI. */ 1460 COSTS_N_INSNS (45), /* DI. */ 1461 COSTS_N_INSNS (45)}, /* other. */ 1462 COSTS_N_INSNS (1), /* cost of movsx. */ 1463 COSTS_N_INSNS (1), /* cost of movzx. */ 1464 8, /* "large" insn. */ 1465 9, /* MOVE_RATIO. */ 1466 6, /* CLEAR_RATIO */ 1467 {6, 6, 6}, /* cost of loading integer registers 1468 in QImode, HImode and SImode. 1469 Relative to reg-reg move (2). */ 1470 {8, 8, 8}, /* cost of storing integer 1471 registers. */ 1472 {6, 6, 6, 12, 24}, /* cost of loading SSE register 1473 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1474 {8, 8, 8, 16, 32}, /* cost of storing SSE register 1475 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1476 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ 1477 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 1478 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 1479 6, /* cost of moving SSE register to integer. */ 1480 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1481 throughput 12. Approx 9 uops do not depend on vector size and every load 1482 is 7 uops. */ 1483 18, 8, /* Gather load static, per_elt. */ 1484 18, 10, /* Gather store static, per_elt. */ 1485 32, /* size of l1 cache. */ 1486 512, /* size of l2 cache. */ 1487 64, /* size of prefetch block. 
*/ 1488 /* New AMD processors never drop prefetches; if they cannot be performed 1489 immediately, they are queued. We set number of simultaneous prefetches 1490 to a large constant to reflect this (it probably is not a good idea not 1491 to limit number of prefetches at all, as their execution also takes some 1492 time). */ 1493 100, /* number of parallel prefetches. */ 1494 3, /* Branch cost. */ 1495 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1496 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1497 /* Latency of fdiv is 8-15. */ 1498 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1499 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1501 /* Latency of fsqrt is 4-10. */ 1502 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1503 1504 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1505 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1506 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1507 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1508 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1509 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1510 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1511 /* 9-13 */ 1512 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1513 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1514 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1515 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles 1516 and it can execute 2 integer additions and 2 multiplications thus 1517 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest 1518 that 4 works better than 6 probably due to register pressure. 1519 1520 Integer vector operations are taken by FP unit and execute 3 vector 1521 plus/minus operations per cycle but only one multiply. This is adjusted 1522 in ix86_reassociation_width. */ 1523 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. 
*/ 1524 znver1_memcpy, 1525 znver1_memset, 1526 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1527 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1528 "16", /* Loop alignment. */ 1529 "16", /* Jump alignment. */ 1530 "0:0:8", /* Label alignment. */ 1531 "16", /* Func alignment. */ 1532 }; 1533 1534 /* ZNVER2 has optimized REP instruction for medium sized blocks, but for 1535 very small blocks it is better to use loop. For large blocks, libcall 1536 can do non-temporal accesses and beat inline considerably. */ 1537 static stringop_algs znver2_memcpy[2] = { 1538 /* 32-bit tuning. */ 1539 {libcall, {{6, loop, false}, 1540 {14, unrolled_loop, false}, 1541 {-1, libcall, false}}}, 1542 /* 64-bit tuning. */ 1543 {libcall, {{16, loop, false}, 1544 {64, rep_prefix_4_byte, false}, 1545 {-1, libcall, false}}}}; 1546 /* memset counterpart of znver2_memcpy: entry [0] is the 32-bit tuning, entry [1] the 64-bit tuning. */ static stringop_algs znver2_memset[2] = { 1547 /* 32-bit tuning. */ 1548 {libcall, {{8, loop, false}, 1549 {24, unrolled_loop, false}, 1550 {128, rep_prefix_4_byte, false}, 1551 {-1, libcall, false}}}, 1552 /* 64-bit tuning. */ 1553 {libcall, {{24, rep_prefix_4_byte, false}, 1554 {128, rep_prefix_8_byte, false}, 1555 {-1, libcall, false}}}}; 1556 1557 struct processor_costs znver2_cost = { 1558 { 1559 /* Start of register allocator costs. integer->integer move cost is 2. */ 1560 1561 /* reg-reg moves are done by renaming and thus they are even cheaper than 1562 1 cycle. Because reg-reg move cost is 2 and following tables correspond 1563 to doubles of latencies, we do not model this correctly. It does not 1564 seem to make practical difference to bump prices up even more. */ 1565 6, /* cost for loading QImode using 1566 movzbl. */ 1567 {6, 6, 6}, /* cost of loading integer registers 1568 in QImode, HImode and SImode. 1569 Relative to reg-reg move (2). */ 1570 {8, 8, 8}, /* cost of storing integer 1571 registers. */ 1572 2, /* cost of reg,reg fld/fst. */ 1573 {6, 6, 16}, /* cost of loading fp registers 1574 in SFmode, DFmode and XFmode.
*/ 1575 {8, 8, 16}, /* cost of storing fp registers 1576 in SFmode, DFmode and XFmode. */ 1577 2, /* cost of moving MMX register. */ 1578 {6, 6}, /* cost of loading MMX registers 1579 in SImode and DImode. */ 1580 {8, 8}, /* cost of storing MMX registers 1581 in SImode and DImode. */ 1582 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1583 register. */ 1584 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1585 in 32,64,128,256 and 512-bit. */ 1586 {8, 8, 8, 8, 16}, /* cost of storing SSE registers 1587 in 32,64,128,256 and 512-bit. */ 1588 6, 6, /* SSE->integer and integer->SSE 1589 moves. */ 1590 8, 8, /* mask->integer and integer->mask moves */ 1591 {6, 6, 6}, /* cost of loading mask register 1592 in QImode, HImode, SImode. */ 1593 {8, 8, 8}, /* cost if storing mask register 1594 in QImode, HImode, SImode. */ 1595 2, /* cost of moving mask register. */ 1596 /* End of register allocator costs. */ 1597 }, 1598 1599 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1600 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1601 COSTS_N_INSNS (1), /* variable shift costs. */ 1602 COSTS_N_INSNS (1), /* constant shift costs. */ 1603 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1604 COSTS_N_INSNS (3), /* HI. */ 1605 COSTS_N_INSNS (3), /* SI. */ 1606 COSTS_N_INSNS (3), /* DI. */ 1607 COSTS_N_INSNS (3)}, /* other. */ 1608 0, /* cost of multiply per each bit 1609 set. */ 1610 /* Depending on parameters, idiv can get faster on ryzen. This is upper 1611 bound. */ 1612 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1613 COSTS_N_INSNS (22), /* HI. */ 1614 COSTS_N_INSNS (30), /* SI. */ 1615 COSTS_N_INSNS (45), /* DI. */ 1616 COSTS_N_INSNS (45)}, /* other. */ 1617 COSTS_N_INSNS (1), /* cost of movsx. */ 1618 COSTS_N_INSNS (1), /* cost of movzx. */ 1619 8, /* "large" insn. */ 1620 9, /* MOVE_RATIO. */ 1621 6, /* CLEAR_RATIO */ 1622 {6, 6, 6}, /* cost of loading integer registers 1623 in QImode, HImode and SImode. 1624 Relative to reg-reg move (2). 
*/ 1625 {8, 8, 8}, /* cost of storing integer 1626 registers. */ 1627 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1628 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1629 {8, 8, 8, 8, 16}, /* cost of storing SSE register 1630 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1631 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 1632 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1633 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1634 register. */ 1635 6, /* cost of moving SSE register to integer. */ 1636 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops, 1637 throughput 12. Approx 9 uops do not depend on vector size and every load 1638 is 7 uops. */ 1639 18, 8, /* Gather load static, per_elt. */ 1640 18, 10, /* Gather store static, per_elt. */ 1641 32, /* size of l1 cache. */ 1642 512, /* size of l2 cache. */ 1643 64, /* size of prefetch block. */ 1644 /* New AMD processors never drop prefetches; if they cannot be performed 1645 immediately, they are queued. We set number of simultaneous prefetches 1646 to a large constant to reflect this (it probably is not a good idea not 1647 to limit number of prefetches at all, as their execution also takes some 1648 time). */ 1649 100, /* number of parallel prefetches. */ 1650 3, /* Branch cost. */ 1651 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1652 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1653 /* Latency of fdiv is 8-15. */ 1654 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1655 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1656 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1657 /* Latency of fsqrt is 4-10. */ 1658 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1659 1660 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1661 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1662 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1663 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 1664 COSTS_N_INSNS (5), /* cost of FMA SS instruction.
*/ 1665 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1666 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1667 /* 9-13. */ 1668 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1669 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1670 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1671 /* Zen can execute 4 integer operations per cycle. FP operations 1672 take 3 cycles and it can execute 2 integer additions and 2 1673 multiplications thus reassociation may make sense up to width of 6. 1674 SPEC2k6 benchmarks suggest 1675 that 4 works better than 6 probably due to register pressure. 1676 1677 Integer vector operations are taken by FP unit and execute 3 vector 1678 plus/minus operations per cycle but only one multiply. This is adjusted 1679 in ix86_reassociation_width. */ 1680 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1681 znver2_memcpy, 1682 znver2_memset, 1683 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1684 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1685 "16", /* Loop alignment. */ 1686 "16", /* Jump alignment. */ 1687 "0:0:8", /* Label alignment. */ 1688 "16", /* Func alignment. */ 1689 }; 1690 1691 struct processor_costs znver3_cost = { 1692 { 1693 /* Start of register allocator costs. integer->integer move cost is 2. */ 1694 1695 /* reg-reg moves are done by renaming and thus they are even cheaper than 1696 1 cycle. Because reg-reg move cost is 2 and following tables correspond 1697 to doubles of latencies, we do not model this correctly. It does not 1698 seem to make practical difference to bump prices up even more. */ 1699 6, /* cost for loading QImode using 1700 movzbl. */ 1701 {6, 6, 6}, /* cost of loading integer registers 1702 in QImode, HImode and SImode. 1703 Relative to reg-reg move (2). */ 1704 {8, 8, 8}, /* cost of storing integer 1705 registers. */ 1706 2, /* cost of reg,reg fld/fst. */ 1707 {6, 6, 16}, /* cost of loading fp registers 1708 in SFmode, DFmode and XFmode.
*/ 1709 {8, 8, 16}, /* cost of storing fp registers 1710 in SFmode, DFmode and XFmode. */ 1711 2, /* cost of moving MMX register. */ 1712 {6, 6}, /* cost of loading MMX registers 1713 in SImode and DImode. */ 1714 {8, 8}, /* cost of storing MMX registers 1715 in SImode and DImode. */ 1716 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1717 register. */ 1718 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1719 in 32,64,128,256 and 512-bit. */ 1720 {8, 8, 8, 8, 16}, /* cost of storing SSE registers 1721 in 32,64,128,256 and 512-bit. */ 1722 6, 6, /* SSE->integer and integer->SSE 1723 moves. */ 1724 8, 8, /* mask->integer and integer->mask moves */ 1725 {6, 6, 6}, /* cost of loading mask register 1726 in QImode, HImode, SImode. */ 1727 {8, 8, 8}, /* cost if storing mask register 1728 in QImode, HImode, SImode. */ 1729 2, /* cost of moving mask register. */ 1730 /* End of register allocator costs. */ 1731 }, 1732 1733 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1734 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1735 COSTS_N_INSNS (1), /* variable shift costs. */ 1736 COSTS_N_INSNS (1), /* constant shift costs. */ 1737 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1738 COSTS_N_INSNS (3), /* HI. */ 1739 COSTS_N_INSNS (3), /* SI. */ 1740 COSTS_N_INSNS (3), /* DI. */ 1741 COSTS_N_INSNS (3)}, /* other. */ 1742 0, /* cost of multiply per each bit 1743 set. */ 1744 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ 1745 COSTS_N_INSNS (10), /* HI. */ 1746 COSTS_N_INSNS (12), /* SI. */ 1747 COSTS_N_INSNS (17), /* DI. */ 1748 COSTS_N_INSNS (17)}, /* other. */ 1749 COSTS_N_INSNS (1), /* cost of movsx. */ 1750 COSTS_N_INSNS (1), /* cost of movzx. */ 1751 8, /* "large" insn. */ 1752 9, /* MOVE_RATIO. */ 1753 6, /* CLEAR_RATIO */ 1754 {6, 6, 6}, /* cost of loading integer registers 1755 in QImode, HImode and SImode. 1756 Relative to reg-reg move (2). */ 1757 {8, 8, 8}, /* cost of storing integer 1758 registers. 
*/ 1759 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1760 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1761 {8, 8, 8, 8, 16}, /* cost of storing SSE register 1762 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1763 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 1764 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1765 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1766 register. */ 1767 6, /* cost of moving SSE register to integer. */ 1768 /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, 1769 throughput 9. Approx 7 uops do not depend on vector size and every load 1770 is 4 uops. */ 1771 14, 8, /* Gather load static, per_elt. */ 1772 14, 10, /* Gather store static, per_elt. */ 1773 32, /* size of l1 cache. */ 1774 512, /* size of l2 cache. */ 1775 64, /* size of prefetch block. */ 1776 /* New AMD processors never drop prefetches; if they cannot be performed 1777 immediately, they are queued. We set number of simultaneous prefetches 1778 to a large constant to reflect this (it probably is not a good idea not 1779 to limit number of prefetches at all, as their execution also takes some 1780 time). */ 1781 100, /* number of parallel prefetches. */ 1782 3, /* Branch cost. */ 1783 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1784 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1785 /* Latency of fdiv is 8-15. */ 1786 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1787 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1788 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1789 /* Latency of fsqrt is 4-10. */ 1790 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1791 1792 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1793 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1794 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1795 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 1796 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1797 COSTS_N_INSNS (5), /* cost of FMA SD instruction. 
*/ 1798 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1799 /* 9-13. */ 1800 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1801 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1802 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1803 /* Zen can execute 4 integer operations per cycle. FP operations 1804 take 3 cycles and it can execute 2 integer additions and 2 1805 multiplications thus reassociation may make sense up to width of 6. 1806 SPEC2k6 benchmarks suggest 1807 that 4 works better than 6 probably due to register pressure. 1808 1809 Integer vector operations are taken by FP unit and execute 3 vector 1810 plus/minus operations per cycle but only one multiply. This is adjusted 1811 in ix86_reassociation_width. */ 1812 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1813 /* znver3_cost reuses the znver2 stringop tables. */ znver2_memcpy, 1814 znver2_memset, 1815 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1816 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1817 "16", /* Loop alignment. */ 1818 "16", /* Jump alignment. */ 1819 "0:0:8", /* Label alignment. */ 1820 "16", /* Func alignment. */ 1821 }; 1822 1823 /* This table started as a copy of the znver3_cost table; several entries (e.g. fp load/store, divide, FADD/FMUL, FSQRT, l2 cache size) now differ from it. */ 1824 struct processor_costs znver4_cost = { 1825 { 1826 /* Start of register allocator costs. integer->integer move cost is 2. */ 1827 1828 /* reg-reg moves are done by renaming and thus they are even cheaper than 1829 1 cycle. Because reg-reg move cost is 2 and following tables correspond 1830 to doubles of latencies, we do not model this correctly. It does not 1831 seem to make practical difference to bump prices up even more. */ 1832 6, /* cost for loading QImode using 1833 movzbl. */ 1834 {6, 6, 6}, /* cost of loading integer registers 1835 in QImode, HImode and SImode. 1836 Relative to reg-reg move (2). */ 1837 {8, 8, 8}, /* cost of storing integer 1838 registers. */ 1839 2, /* cost of reg,reg fld/fst. */ 1840 {14, 14, 17}, /* cost of loading fp registers 1841 in SFmode, DFmode and XFmode.
*/ 1842 {12, 12, 16}, /* cost of storing fp registers 1843 in SFmode, DFmode and XFmode. */ 1844 2, /* cost of moving MMX register. */ 1845 {6, 6}, /* cost of loading MMX registers 1846 in SImode and DImode. */ 1847 {8, 8}, /* cost of storing MMX registers 1848 in SImode and DImode. */ 1849 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1850 register. */ 1851 {6, 6, 10, 10, 12}, /* cost of loading SSE registers 1852 in 32,64,128,256 and 512-bit. */ 1853 {8, 8, 8, 12, 12}, /* cost of storing SSE registers 1854 in 32,64,128,256 and 512-bit. */ 1855 6, 8, /* SSE->integer and integer->SSE 1856 moves. */ 1857 8, 8, /* mask->integer and integer->mask moves */ 1858 {6, 6, 6}, /* cost of loading mask register 1859 in QImode, HImode, SImode. */ 1860 {8, 8, 8}, /* cost if storing mask register 1861 in QImode, HImode, SImode. */ 1862 2, /* cost of moving mask register. */ 1863 /* End of register allocator costs. */ 1864 }, 1865 1866 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1867 /* TODO: Lea with 3 components has cost 2. */ 1868 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1869 COSTS_N_INSNS (1), /* variable shift costs. */ 1870 COSTS_N_INSNS (1), /* constant shift costs. */ 1871 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1872 COSTS_N_INSNS (3), /* HI. */ 1873 COSTS_N_INSNS (3), /* SI. */ 1874 COSTS_N_INSNS (3), /* DI. */ 1875 COSTS_N_INSNS (3)}, /* other. */ 1876 0, /* cost of multiply per each bit 1877 set. */ 1878 {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */ 1879 COSTS_N_INSNS (13), /* HI. */ 1880 COSTS_N_INSNS (13), /* SI. */ 1881 COSTS_N_INSNS (18), /* DI. */ 1882 COSTS_N_INSNS (18)}, /* other. */ 1883 COSTS_N_INSNS (1), /* cost of movsx. */ 1884 COSTS_N_INSNS (1), /* cost of movzx. */ 1885 8, /* "large" insn. */ 1886 9, /* MOVE_RATIO. */ 1887 6, /* CLEAR_RATIO */ 1888 {6, 6, 6}, /* cost of loading integer registers 1889 in QImode, HImode and SImode. 1890 Relative to reg-reg move (2). 
*/ 1891 {8, 8, 8}, /* cost of storing integer 1892 registers. */ 1893 {6, 6, 10, 10, 12}, /* cost of loading SSE registers 1894 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1895 {8, 8, 8, 12, 12}, /* cost of storing SSE register 1896 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1897 {6, 6, 6, 6, 6}, /* cost of unaligned loads. */ 1898 {8, 8, 8, 8, 8}, /* cost of unaligned stores. */ 1899 2, 2, 2, /* cost of moving XMM,YMM,ZMM 1900 register. */ 1901 6, /* cost of moving SSE register to integer. */ 1902 /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops, 1903 throughput 5. Approx 7 uops do not depend on vector size and every load 1904 is 5 uops. */ 1905 14, 10, /* Gather load static, per_elt. */ 1906 14, 20, /* Gather store static, per_elt. */ 1907 32, /* size of l1 cache. */ 1908 1024, /* size of l2 cache. */ 1909 64, /* size of prefetch block. */ 1910 /* New AMD processors never drop prefetches; if they cannot be performed 1911 immediately, they are queued. We set number of simultaneous prefetches 1912 to a large constant to reflect this (it probably is not a good idea not 1913 to limit number of prefetches at all, as their execution also takes some 1914 time). */ 1915 100, /* number of parallel prefetches. */ 1916 3, /* Branch cost. */ 1917 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */ 1918 COSTS_N_INSNS (7), /* cost of FMUL instruction. */ 1919 /* Latency of fdiv is 8-15. */ 1920 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1921 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1922 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1923 /* NOTE(review): fsqrt latency was quoted here as 4-10, which does not match the cost of 25 below; confirm which is current. */ 1924 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */ 1925 1926 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1927 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1928 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1929 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 1930 COSTS_N_INSNS (4), /* cost of FMA SS instruction.
*/ 1931 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 1932 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1933 /* 9-13. */ 1934 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1935 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1936 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ 1937 /* Zen can execute 4 integer operations per cycle. FP operations 1938 take 3 cycles and it can execute 2 integer additions and 2 1939 multiplications thus reassociation may make sense up to width of 6. 1940 SPEC2k6 benchmarks suggest 1941 that 4 works better than 6 probably due to register pressure. 1942 1943 Integer vector operations are taken by FP unit and execute 3 vector 1944 plus/minus operations per cycle but only one multiply. This is adjusted 1945 in ix86_reassociation_width. */ 1946 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1947 /* znver4_cost reuses the znver2 stringop tables. */ znver2_memcpy, 1948 znver2_memset, 1949 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1950 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1951 "16", /* Loop alignment. */ 1952 "16", /* Jump alignment. */ 1953 "0:0:8", /* Label alignment. */ 1954 "16", /* Func alignment. */ 1955 }; 1956 1957 /* skylake_cost should produce code tuned for Skylake family of CPUs. */ 1958 static stringop_algs skylake_memcpy[2] = { 1959 {libcall, 1960 {{256, rep_prefix_1_byte, true}, 1961 {256, loop, false}, 1962 {-1, libcall, false}}}, 1963 {libcall, 1964 {{256, rep_prefix_1_byte, true}, 1965 {256, loop, false}, 1966 {-1, libcall, false}}}}; 1967 /* skylake_memset uses the same thresholds as skylake_memcpy; both array entries are identical. */ 1968 static stringop_algs skylake_memset[2] = { 1969 {libcall, 1970 {{256, rep_prefix_1_byte, true}, 1971 {256, loop, false}, 1972 {-1, libcall, false}}}, 1973 {libcall, 1974 {{256, rep_prefix_1_byte, true}, 1975 {256, loop, false}, 1976 {-1, libcall, false}}}}; 1977 1978 static const 1979 struct processor_costs skylake_cost = { 1980 { 1981 /* Start of register allocator costs. integer->integer move cost is 2.
*/ 1982 6, /* cost for loading QImode using movzbl */ 1983 {4, 4, 4}, /* cost of loading integer registers 1984 in QImode, HImode and SImode. 1985 Relative to reg-reg move (2). */ 1986 {6, 6, 6}, /* cost of storing integer registers */ 1987 2, /* cost of reg,reg fld/fst */ 1988 {6, 6, 8}, /* cost of loading fp registers 1989 in SFmode, DFmode and XFmode */ 1990 {6, 6, 10}, /* cost of storing fp registers 1991 in SFmode, DFmode and XFmode */ 1992 2, /* cost of moving MMX register */ 1993 {6, 6}, /* cost of loading MMX registers 1994 in SImode and DImode */ 1995 {6, 6}, /* cost of storing MMX registers 1996 in SImode and DImode */ 1997 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 1998 {6, 6, 6, 10, 20}, /* cost of loading SSE registers 1999 in 32,64,128,256 and 512-bit */ 2000 {8, 8, 8, 12, 24}, /* cost of storing SSE registers 2001 in 32,64,128,256 and 512-bit */ 2002 6, 6, /* SSE->integer and integer->SSE moves */ 2003 5, 5, /* mask->integer and integer->mask moves */ 2004 {8, 8, 8}, /* cost of loading mask register 2005 in QImode, HImode, SImode. */ 2006 {6, 6, 6}, /* cost if storing mask register 2007 in QImode, HImode, SImode. */ 2008 3, /* cost of moving mask register. */ 2009 /* End of register allocator costs. */ 2010 }, 2011 2012 COSTS_N_INSNS (1), /* cost of an add instruction */ 2013 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ 2014 COSTS_N_INSNS (1), /* variable shift costs */ 2015 COSTS_N_INSNS (1), /* constant shift costs */ 2016 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2017 COSTS_N_INSNS (4), /* HI */ 2018 COSTS_N_INSNS (3), /* SI */ 2019 COSTS_N_INSNS (3), /* DI */ 2020 COSTS_N_INSNS (3)}, /* other */ 2021 0, /* cost of multiply per each bit set */ 2022 /* Expanding div/mod currently doesn't consider parallelism. So the cost 2023 model is not realistic. We compensate by increasing the latencies a bit. 
*/ 2024 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 2025 COSTS_N_INSNS (11), /* HI */ 2026 COSTS_N_INSNS (14), /* SI */ 2027 COSTS_N_INSNS (76), /* DI */ 2028 COSTS_N_INSNS (76)}, /* other */ 2029 COSTS_N_INSNS (1), /* cost of movsx */ 2030 COSTS_N_INSNS (0), /* cost of movzx */ 2031 8, /* "large" insn */ 2032 17, /* MOVE_RATIO */ 2033 17, /* CLEAR_RATIO */ 2034 {4, 4, 4}, /* cost of loading integer registers 2035 in QImode, HImode and SImode. 2036 Relative to reg-reg move (2). */ 2037 {6, 6, 6}, /* cost of storing integer registers */ 2038 {6, 6, 6, 10, 20}, /* cost of loading SSE register 2039 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2040 {8, 8, 8, 8, 16}, /* cost of storing SSE register 2041 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2042 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 2043 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2044 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2045 6, /* cost of moving SSE register to integer. */ 2046 20, 8, /* Gather load static, per_elt. */ 2047 22, 10, /* Gather store static, per_elt. */ 2048 64, /* size of l1 cache. */ 2049 512, /* size of l2 cache. */ 2050 64, /* size of prefetch block */ 2051 6, /* number of parallel prefetches */ 2052 3, /* Branch cost */ 2053 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2054 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 2055 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2056 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2057 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2058 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ 2059 2060 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2061 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 2062 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2063 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 2064 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 2065 COSTS_N_INSNS (4), /* cost of FMA SD instruction. 
*/ 2066 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ 2067 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ 2068 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ 2069 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 2070 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2071 skylake_memcpy, 2072 skylake_memset, 2073 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2074 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2075 "16:11:8", /* Loop alignment. */ 2076 "16:11:8", /* Jump alignment. */ 2077 "0:0:8", /* Label alignment. */ 2078 "16", /* Func alignment. */ 2079 }; 2080 2081 /* icelake_cost should produce code tuned for Icelake family of CPUs. 2082 NB: rep_prefix_1_byte is used only for known size. */ 2083 2084 static stringop_algs icelake_memcpy[2] = { 2085 {libcall, 2086 {{256, rep_prefix_1_byte, true}, 2087 {256, loop, false}, 2088 {-1, libcall, false}}}, 2089 {libcall, 2090 {{256, rep_prefix_1_byte, true}, 2091 {256, loop, false}, 2092 {-1, libcall, false}}}}; 2093 2094 static stringop_algs icelake_memset[2] = { 2095 {libcall, 2096 {{256, rep_prefix_1_byte, true}, 2097 {256, loop, false}, 2098 {-1, libcall, false}}}, 2099 {libcall, 2100 {{256, rep_prefix_1_byte, true}, 2101 {256, loop, false}, 2102 {-1, libcall, false}}}}; 2103 2104 static const 2105 struct processor_costs icelake_cost = { 2106 { 2107 /* Start of register allocator costs. integer->integer move cost is 2. */ 2108 6, /* cost for loading QImode using movzbl */ 2109 {4, 4, 4}, /* cost of loading integer registers 2110 in QImode, HImode and SImode. 2111 Relative to reg-reg move (2). 
*/ 2112 {6, 6, 6}, /* cost of storing integer registers */ 2113 2, /* cost of reg,reg fld/fst */ 2114 {6, 6, 8}, /* cost of loading fp registers 2115 in SFmode, DFmode and XFmode */ 2116 {6, 6, 10}, /* cost of storing fp registers 2117 in SFmode, DFmode and XFmode */ 2118 2, /* cost of moving MMX register */ 2119 {6, 6}, /* cost of loading MMX registers 2120 in SImode and DImode */ 2121 {6, 6}, /* cost of storing MMX registers 2122 in SImode and DImode */ 2123 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2124 {6, 6, 6, 10, 20}, /* cost of loading SSE registers 2125 in 32,64,128,256 and 512-bit */ 2126 {8, 8, 8, 12, 24}, /* cost of storing SSE registers 2127 in 32,64,128,256 and 512-bit */ 2128 6, 6, /* SSE->integer and integer->SSE moves */ 2129 5, 5, /* mask->integer and integer->mask moves */ 2130 {8, 8, 8}, /* cost of loading mask register 2131 in QImode, HImode, SImode. */ 2132 {6, 6, 6}, /* cost if storing mask register 2133 in QImode, HImode, SImode. */ 2134 3, /* cost of moving mask register. */ 2135 /* End of register allocator costs. */ 2136 }, 2137 2138 COSTS_N_INSNS (1), /* cost of an add instruction */ 2139 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ 2140 COSTS_N_INSNS (1), /* variable shift costs */ 2141 COSTS_N_INSNS (1), /* constant shift costs */ 2142 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2143 COSTS_N_INSNS (4), /* HI */ 2144 COSTS_N_INSNS (3), /* SI */ 2145 COSTS_N_INSNS (3), /* DI */ 2146 COSTS_N_INSNS (3)}, /* other */ 2147 0, /* cost of multiply per each bit set */ 2148 /* Expanding div/mod currently doesn't consider parallelism. So the cost 2149 model is not realistic. We compensate by increasing the latencies a bit. 
*/ 2150 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 2151 COSTS_N_INSNS (11), /* HI */ 2152 COSTS_N_INSNS (14), /* SI */ 2153 COSTS_N_INSNS (76), /* DI */ 2154 COSTS_N_INSNS (76)}, /* other */ 2155 COSTS_N_INSNS (1), /* cost of movsx */ 2156 COSTS_N_INSNS (0), /* cost of movzx */ 2157 8, /* "large" insn */ 2158 17, /* MOVE_RATIO */ 2159 17, /* CLEAR_RATIO */ 2160 {4, 4, 4}, /* cost of loading integer registers 2161 in QImode, HImode and SImode. 2162 Relative to reg-reg move (2). */ 2163 {6, 6, 6}, /* cost of storing integer registers */ 2164 {6, 6, 6, 10, 20}, /* cost of loading SSE register 2165 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2166 {8, 8, 8, 8, 16}, /* cost of storing SSE register 2167 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2168 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 2169 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2170 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2171 6, /* cost of moving SSE register to integer. */ 2172 20, 8, /* Gather load static, per_elt. */ 2173 22, 10, /* Gather store static, per_elt. */ 2174 64, /* size of l1 cache. */ 2175 512, /* size of l2 cache. */ 2176 64, /* size of prefetch block */ 2177 6, /* number of parallel prefetches */ 2178 3, /* Branch cost */ 2179 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2180 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 2181 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2182 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2183 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2184 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ 2185 2186 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2187 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 2188 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2189 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 2190 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 2191 COSTS_N_INSNS (4), /* cost of FMA SD instruction. 
*/ 2192 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ 2193 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ 2194 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ 2195 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 2196 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2197 icelake_memcpy, 2198 icelake_memset, 2199 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2200 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2201 "16:11:8", /* Loop alignment. */ 2202 "16:11:8", /* Jump alignment. */ 2203 "0:0:8", /* Label alignment. */ 2204 "16", /* Func alignment. */ 2205 }; 2206 2207 /* alderlake_cost should produce code tuned for alderlake family of CPUs. */ 2208 static stringop_algs alderlake_memcpy[2] = { 2209 {libcall, 2210 {{256, rep_prefix_1_byte, true}, 2211 {256, loop, false}, 2212 {-1, libcall, false}}}, 2213 {libcall, 2214 {{256, rep_prefix_1_byte, true}, 2215 {256, loop, false}, 2216 {-1, libcall, false}}}}; 2217 static stringop_algs alderlake_memset[2] = { 2218 {libcall, 2219 {{256, rep_prefix_1_byte, true}, 2220 {256, loop, false}, 2221 {-1, libcall, false}}}, 2222 {libcall, 2223 {{256, rep_prefix_1_byte, true}, 2224 {256, loop, false}, 2225 {-1, libcall, false}}}}; 2226 static const 2227 struct processor_costs alderlake_cost = { 2228 { 2229 /* Start of register allocator costs. integer->integer move cost is 2. */ 2230 6, /* cost for loading QImode using movzbl */ 2231 {6, 6, 6}, /* cost of loading integer registers 2232 in QImode, HImode and SImode. 2233 Relative to reg-reg move (2). 
*/ 2234 {6, 6, 6}, /* cost of storing integer registers */ 2235 4, /* cost of reg,reg fld/fst */ 2236 {6, 6, 12}, /* cost of loading fp registers 2237 in SFmode, DFmode and XFmode */ 2238 {6, 6, 12}, /* cost of storing fp registers 2239 in SFmode, DFmode and XFmode */ 2240 2, /* cost of moving MMX register */ 2241 {6, 6}, /* cost of loading MMX registers 2242 in SImode and DImode */ 2243 {6, 6}, /* cost of storing MMX registers 2244 in SImode and DImode */ 2245 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 2246 {6, 6, 6, 10, 15}, /* cost of loading SSE registers 2247 in 32,64,128,256 and 512-bit */ 2248 {6, 6, 6, 10, 15}, /* cost of storing SSE registers 2249 in 32,64,128,256 and 512-bit */ 2250 6, 6, /* SSE->integer and integer->SSE moves */ 2251 6, 6, /* mask->integer and integer->mask moves */ 2252 {6, 6, 6}, /* cost of loading mask register 2253 in QImode, HImode, SImode. */ 2254 {6, 6, 6}, /* cost if storing mask register 2255 in QImode, HImode, SImode. */ 2256 2, /* cost of moving mask register. */ 2257 /* End of register allocator costs. 
*/ 2258 }, 2259 2260 COSTS_N_INSNS (1), /* cost of an add instruction */ 2261 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; the + 1 makes it marginally more expensive than an add */ 2262 COSTS_N_INSNS (1), /* variable shift costs */ 2263 COSTS_N_INSNS (1), /* constant shift costs */ 2264 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2265 COSTS_N_INSNS (4), /* HI */ 2266 COSTS_N_INSNS (3), /* SI */ 2267 COSTS_N_INSNS (4), /* DI */ 2268 COSTS_N_INSNS (4)}, /* other */ 2269 0, /* cost of multiply per each bit set */ 2270 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ 2271 COSTS_N_INSNS (22), /* HI */ 2272 COSTS_N_INSNS (30), /* SI */ 2273 COSTS_N_INSNS (74), /* DI */ 2274 COSTS_N_INSNS (74)}, /* other */ 2275 COSTS_N_INSNS (1), /* cost of movsx */ 2276 COSTS_N_INSNS (1), /* cost of movzx */ 2277 8, /* "large" insn */ 2278 17, /* MOVE_RATIO */ 2279 17, /* CLEAR_RATIO */ 2280 {6, 6, 6}, /* cost of loading integer registers 2281 in QImode, HImode and SImode. 2282 Relative to reg-reg move (2). */ 2283 {6, 6, 6}, /* cost of storing integer registers */ 2284 {6, 6, 6, 10, 15}, /* cost of loading SSE register 2285 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2286 {6, 6, 6, 10, 15}, /* cost of storing SSE register 2287 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2288 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ 2289 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ 2290 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 2291 6, /* cost of moving SSE register to integer. */ 2292 18, 6, /* Gather load static, per_elt. */ 2293 18, 6, /* Gather store static, per_elt. */ 2294 32, /* size of l1 cache. */ 2295 512, /* size of l2 cache. */ 2296 64, /* size of prefetch block */ 2297 6, /* number of parallel prefetches */ 2298 3, /* Branch cost */ 2299 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2300 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2301 COSTS_N_INSNS (17), /* cost of FDIV instruction. */ 2302 COSTS_N_INSNS (1), /* cost of FABS instruction.
*/ 2303 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2304 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ 2305 2306 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2307 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2308 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2309 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2310 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2311 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2312 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2313 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 2314 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 2315 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 2316 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ 2317 alderlake_memcpy, 2318 alderlake_memset, 2319 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 2320 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 2321 "16:11:8", /* Loop alignment. */ 2322 "16:11:8", /* Jump alignment. */ 2323 "0:0:8", /* Label alignment. */ 2324 "16", /* Func alignment. */ 2325 }; 2326 2327 /* BTVER1 has optimized REP instruction for medium sized blocks, but for 2328 very small blocks it is better to use loop. For large blocks, libcall can 2329 do non-temporal accesses and beat inline considerably. */ 2330 static stringop_algs btver1_memcpy[2] = { 2331 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 2332 {-1, rep_prefix_4_byte, false}}}, 2333 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 2334 {-1, libcall, false}}}}; 2335 static stringop_algs btver1_memset[2] = { 2336 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 2337 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2338 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 2339 {-1, libcall, false}}}}; 2340 /* Cost table for btver1. The initializer is positional; each entry is labelled by the comment that follows it. */ const struct processor_costs btver1_cost = { 2341 { 2342 /* Start of register allocator costs. integer->integer move cost is 2. 
*/ 2343 8, /* cost for loading QImode using movzbl */ 2344 {6, 8, 6}, /* cost of loading integer registers 2345 in QImode, HImode and SImode. 2346 Relative to reg-reg move (2). */ 2347 {6, 8, 6}, /* cost of storing integer registers */ 2348 4, /* cost of reg,reg fld/fst */ 2349 {12, 12, 28}, /* cost of loading fp registers 2350 in SFmode, DFmode and XFmode */ 2351 {12, 12, 38}, /* cost of storing fp registers 2352 in SFmode, DFmode and XFmode */ 2353 4, /* cost of moving MMX register */ 2354 {10, 10}, /* cost of loading MMX registers 2355 in SImode and DImode */ 2356 {12, 12}, /* cost of storing MMX registers 2357 in SImode and DImode */ 2358 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2359 {10, 10, 12, 48, 96}, /* cost of loading SSE registers 2360 in 32,64,128,256 and 512-bit */ 2361 {10, 10, 12, 48, 96}, /* cost of storing SSE registers 2362 in 32,64,128,256 and 512-bit */ 2363 14, 14, /* SSE->integer and integer->SSE moves */ 2364 14, 14, /* mask->integer and integer->mask moves */ 2365 {6, 8, 6}, /* cost of loading mask register 2366 in QImode, HImode, SImode. */ 2367 {6, 8, 6}, /* cost of storing mask register 2368 in QImode, HImode, SImode. */ 2369 2, /* cost of moving mask register. */ 2370 /* End of register allocator costs. 
*/ 2371 }, 2372 2373 COSTS_N_INSNS (1), /* cost of an add instruction */ 2374 COSTS_N_INSNS (2), /* cost of a lea instruction */ 2375 COSTS_N_INSNS (1), /* variable shift costs */ 2376 COSTS_N_INSNS (1), /* constant shift costs */ 2377 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2378 COSTS_N_INSNS (4), /* HI */ 2379 COSTS_N_INSNS (3), /* SI */ 2380 COSTS_N_INSNS (4), /* DI */ 2381 COSTS_N_INSNS (5)}, /* other */ 2382 0, /* cost of multiply per each bit set */ 2383 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 2384 COSTS_N_INSNS (35), /* HI */ 2385 COSTS_N_INSNS (51), /* SI */ 2386 COSTS_N_INSNS (83), /* DI */ 2387 COSTS_N_INSNS (83)}, /* other */ 2388 COSTS_N_INSNS (1), /* cost of movsx */ 2389 COSTS_N_INSNS (1), /* cost of movzx */ 2390 8, /* "large" insn */ 2391 9, /* MOVE_RATIO */ 2392 6, /* CLEAR_RATIO */ 2393 {6, 8, 6}, /* cost of loading integer registers 2394 in QImode, HImode and SImode. 2395 Relative to reg-reg move (2). */ 2396 {6, 8, 6}, /* cost of storing integer registers */ 2397 {10, 10, 12, 48, 96}, /* cost of loading SSE register 2398 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2399 {10, 10, 12, 48, 96}, /* cost of storing SSE register 2400 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2401 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ 2402 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2403 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2404 14, /* cost of moving SSE register to integer. */ 2405 10, 10, /* Gather load static, per_elt. */ 2406 10, 10, /* Gather store static, per_elt. */ 2407 32, /* size of l1 cache. */ 2408 512, /* size of l2 cache. */ 2409 64, /* size of prefetch block */ 2410 100, /* number of parallel prefetches */ 2411 2, /* Branch cost */ 2412 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 2413 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 2414 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 2415 COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ 2416 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 2417 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 2418 2419 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2420 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2421 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 2422 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 2423 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2424 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2425 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2426 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 2427 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 2428 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ 2429 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2430 btver1_memcpy, 2431 btver1_memset, 2432 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 2433 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2434 "16:11:8", /* Loop alignment. */ 2435 "16:8:8", /* Jump alignment. */ 2436 "0:0:8", /* Label alignment. */ 2437 "11", /* Func alignment. NOTE(review): "11" is not a power of two, unlike the "16" used by other tables in this file; confirm it is intended. */ 2438 }; 2439 2440 /* String-operation algorithm tables (memcpy, memset) for btver2. */ static stringop_algs btver2_memcpy[2] = { 2441 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 2442 {-1, rep_prefix_4_byte, false}}}, 2443 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 2444 {-1, libcall, false}}}}; 2445 static stringop_algs btver2_memset[2] = { 2446 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 2447 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2448 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 2449 {-1, libcall, false}}}}; 2450 /* Cost table for btver2. The initializer is positional; each entry is labelled by the comment that follows it. */ const struct processor_costs btver2_cost = { 2451 { 2452 /* Start of register allocator costs. integer->integer move cost is 2. */ 2453 8, /* cost for loading QImode using movzbl */ 2454 {8, 8, 6}, /* cost of loading integer registers 2455 in QImode, HImode and SImode. 2456 Relative to reg-reg move (2). 
*/ 2457 {8, 8, 6}, /* cost of storing integer registers */ 2458 4, /* cost of reg,reg fld/fst */ 2459 {12, 12, 28}, /* cost of loading fp registers 2460 in SFmode, DFmode and XFmode */ 2461 {12, 12, 38}, /* cost of storing fp registers 2462 in SFmode, DFmode and XFmode */ 2463 4, /* cost of moving MMX register */ 2464 {10, 10}, /* cost of loading MMX registers 2465 in SImode and DImode */ 2466 {12, 12}, /* cost of storing MMX registers 2467 in SImode and DImode */ 2468 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2469 {10, 10, 12, 48, 96}, /* cost of loading SSE registers 2470 in 32,64,128,256 and 512-bit */ 2471 {10, 10, 12, 48, 96}, /* cost of storing SSE registers 2472 in 32,64,128,256 and 512-bit */ 2473 14, 14, /* SSE->integer and integer->SSE moves */ 2474 14, 14, /* mask->integer and integer->mask moves */ 2475 {8, 8, 6}, /* cost of loading mask register 2476 in QImode, HImode, SImode. */ 2477 {8, 8, 6}, /* cost of storing mask register 2478 in QImode, HImode, SImode. */ 2479 2, /* cost of moving mask register. */ 2480 /* End of register allocator costs. 
*/ 2481 }, 2482 2483 COSTS_N_INSNS (1), /* cost of an add instruction */ 2484 COSTS_N_INSNS (2), /* cost of a lea instruction */ 2485 COSTS_N_INSNS (1), /* variable shift costs */ 2486 COSTS_N_INSNS (1), /* constant shift costs */ 2487 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2488 COSTS_N_INSNS (4), /* HI */ 2489 COSTS_N_INSNS (3), /* SI */ 2490 COSTS_N_INSNS (4), /* DI */ 2491 COSTS_N_INSNS (5)}, /* other */ 2492 0, /* cost of multiply per each bit set */ 2493 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 2494 COSTS_N_INSNS (35), /* HI */ 2495 COSTS_N_INSNS (51), /* SI */ 2496 COSTS_N_INSNS (83), /* DI */ 2497 COSTS_N_INSNS (83)}, /* other */ 2498 COSTS_N_INSNS (1), /* cost of movsx */ 2499 COSTS_N_INSNS (1), /* cost of movzx */ 2500 8, /* "large" insn */ 2501 9, /* MOVE_RATIO */ 2502 6, /* CLEAR_RATIO */ 2503 {8, 8, 6}, /* cost of loading integer registers 2504 in QImode, HImode and SImode. 2505 Relative to reg-reg move (2). */ 2506 {8, 8, 6}, /* cost of storing integer registers */ 2507 {10, 10, 12, 48, 96}, /* cost of loading SSE register 2508 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2509 {10, 10, 12, 48, 96}, /* cost of storing SSE register 2510 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2511 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ 2512 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2513 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2514 14, /* cost of moving SSE register to integer. */ 2515 10, 10, /* Gather load static, per_elt. */ 2516 10, 10, /* Gather store static, per_elt. */ 2517 32, /* size of l1 cache. */ 2518 2048, /* size of l2 cache. */ 2519 64, /* size of prefetch block */ 2520 100, /* number of parallel prefetches */ 2521 2, /* Branch cost */ 2522 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 2523 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 2524 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 2525 COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ 2526 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 2527 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 2528 2529 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2530 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2531 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 2532 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 2533 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2534 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2535 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2536 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ 2537 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ 2538 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ 2539 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2540 btver2_memcpy, 2541 btver2_memset, 2542 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 2543 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2544 "16:11:8", /* Loop alignment. */ 2545 "16:8:8", /* Jump alignment. */ 2546 "0:0:8", /* Label alignment. */ 2547 "11", /* Func alignment. NOTE(review): "11" is not a power of two, unlike the "16" used by other tables in this file; confirm it is intended. */ 2548 }; 2549 2550 /* String-operation algorithm tables (memcpy, memset) for pentium4. The second entry of each table is DUMMY_STRINGOP_ALGS, i.e. a plain libcall. */ static stringop_algs pentium4_memcpy[2] = { 2551 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 2552 DUMMY_STRINGOP_ALGS}; 2553 static stringop_algs pentium4_memset[2] = { 2554 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 2555 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2556 DUMMY_STRINGOP_ALGS}; 2557 2558 /* Cost table for pentium4. The initializer is positional; each entry is labelled by the comment that follows it. */ static const 2559 struct processor_costs pentium4_cost = { 2560 { 2561 /* Start of register allocator costs. integer->integer move cost is 2. */ 2562 5, /* cost for loading QImode using movzbl */ 2563 {4, 5, 4}, /* cost of loading integer registers 2564 in QImode, HImode and SImode. 2565 Relative to reg-reg move (2). 
*/ 2566 {2, 3, 2}, /* cost of storing integer registers */ 2567 12, /* cost of reg,reg fld/fst */ 2568 {14, 14, 14}, /* cost of loading fp registers 2569 in SFmode, DFmode and XFmode */ 2570 {14, 14, 14}, /* cost of storing fp registers 2571 in SFmode, DFmode and XFmode */ 2572 12, /* cost of moving MMX register */ 2573 {16, 16}, /* cost of loading MMX registers 2574 in SImode and DImode */ 2575 {16, 16}, /* cost of storing MMX registers 2576 in SImode and DImode */ 2577 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 2578 {16, 16, 16, 32, 64}, /* cost of loading SSE registers 2579 in 32,64,128,256 and 512-bit */ 2580 {16, 16, 16, 32, 64}, /* cost of storing SSE registers 2581 in 32,64,128,256 and 512-bit */ 2582 20, 12, /* SSE->integer and integer->SSE moves */ 2583 20, 12, /* mask->integer and integer->mask moves */ 2584 {4, 5, 4}, /* cost of loading mask register 2585 in QImode, HImode, SImode. */ 2586 {2, 3, 2}, /* cost of storing mask register 2587 in QImode, HImode, SImode. */ 2588 2, /* cost of moving mask register. */ 2589 /* End of register allocator costs. 
*/ 2590 }, 2591 2592 COSTS_N_INSNS (1), /* cost of an add instruction */ 2593 COSTS_N_INSNS (3), /* cost of a lea instruction */ 2594 COSTS_N_INSNS (4), /* variable shift costs */ 2595 COSTS_N_INSNS (4), /* constant shift costs */ 2596 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ 2597 COSTS_N_INSNS (15), /* HI */ 2598 COSTS_N_INSNS (15), /* SI */ 2599 COSTS_N_INSNS (15), /* DI */ 2600 COSTS_N_INSNS (15)}, /* other */ 2601 0, /* cost of multiply per each bit set */ 2602 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ 2603 COSTS_N_INSNS (56), /* HI */ 2604 COSTS_N_INSNS (56), /* SI */ 2605 COSTS_N_INSNS (56), /* DI */ 2606 COSTS_N_INSNS (56)}, /* other */ 2607 COSTS_N_INSNS (1), /* cost of movsx */ 2608 COSTS_N_INSNS (1), /* cost of movzx */ 2609 16, /* "large" insn */ 2610 6, /* MOVE_RATIO */ 2611 6, /* CLEAR_RATIO */ 2612 {4, 5, 4}, /* cost of loading integer registers 2613 in QImode, HImode and SImode. 2614 Relative to reg-reg move (2). */ 2615 {2, 3, 2}, /* cost of storing integer registers */ 2616 {16, 16, 16, 32, 64}, /* cost of loading SSE register 2617 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2618 {16, 16, 16, 32, 64}, /* cost of storing SSE register 2619 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2620 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ 2621 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 2622 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 2623 20, /* cost of moving SSE register to integer. */ 2624 16, 16, /* Gather load static, per_elt. */ 2625 16, 16, /* Gather store static, per_elt. */ 2626 8, /* size of l1 cache. */ 2627 256, /* size of l2 cache. */ 2628 64, /* size of prefetch block */ 2629 6, /* number of parallel prefetches */ 2630 2, /* Branch cost */ 2631 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 2632 COSTS_N_INSNS (7), /* cost of FMUL instruction. */ 2633 COSTS_N_INSNS (43), /* cost of FDIV instruction. */ 2634 COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ 2635 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 2636 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ 2637 2638 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 2639 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 2640 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 2641 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 2642 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2643 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2644 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ 2645 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ 2646 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ 2647 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ 2648 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2649 pentium4_memcpy, 2650 pentium4_memset, 2651 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2652 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2653 NULL, /* Loop alignment. */ 2654 NULL, /* Jump alignment. */ 2655 NULL, /* Label alignment. */ 2656 NULL, /* Func alignment. */ 2657 }; 2658 2659 /* String-operation algorithm tables (memcpy, memset) for nocona. */ static stringop_algs nocona_memcpy[2] = { 2660 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 2661 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, 2662 {100000, unrolled_loop, false}, {-1, libcall, false}}}}; 2663 2664 static stringop_algs nocona_memset[2] = { 2665 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 2666 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2667 {libcall, {{24, loop, false}, {64, unrolled_loop, false}, 2668 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2669 2670 /* Cost table for nocona. The initializer is positional; each entry is labelled by the comment that follows it. */ static const 2671 struct processor_costs nocona_cost = { 2672 { 2673 /* Start of register allocator costs. integer->integer move cost is 2. */ 2674 4, /* cost for loading QImode using movzbl */ 2675 {4, 4, 4}, /* cost of loading integer registers 2676 in QImode, HImode and SImode. 2677 Relative to reg-reg move (2). 
*/ 2678 {4, 4, 4}, /* cost of storing integer registers */ 2679 12, /* cost of reg,reg fld/fst */ 2680 {14, 14, 14}, /* cost of loading fp registers 2681 in SFmode, DFmode and XFmode */ 2682 {14, 14, 14}, /* cost of storing fp registers 2683 in SFmode, DFmode and XFmode */ 2684 14, /* cost of moving MMX register */ 2685 {12, 12}, /* cost of loading MMX registers 2686 in SImode and DImode */ 2687 {12, 12}, /* cost of storing MMX registers 2688 in SImode and DImode */ 2689 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 2690 {12, 12, 12, 24, 48}, /* cost of loading SSE registers 2691 in 32,64,128,256 and 512-bit */ 2692 {12, 12, 12, 24, 48}, /* cost of storing SSE registers 2693 in 32,64,128,256 and 512-bit */ 2694 20, 12, /* SSE->integer and integer->SSE moves */ 2695 20, 12, /* mask->integer and integer->mask moves */ 2696 {4, 4, 4}, /* cost of loading mask register 2697 in QImode, HImode, SImode. */ 2698 {4, 4, 4}, /* cost of storing mask register 2699 in QImode, HImode, SImode. */ 2700 2, /* cost of moving mask register. */ 2701 /* End of register allocator costs. 
*/ 2702 }, 2703 2704 COSTS_N_INSNS (1), /* cost of an add instruction */ 2705 COSTS_N_INSNS (1), /* cost of a lea instruction */ 2706 COSTS_N_INSNS (1), /* variable shift costs */ 2707 COSTS_N_INSNS (1), /* constant shift costs */ 2708 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ 2709 COSTS_N_INSNS (10), /* HI */ 2710 COSTS_N_INSNS (10), /* SI */ 2711 COSTS_N_INSNS (10), /* DI */ 2712 COSTS_N_INSNS (10)}, /* other */ 2713 0, /* cost of multiply per each bit set */ 2714 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ 2715 COSTS_N_INSNS (66), /* HI */ 2716 COSTS_N_INSNS (66), /* SI */ 2717 COSTS_N_INSNS (66), /* DI */ 2718 COSTS_N_INSNS (66)}, /* other */ 2719 COSTS_N_INSNS (1), /* cost of movsx */ 2720 COSTS_N_INSNS (1), /* cost of movzx */ 2721 16, /* "large" insn */ 2722 17, /* MOVE_RATIO */ 2723 6, /* CLEAR_RATIO */ 2724 {4, 4, 4}, /* cost of loading integer registers 2725 in QImode, HImode and SImode. 2726 Relative to reg-reg move (2). */ 2727 {4, 4, 4}, /* cost of storing integer registers */ 2728 {12, 12, 12, 24, 48}, /* cost of loading SSE register 2729 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2730 {12, 12, 12, 24, 48}, /* cost of storing SSE register 2731 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2732 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ 2733 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 2734 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 2735 20, /* cost of moving SSE register to integer. */ 2736 12, 12, /* Gather load static, per_elt. */ 2737 12, 12, /* Gather store static, per_elt. */ 2738 8, /* size of l1 cache. */ 2739 1024, /* size of l2 cache. */ 2740 64, /* size of prefetch block */ 2741 8, /* number of parallel prefetches */ 2742 1, /* Branch cost */ 2743 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 2744 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2745 COSTS_N_INSNS (40), /* cost of FDIV instruction. */ 2746 COSTS_N_INSNS (3), /* cost of FABS instruction. 
*/ 2747 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 2748 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ 2749 2750 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 2751 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 2752 COSTS_N_INSNS (7), /* cost of MULSS instruction. */ 2753 COSTS_N_INSNS (7), /* cost of MULSD instruction. */ 2754 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 2755 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 2756 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ 2757 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ 2758 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ 2759 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ 2760 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2761 nocona_memcpy, 2762 nocona_memset, 2763 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2764 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2765 NULL, /* Loop alignment. */ 2766 NULL, /* Jump alignment. */ 2767 NULL, /* Label alignment. */ 2768 NULL, /* Func alignment. */ 2769 }; 2770 2771 /* String-operation algorithm tables (memcpy, memset) for atom. */ static stringop_algs atom_memcpy[2] = { 2772 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2773 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2774 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2775 static stringop_algs atom_memset[2] = { 2776 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2777 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2778 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2779 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2780 /* Cost table for atom. The initializer is positional; each entry is labelled by the comment that follows it. */ static const 2781 struct processor_costs atom_cost = { 2782 { 2783 /* Start of register allocator costs. integer->integer move cost is 2. */ 2784 6, /* cost for loading QImode using movzbl */ 2785 {6, 6, 6}, /* cost of loading integer registers 2786 in QImode, HImode and SImode. 2787 Relative to reg-reg move (2). 
*/ 2788 {6, 6, 6}, /* cost of storing integer registers */ 2789 4, /* cost of reg,reg fld/fst */ 2790 {6, 6, 18}, /* cost of loading fp registers 2791 in SFmode, DFmode and XFmode */ 2792 {14, 14, 24}, /* cost of storing fp registers 2793 in SFmode, DFmode and XFmode */ 2794 2, /* cost of moving MMX register */ 2795 {8, 8}, /* cost of loading MMX registers 2796 in SImode and DImode */ 2797 {10, 10}, /* cost of storing MMX registers 2798 in SImode and DImode */ 2799 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2800 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2801 in 32,64,128,256 and 512-bit */ 2802 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2803 in 32,64,128,256 and 512-bit */ 2804 8, 6, /* SSE->integer and integer->SSE moves */ 2805 8, 6, /* mask->integer and integer->mask moves */ 2806 {6, 6, 6}, /* cost of loading mask register 2807 in QImode, HImode, SImode. */ 2808 {6, 6, 6}, /* cost of storing mask register 2809 in QImode, HImode, SImode. */ 2810 2, /* cost of moving mask register. */ 2811 /* End of register allocator costs. 
*/ 2812 }, 2813 2814 COSTS_N_INSNS (1), /* cost of an add instruction */ 2815 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2816 COSTS_N_INSNS (1), /* variable shift costs */ 2817 COSTS_N_INSNS (1), /* constant shift costs */ 2818 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2819 COSTS_N_INSNS (4), /* HI */ 2820 COSTS_N_INSNS (3), /* SI */ 2821 COSTS_N_INSNS (4), /* DI */ 2822 COSTS_N_INSNS (2)}, /* other */ 2823 0, /* cost of multiply per each bit set */ 2824 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2825 COSTS_N_INSNS (26), /* HI */ 2826 COSTS_N_INSNS (42), /* SI */ 2827 COSTS_N_INSNS (74), /* DI */ 2828 COSTS_N_INSNS (74)}, /* other */ 2829 COSTS_N_INSNS (1), /* cost of movsx */ 2830 COSTS_N_INSNS (1), /* cost of movzx */ 2831 8, /* "large" insn */ 2832 17, /* MOVE_RATIO */ 2833 6, /* CLEAR_RATIO */ 2834 {6, 6, 6}, /* cost of loading integer registers 2835 in QImode, HImode and SImode. 2836 Relative to reg-reg move (2). */ 2837 {6, 6, 6}, /* cost of storing integer registers */ 2838 {8, 8, 8, 16, 32}, /* cost of loading SSE register 2839 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2840 {8, 8, 8, 16, 32}, /* cost of storing SSE register 2841 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2842 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2843 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2844 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2845 8, /* cost of moving SSE register to integer. */ 2846 8, 8, /* Gather load static, per_elt. */ 2847 8, 8, /* Gather store static, per_elt. */ 2848 32, /* size of l1 cache. */ 2849 256, /* size of l2 cache. */ 2850 64, /* size of prefetch block */ 2851 6, /* number of parallel prefetches */ 2852 3, /* Branch cost */ 2853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2856 COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 2857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2859 2860 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2861 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 2862 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2863 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2864 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2865 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2866 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 2867 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 2868 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 2869 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 2870 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2871 atom_memcpy, 2872 atom_memset, 2873 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2874 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2875 "16", /* Loop alignment. */ 2876 "16:8:8", /* Jump alignment. */ 2877 "0:0:8", /* Label alignment. */ 2878 "16", /* Func alignment. */ 2879 }; 2880 2881 /* String-operation algorithm tables (memcpy, memset) for slm. */ static stringop_algs slm_memcpy[2] = { 2882 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2883 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2884 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2885 static stringop_algs slm_memset[2] = { 2886 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2887 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2888 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2889 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2890 /* Cost table for slm. The initializer is positional; each entry is labelled by the comment that follows it. */ static const 2891 struct processor_costs slm_cost = { 2892 { 2893 /* Start of register allocator costs. integer->integer move cost is 2. */ 2894 8, /* cost for loading QImode using movzbl */ 2895 {8, 8, 8}, /* cost of loading integer registers 2896 in QImode, HImode and SImode. 2897 Relative to reg-reg move (2). 
*/ 2898 {6, 6, 6}, /* cost of storing integer registers */ 2899 2, /* cost of reg,reg fld/fst */ 2900 {8, 8, 18}, /* cost of loading fp registers 2901 in SFmode, DFmode and XFmode */ 2902 {6, 6, 18}, /* cost of storing fp registers 2903 in SFmode, DFmode and XFmode */ 2904 2, /* cost of moving MMX register */ 2905 {8, 8}, /* cost of loading MMX registers 2906 in SImode and DImode */ 2907 {6, 6}, /* cost of storing MMX registers 2908 in SImode and DImode */ 2909 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2910 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2911 in 32,64,128,256 and 512-bit */ 2912 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2913 in 32,64,128,256 and 512-bit */ 2914 8, 6, /* SSE->integer and integer->SSE moves */ 2915 8, 6, /* mask->integer and integer->mask moves */ 2916 {8, 8, 8}, /* cost of loading mask register 2917 in QImode, HImode, SImode. */ 2918 {6, 6, 6}, /* cost of storing mask register 2919 in QImode, HImode, SImode. */ 2920 2, /* cost of moving mask register. */ 2921 /* End of register allocator costs. 
*/ 2922 }, 2923 2924 COSTS_N_INSNS (1), /* cost of an add instruction */ 2925 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2926 COSTS_N_INSNS (1), /* variable shift costs */ 2927 COSTS_N_INSNS (1), /* constant shift costs */ 2928 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2929 COSTS_N_INSNS (3), /* HI */ 2930 COSTS_N_INSNS (3), /* SI */ 2931 COSTS_N_INSNS (4), /* DI */ 2932 COSTS_N_INSNS (2)}, /* other */ 2933 0, /* cost of multiply per each bit set */ 2934 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2935 COSTS_N_INSNS (26), /* HI */ 2936 COSTS_N_INSNS (42), /* SI */ 2937 COSTS_N_INSNS (74), /* DI */ 2938 COSTS_N_INSNS (74)}, /* other */ 2939 COSTS_N_INSNS (1), /* cost of movsx */ 2940 COSTS_N_INSNS (1), /* cost of movzx */ 2941 8, /* "large" insn */ 2942 17, /* MOVE_RATIO */ 2943 6, /* CLEAR_RATIO */ 2944 {8, 8, 8}, /* cost of loading integer registers 2945 in QImode, HImode and SImode. 2946 Relative to reg-reg move (2). */ 2947 {6, 6, 6}, /* cost of storing integer registers */ 2948 {8, 8, 8, 16, 32}, /* cost of loading SSE register 2949 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2950 {8, 8, 8, 16, 32}, /* cost of storing SSE register 2951 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2952 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2953 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2954 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2955 8, /* cost of moving SSE register to integer. */ 2956 8, 8, /* Gather load static, per_elt. */ 2957 8, 8, /* Gather store static, per_elt. */ 2958 32, /* size of l1 cache. */ 2959 256, /* size of l2 cache. */ 2960 64, /* size of prefetch block */ 2961 6, /* number of parallel prefetches */ 2962 3, /* Branch cost */ 2963 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2964 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2965 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2966 COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 2967 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2968 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2969 2970 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2971 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2972 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2973 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2974 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2975 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2976 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 2977 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */ 2978 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ 2979 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ 2980 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2981 slm_memcpy, 2982 slm_memset, 2983 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2984 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2985 "16", /* Loop alignment. */ 2986 "16:8:8", /* Jump alignment. */ 2987 "0:0:8", /* Label alignment. */ 2988 "16", /* Func alignment. */ 2989 }; 2990 2991 /* String-operation algorithm tables (memcpy, memset) for tremont. NOTE(review): every table lists two consecutive entries with the same 256 bound (rep_prefix_1_byte, then loop); confirm the second entry is intended. */ static stringop_algs tremont_memcpy[2] = { 2992 {libcall, 2993 {{256, rep_prefix_1_byte, true}, 2994 {256, loop, false}, 2995 {-1, libcall, false}}}, 2996 {libcall, 2997 {{256, rep_prefix_1_byte, true}, 2998 {256, loop, false}, 2999 {-1, libcall, false}}}}; 3000 static stringop_algs tremont_memset[2] = { 3001 {libcall, 3002 {{256, rep_prefix_1_byte, true}, 3003 {256, loop, false}, 3004 {-1, libcall, false}}}, 3005 {libcall, 3006 {{256, rep_prefix_1_byte, true}, 3007 {256, loop, false}, 3008 {-1, libcall, false}}}}; 3009 /* Cost table for tremont. The initializer is positional; each entry is labelled by the comment that follows it. */ static const 3010 struct processor_costs tremont_cost = { 3011 { 3012 /* Start of register allocator costs. integer->integer move cost is 2. */ 3013 6, /* cost for loading QImode using movzbl */ 3014 {6, 6, 6}, /* cost of loading integer registers 3015 in QImode, HImode and SImode. 3016 Relative to reg-reg move (2). 
*/ 3017 {6, 6, 6}, /* cost of storing integer registers */ 3018 4, /* cost of reg,reg fld/fst */ 3019 {6, 6, 12}, /* cost of loading fp registers 3020 in SFmode, DFmode and XFmode */ 3021 {6, 6, 12}, /* cost of storing fp registers 3022 in SFmode, DFmode and XFmode */ 3023 2, /* cost of moving MMX register */ 3024 {6, 6}, /* cost of loading MMX registers 3025 in SImode and DImode */ 3026 {6, 6}, /* cost of storing MMX registers 3027 in SImode and DImode */ 3028 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 3029 {6, 6, 6, 10, 15}, /* cost of loading SSE registers 3030 in 32,64,128,256 and 512-bit */ 3031 {6, 6, 6, 10, 15}, /* cost of storing SSE registers 3032 in 32,64,128,256 and 512-bit */ 3033 6, 6, /* SSE->integer and integer->SSE moves */ 3034 6, 6, /* mask->integer and integer->mask moves */ 3035 {6, 6, 6}, /* cost of loading mask register 3036 in QImode, HImode, SImode. */ 3037 {6, 6, 6}, /* cost of storing mask register 3038 in QImode, HImode, SImode. */ 3039 2, /* cost of moving mask register. */ 3040 /* End of register allocator costs. */ 3041 }, 3042 3043 COSTS_N_INSNS (1), /* cost of an add instruction */ 3044 /* Setting cost to 2 makes our current implementation of synth_mult result in 3045 use of unnecessary temporary registers causing regression on several 3046 SPECfp benchmarks. 
*/ 3047 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 3048 COSTS_N_INSNS (1), /* variable shift costs */ 3049 COSTS_N_INSNS (1), /* constant shift costs */ 3050 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 3051 COSTS_N_INSNS (4), /* HI */ 3052 COSTS_N_INSNS (3), /* SI */ 3053 COSTS_N_INSNS (4), /* DI */ 3054 COSTS_N_INSNS (4)}, /* other */ 3055 0, /* cost of multiply per each bit set */ 3056 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ 3057 COSTS_N_INSNS (22), /* HI */ 3058 COSTS_N_INSNS (30), /* SI */ 3059 COSTS_N_INSNS (74), /* DI */ 3060 COSTS_N_INSNS (74)}, /* other */ 3061 COSTS_N_INSNS (1), /* cost of movsx */ 3062 COSTS_N_INSNS (1), /* cost of movzx */ 3063 8, /* "large" insn */ 3064 17, /* MOVE_RATIO */ 3065 17, /* CLEAR_RATIO */ 3066 {6, 6, 6}, /* cost of loading integer registers 3067 in QImode, HImode and SImode. 3068 Relative to reg-reg move (2). */ 3069 {6, 6, 6}, /* cost of storing integer registers */ 3070 {6, 6, 6, 10, 15}, /* cost of loading SSE register 3071 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3072 {6, 6, 6, 10, 15}, /* cost of storing SSE register 3073 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3074 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ 3075 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ 3076 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 3077 6, /* cost of moving SSE register to integer. */ 3078 18, 6, /* Gather load static, per_elt. */ 3079 18, 6, /* Gather store static, per_elt. */ 3080 32, /* size of l1 cache. */ 3081 512, /* size of l2 cache. */ 3082 64, /* size of prefetch block */ 3083 6, /* number of parallel prefetches */ 3084 /* Benchmarks show large regressions on K8 sixtrack benchmark when this 3085 value is increased to perhaps more appropriate value of 5. */ 3086 3, /* Branch cost */ 3087 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 3088 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 3089 COSTS_N_INSNS (17), /* cost of FDIV instruction. 
*/ 3090 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 3091 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 3092 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ 3093 3094 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 3095 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 3096 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 3097 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 3098 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 3099 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 3100 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 3101 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 3102 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 3103 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 3104 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ 3105 tremont_memcpy, 3106 tremont_memset, 3107 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 3108 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 3109 "16:11:8", /* Loop alignment. */ 3110 "16:11:8", /* Jump alignment. */ 3111 "0:0:8", /* Label alignment. */ 3112 "16", /* Func alignment. */ 3113 }; 3114 3115 /* String-operation algorithm tables (memcpy, memset) for the intel cost table. */ static stringop_algs intel_memcpy[2] = { 3116 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 3117 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 3118 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 3119 static stringop_algs intel_memset[2] = { 3120 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 3121 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 3122 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 3123 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 3124 static const 3125 struct processor_costs intel_cost = { 3126 { 3127 /* Start of register allocator costs. integer->integer move cost is 2. */ 3128 6, /* cost for loading QImode using movzbl */ 3129 {4, 4, 4}, /* cost of loading integer registers 3130 in QImode, HImode and SImode.
3131 Relative to reg-reg move (2). */ 3132 {6, 6, 6}, /* cost of storing integer registers */ 3133 2, /* cost of reg,reg fld/fst */ 3134 {6, 6, 8}, /* cost of loading fp registers 3135 in SFmode, DFmode and XFmode */ 3136 {6, 6, 10}, /* cost of storing fp registers 3137 in SFmode, DFmode and XFmode */ 3138 2, /* cost of moving MMX register */ 3139 {6, 6}, /* cost of loading MMX registers 3140 in SImode and DImode */ 3141 {6, 6}, /* cost of storing MMX registers 3142 in SImode and DImode */ 3143 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 3144 {6, 6, 6, 6, 6}, /* cost of loading SSE registers 3145 in 32,64,128,256 and 512-bit */ 3146 {6, 6, 6, 6, 6}, /* cost of storing SSE registers 3147 in 32,64,128,256 and 512-bit */ 3148 4, 4, /* SSE->integer and integer->SSE moves */ 3149 4, 4, /* mask->integer and integer->mask moves */ 3150 {4, 4, 4}, /* cost of loading mask register 3151 in QImode, HImode, SImode. */ 3152 {6, 6, 6}, /* cost of storing mask register 3153 in QImode, HImode, SImode. */ 3154 2, /* cost of moving mask register. */ 3155 /* End of register allocator costs. 
*/ 3156 }, 3157 3158 COSTS_N_INSNS (1), /* cost of an add instruction */ 3159 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 3160 COSTS_N_INSNS (1), /* variable shift costs */ 3161 COSTS_N_INSNS (1), /* constant shift costs */ 3162 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 3163 COSTS_N_INSNS (3), /* HI */ 3164 COSTS_N_INSNS (3), /* SI */ 3165 COSTS_N_INSNS (4), /* DI */ 3166 COSTS_N_INSNS (2)}, /* other */ 3167 0, /* cost of multiply per each bit set */ 3168 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 3169 COSTS_N_INSNS (26), /* HI */ 3170 COSTS_N_INSNS (42), /* SI */ 3171 COSTS_N_INSNS (74), /* DI */ 3172 COSTS_N_INSNS (74)}, /* other */ 3173 COSTS_N_INSNS (1), /* cost of movsx */ 3174 COSTS_N_INSNS (1), /* cost of movzx */ 3175 8, /* "large" insn */ 3176 17, /* MOVE_RATIO */ 3177 6, /* CLEAR_RATIO */ 3178 {4, 4, 4}, /* cost of loading integer registers 3179 in QImode, HImode and SImode. 3180 Relative to reg-reg move (2). */ 3181 {6, 6, 6}, /* cost of storing integer registers */ 3182 {6, 6, 6, 6, 6}, /* cost of loading SSE register 3183 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3184 {6, 6, 6, 6, 6}, /* cost of storing SSE register 3185 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3186 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 3187 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ 3188 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 3189 4, /* cost of moving SSE register to integer. */ 3190 6, 6, /* Gather load static, per_elt. */ 3191 6, 6, /* Gather store static, per_elt. */ 3192 32, /* size of l1 cache. */ 3193 256, /* size of l2 cache. */ 3194 64, /* size of prefetch block */ 3195 6, /* number of parallel prefetches */ 3196 3, /* Branch cost */ 3197 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 3198 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 3199 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 3200 COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 3201 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 3202 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 3203 3204 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 3205 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 3206 COSTS_N_INSNS (8), /* cost of MULSS instruction. */ 3207 COSTS_N_INSNS (8), /* cost of MULSD instruction. */ 3208 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 3209 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 3210 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ 3211 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 3212 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ 3213 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ 3214 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 3215 intel_memcpy, 3216 intel_memset, 3217 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 3218 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 3219 "16", /* Loop alignment. */ 3220 "16:8:8", /* Jump alignment. */ 3221 "0:0:8", /* Label alignment. */ 3222 "16", /* Func alignment. */ 3223 }; 3224 3225 /* Generic should produce code tuned for Core-i7 (and newer chips) 3226 and btver1 (and newer chips). */ 3227 3228 static stringop_algs generic_memcpy[2] = { 3229 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 3230 {-1, libcall, false}}}, 3231 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 3232 {-1, libcall, false}}}}; 3233 static stringop_algs generic_memset[2] = { 3234 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 3235 {-1, libcall, false}}}, 3236 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 3237 {-1, libcall, false}}}}; 3238 static const 3239 struct processor_costs generic_cost = { 3240 { 3241 /* Start of register allocator costs. integer->integer move cost is 2. */ 3242 6, /* cost for loading QImode using movzbl */ 3243 {6, 6, 6}, /* cost of loading integer registers 3244 in QImode, HImode and SImode. 3245 Relative to reg-reg move (2). 
*/ 3246 {6, 6, 6}, /* cost of storing integer registers */ 3247 4, /* cost of reg,reg fld/fst */ 3248 {6, 6, 12}, /* cost of loading fp registers 3249 in SFmode, DFmode and XFmode */ 3250 {6, 6, 12}, /* cost of storing fp registers 3251 in SFmode, DFmode and XFmode */ 3252 2, /* cost of moving MMX register */ 3253 {6, 6}, /* cost of loading MMX registers 3254 in SImode and DImode */ 3255 {6, 6}, /* cost of storing MMX registers 3256 in SImode and DImode */ 3257 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 3258 {6, 6, 6, 10, 15}, /* cost of loading SSE registers 3259 in 32,64,128,256 and 512-bit */ 3260 {6, 6, 6, 10, 15}, /* cost of storing SSE registers 3261 in 32,64,128,256 and 512-bit */ 3262 6, 6, /* SSE->integer and integer->SSE moves */ 3263 6, 6, /* mask->integer and integer->mask moves */ 3264 {6, 6, 6}, /* cost of loading mask register 3265 in QImode, HImode, SImode. */ 3266 {6, 6, 6}, /* cost of storing mask register 3267 in QImode, HImode, SImode. */ 3268 2, /* cost of moving mask register. */ 3269 /* End of register allocator costs. */ 3270 }, 3271 3272 COSTS_N_INSNS (1), /* cost of an add instruction */ 3273 /* Setting cost to 2 makes our current implementation of synth_mult result in 3274 use of unnecessary temporary registers causing regression on several 3275 SPECfp benchmarks. 
*/ 3276 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 3277 COSTS_N_INSNS (1), /* variable shift costs */ 3278 COSTS_N_INSNS (1), /* constant shift costs */ 3279 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 3280 COSTS_N_INSNS (4), /* HI */ 3281 COSTS_N_INSNS (3), /* SI */ 3282 COSTS_N_INSNS (4), /* DI */ 3283 COSTS_N_INSNS (4)}, /* other */ 3284 0, /* cost of multiply per each bit set */ 3285 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ 3286 COSTS_N_INSNS (22), /* HI */ 3287 COSTS_N_INSNS (30), /* SI */ 3288 COSTS_N_INSNS (74), /* DI */ 3289 COSTS_N_INSNS (74)}, /* other */ 3290 COSTS_N_INSNS (1), /* cost of movsx */ 3291 COSTS_N_INSNS (1), /* cost of movzx */ 3292 8, /* "large" insn */ 3293 17, /* MOVE_RATIO */ 3294 6, /* CLEAR_RATIO */ 3295 {6, 6, 6}, /* cost of loading integer registers 3296 in QImode, HImode and SImode. 3297 Relative to reg-reg move (2). */ 3298 {6, 6, 6}, /* cost of storing integer registers */ 3299 {6, 6, 6, 10, 15}, /* cost of loading SSE register 3300 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3301 {6, 6, 6, 10, 15}, /* cost of storing SSE register 3302 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3303 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ 3304 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ 3305 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 3306 6, /* cost of moving SSE register to integer. */ 3307 18, 6, /* Gather load static, per_elt. */ 3308 18, 6, /* Gather store static, per_elt. */ 3309 32, /* size of l1 cache. */ 3310 512, /* size of l2 cache. */ 3311 64, /* size of prefetch block */ 3312 6, /* number of parallel prefetches */ 3313 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this 3314 value is increased to the perhaps more appropriate value of 5. */ 3315 3, /* Branch cost */ 3316 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 3317 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 3318 COSTS_N_INSNS (17), /* cost of FDIV instruction. 
*/ 3319 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 3320 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 3321 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ 3322 3323 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 3324 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 3325 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 3326 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 3327 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 3328 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 3329 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 3330 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 3331 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 3332 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 3333 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ 3334 generic_memcpy, 3335 generic_memset, 3336 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 3337 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 3338 "16:11:8", /* Loop alignment. */ 3339 "16:11:8", /* Jump alignment. */ 3340 "0:0:8", /* Label alignment. */ 3341 "16", /* Func alignment. */ 3342 }; 3343 3344 /* core_cost should produce code tuned for the Core family of CPUs. */ 3345 static stringop_algs core_memcpy[2] = { 3346 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 3347 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, 3348 {-1, libcall, false}}}}; 3349 static stringop_algs core_memset[2] = { 3350 {libcall, {{6, loop_1_byte, true}, 3351 {24, loop, true}, 3352 {8192, rep_prefix_4_byte, true}, 3353 {-1, libcall, false}}}, 3354 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, 3355 {-1, libcall, false}}}}; 3356 3357 static const 3358 struct processor_costs core_cost = { 3359 { 3360 /* Start of register allocator costs. integer->integer move cost is 2. */ 3361 6, /* cost for loading QImode using movzbl */ 3362 {4, 4, 4}, /* cost of loading integer registers 3363 in QImode, HImode and SImode. 
3364 Relative to reg-reg move (2). */ 3365 {6, 6, 6}, /* cost of storing integer registers */ 3366 2, /* cost of reg,reg fld/fst */ 3367 {6, 6, 8}, /* cost of loading fp registers 3368 in SFmode, DFmode and XFmode */ 3369 {6, 6, 10}, /* cost of storing fp registers 3370 in SFmode, DFmode and XFmode */ 3371 2, /* cost of moving MMX register */ 3372 {6, 6}, /* cost of loading MMX registers 3373 in SImode and DImode */ 3374 {6, 6}, /* cost of storing MMX registers 3375 in SImode and DImode */ 3376 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 3377 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 3378 in 32,64,128,256 and 512-bit */ 3379 {6, 6, 6, 6, 12}, /* cost of storing SSE registers 3380 in 32,64,128,256 and 512-bit */ 3381 6, 6, /* SSE->integer and integer->SSE moves */ 3382 6, 6, /* mask->integer and integer->mask moves */ 3383 {4, 4, 4}, /* cost of loading mask register 3384 in QImode, HImode, SImode. */ 3385 {6, 6, 6}, /* cost of storing mask register 3386 in QImode, HImode, SImode. */ 3387 2, /* cost of moving mask register. */ 3388 /* End of register allocator costs. */ 3389 }, 3390 3391 COSTS_N_INSNS (1), /* cost of an add instruction */ 3392 /* On all chips taken into consideration lea is 2 cycles and more. With 3393 this cost however our current implementation of synth_mult results in 3394 use of unnecessary temporary registers causing regression on several 3395 SPECfp benchmarks. */ 3396 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 3397 COSTS_N_INSNS (1), /* variable shift costs */ 3398 COSTS_N_INSNS (1), /* constant shift costs */ 3399 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 3400 COSTS_N_INSNS (4), /* HI */ 3401 COSTS_N_INSNS (3), /* SI */ 3402 /* Here we tune for Sandybridge or newer. */ 3403 COSTS_N_INSNS (3), /* DI */ 3404 COSTS_N_INSNS (3)}, /* other */ 3405 0, /* cost of multiply per each bit set */ 3406 /* Expanding div/mod currently doesn't consider parallelism. So the cost 3407 model is not realistic. 
We compensate by increasing the latencies a bit. */ 3408 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 3409 COSTS_N_INSNS (11), /* HI */ 3410 COSTS_N_INSNS (14), /* SI */ 3411 COSTS_N_INSNS (81), /* DI */ 3412 COSTS_N_INSNS (81)}, /* other */ 3413 COSTS_N_INSNS (1), /* cost of movsx */ 3414 COSTS_N_INSNS (1), /* cost of movzx */ 3415 8, /* "large" insn */ 3416 17, /* MOVE_RATIO */ 3417 6, /* CLEAR_RATIO */ 3418 {4, 4, 4}, /* cost of loading integer registers 3419 in QImode, HImode and SImode. 3420 Relative to reg-reg move (2). */ 3421 {6, 6, 6}, /* cost of storing integer registers */ 3422 {6, 6, 6, 6, 12}, /* cost of loading SSE register 3423 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3424 {6, 6, 6, 6, 12}, /* cost of storing SSE register 3425 in 32bit, 64bit, 128bit, 256bit and 512bit */ 3426 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 3427 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 3428 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 3429 2, /* cost of moving SSE register to integer. */ 3430 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, 3431 rec. throughput 6. 3432 So 5 uops statically and one uop per load. NOTE(review): the same mnemonic (VGATHERDPD) is named twice above; one was presumably meant to be a different gather instruction -- confirm. */ 3433 10, 6, /* Gather load static, per_elt. */ 3434 10, 6, /* Gather store static, per_elt. */ 3435 64, /* size of l1 cache. */ 3436 512, /* size of l2 cache. */ 3437 64, /* size of prefetch block */ 3438 6, /* number of parallel prefetches */ 3439 /* FIXME perhaps more appropriate value is 5. */ 3440 3, /* Branch cost */ 3441 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 3442 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 3443 /* 10-24 */ 3444 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 3445 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 3446 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 3447 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ 3448 3449 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 3450 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. 
*/ 3451 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 3452 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 3453 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 3454 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 3455 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 3456 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ 3457 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ 3458 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ 3459 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 3460 core_memcpy, 3461 core_memset, 3462 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 3463 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 3464 "16:11:8", /* Loop alignment. */ 3465 "16:11:8", /* Jump alignment. */ 3466 "0:0:8", /* Label alignment. */ 3467 "16", /* Func alignment. */ 3468 }; 3469 3470