1 /* Costs of operations of individual x86 CPUs. 2 Copyright (C) 1988-2020 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GCC is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 Under Section 7 of GPL version 3, you are granted additional 17 permissions described in the GCC Runtime Library Exception, version 18 3.1, as published by the Free Software Foundation. 19 20 You should have received a copy of the GNU General Public License and 21 a copy of the GCC Runtime Library Exception along with this program; 22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 <http://www.gnu.org/licenses/>. */ 24 /* Processor costs (relative to an add) */ 25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ 26 #define COSTS_N_BYTES(N) ((N) * 2) 27 28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} 29 30 static stringop_algs ix86_size_memcpy[2] = { 31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 33 static stringop_algs ix86_size_memset[2] = { 34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 36 37 const 38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */ 39 { 40 /* Start of register allocator costs. integer->integer move cost is 2. */ 41 2, /* cost for loading QImode using movzbl */ 42 {2, 2, 2}, /* cost of loading integer registers 43 in QImode, HImode and SImode. 44 Relative to reg-reg move (2). 
*/ 45 {2, 2, 2}, /* cost of storing integer registers */ 46 2, /* cost of reg,reg fld/fst */ 47 {2, 2, 2}, /* cost of loading fp registers 48 in SFmode, DFmode and XFmode */ 49 {2, 2, 2}, /* cost of storing fp registers 50 in SFmode, DFmode and XFmode */ 51 3, /* cost of moving MMX register */ 52 {3, 3}, /* cost of loading MMX registers 53 in SImode and DImode */ 54 {3, 3}, /* cost of storing MMX registers 55 in SImode and DImode */ 56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers 58 in 32,64,128,256 and 512-bit */ 59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers 60 in 32,64,128,256 and 512-bit */ 61 3, 3, /* SSE->integer and integer->SSE moves */ 62 /* End of register allocator costs. */ 63 }, 64 65 COSTS_N_BYTES (2), /* cost of an add instruction */ 66 COSTS_N_BYTES (3), /* cost of a lea instruction */ 67 COSTS_N_BYTES (2), /* variable shift costs */ 68 COSTS_N_BYTES (3), /* constant shift costs */ 69 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ 70 COSTS_N_BYTES (3), /* HI */ 71 COSTS_N_BYTES (3), /* SI */ 72 COSTS_N_BYTES (3), /* DI */ 73 COSTS_N_BYTES (5)}, /* other */ 74 0, /* cost of multiply per each bit set */ 75 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ 76 COSTS_N_BYTES (3), /* HI */ 77 COSTS_N_BYTES (3), /* SI */ 78 COSTS_N_BYTES (3), /* DI */ 79 COSTS_N_BYTES (5)}, /* other */ 80 COSTS_N_BYTES (3), /* cost of movsx */ 81 COSTS_N_BYTES (3), /* cost of movzx */ 82 0, /* "large" insn */ 83 2, /* MOVE_RATIO */ 84 2, /* CLEAR_RATIO */ 85 {2, 2, 2}, /* cost of loading integer registers 86 in QImode, HImode and SImode. 87 Relative to reg-reg move (2). 
*/ 88 {2, 2, 2}, /* cost of storing integer registers */ 89 {3, 3, 3, 3, 3}, /* cost of loading SSE register 90 in 32bit, 64bit, 128bit, 256bit and 512bit */ 91 {3, 3, 3, 3, 3}, /* cost of storing SSE register 92 in 32bit, 64bit, 128bit, 256bit and 512bit */ 93 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load 94 in 128bit, 256bit and 512bit */ 95 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store 96 in 128bit, 256bit and 512bit */ 97 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 98 3, /* cost of moving SSE register to integer. */ 99 5, 0, /* Gather load static, per_elt. */ 100 5, 0, /* Gather store static, per_elt. */ 101 0, /* size of l1 cache */ 102 0, /* size of l2 cache */ 103 0, /* size of prefetch block */ 104 0, /* number of parallel prefetches */ 105 2, /* Branch cost */ 106 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ 107 COSTS_N_BYTES (2), /* cost of FMUL instruction. */ 108 COSTS_N_BYTES (2), /* cost of FDIV instruction. */ 109 COSTS_N_BYTES (2), /* cost of FABS instruction. */ 110 COSTS_N_BYTES (2), /* cost of FCHS instruction. */ 111 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ 112 113 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ 114 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 115 COSTS_N_BYTES (2), /* cost of MULSS instruction. */ 116 COSTS_N_BYTES (2), /* cost of MULSD instruction. */ 117 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ 118 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ 119 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ 120 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ 121 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */ 122 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ 123 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 124 ix86_size_memcpy, 125 ix86_size_memset, 126 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ 127 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ 128 NULL, /* Loop alignment. */ 129 NULL, /* Jump alignment. 
*/ 130 NULL, /* Label alignment. */ 131 NULL, /* Func alignment. */ 132 }; 133 134 /* Processor costs (relative to an add) */ 135 static stringop_algs i386_memcpy[2] = { 136 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 137 DUMMY_STRINGOP_ALGS}; 138 static stringop_algs i386_memset[2] = { 139 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 140 DUMMY_STRINGOP_ALGS}; 141 142 static const 143 struct processor_costs i386_cost = { /* 386 specific costs */ 144 { 145 /* Start of register allocator costs. integer->integer move cost is 2. */ 146 4, /* cost for loading QImode using movzbl */ 147 {2, 4, 2}, /* cost of loading integer registers 148 in QImode, HImode and SImode. 149 Relative to reg-reg move (2). */ 150 {2, 4, 2}, /* cost of storing integer registers */ 151 2, /* cost of reg,reg fld/fst */ 152 {8, 8, 8}, /* cost of loading fp registers 153 in SFmode, DFmode and XFmode */ 154 {8, 8, 8}, /* cost of storing fp registers 155 in SFmode, DFmode and XFmode */ 156 2, /* cost of moving MMX register */ 157 {4, 8}, /* cost of loading MMX registers 158 in SImode and DImode */ 159 {4, 8}, /* cost of storing MMX registers 160 in SImode and DImode */ 161 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 162 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 163 in 32,64,128,256 and 512-bit */ 164 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 165 in 32,64,128,256 and 512-bit */ 166 3, 3, /* SSE->integer and integer->SSE moves */ 167 /* End of register allocator costs. 
*/ 168 }, 169 170 COSTS_N_INSNS (1), /* cost of an add instruction */ 171 COSTS_N_INSNS (1), /* cost of a lea instruction */ 172 COSTS_N_INSNS (3), /* variable shift costs */ 173 COSTS_N_INSNS (2), /* constant shift costs */ 174 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ 175 COSTS_N_INSNS (6), /* HI */ 176 COSTS_N_INSNS (6), /* SI */ 177 COSTS_N_INSNS (6), /* DI */ 178 COSTS_N_INSNS (6)}, /* other */ 179 COSTS_N_INSNS (1), /* cost of multiply per each bit set */ 180 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ 181 COSTS_N_INSNS (23), /* HI */ 182 COSTS_N_INSNS (23), /* SI */ 183 COSTS_N_INSNS (23), /* DI */ 184 COSTS_N_INSNS (23)}, /* other */ 185 COSTS_N_INSNS (3), /* cost of movsx */ 186 COSTS_N_INSNS (2), /* cost of movzx */ 187 15, /* "large" insn */ 188 3, /* MOVE_RATIO */ 189 3, /* CLEAR_RATIO */ 190 {2, 4, 2}, /* cost of loading integer registers 191 in QImode, HImode and SImode. 192 Relative to reg-reg move (2). */ 193 {2, 4, 2}, /* cost of storing integer registers */ 194 {4, 8, 16, 32, 64}, /* cost of loading SSE register 195 in 32bit, 64bit, 128bit, 256bit and 512bit */ 196 {4, 8, 16, 32, 64}, /* cost of storing SSE register 197 in 32bit, 64bit, 128bit, 256bit and 512bit */ 198 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 199 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 200 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 201 3, /* cost of moving SSE register to integer. */ 202 4, 4, /* Gather load static, per_elt. */ 203 4, 4, /* Gather store static, per_elt. */ 204 0, /* size of l1 cache */ 205 0, /* size of l2 cache */ 206 0, /* size of prefetch block */ 207 0, /* number of parallel prefetches */ 208 1, /* Branch cost */ 209 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ 210 COSTS_N_INSNS (27), /* cost of FMUL instruction. */ 211 COSTS_N_INSNS (88), /* cost of FDIV instruction. */ 212 COSTS_N_INSNS (22), /* cost of FABS instruction. */ 213 COSTS_N_INSNS (24), /* cost of FCHS instruction. 
*/ 214 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ 215 216 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 217 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ 218 COSTS_N_INSNS (27), /* cost of MULSS instruction. */ 219 COSTS_N_INSNS (27), /* cost of MULSD instruction. */ 220 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ 221 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ 222 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ 223 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ 224 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */ 225 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ 226 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 227 i386_memcpy, 228 i386_memset, 229 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 230 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 231 "4", /* Loop alignment. */ 232 "4", /* Jump alignment. */ 233 NULL, /* Label alignment. */ 234 "4", /* Func alignment. */ 235 }; 236 237 static stringop_algs i486_memcpy[2] = { 238 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 239 DUMMY_STRINGOP_ALGS}; 240 static stringop_algs i486_memset[2] = { 241 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 242 DUMMY_STRINGOP_ALGS}; 243 244 static const 245 struct processor_costs i486_cost = { /* 486 specific costs */ 246 { 247 /* Start of register allocator costs. integer->integer move cost is 2. */ 248 4, /* cost for loading QImode using movzbl */ 249 {2, 4, 2}, /* cost of loading integer registers 250 in QImode, HImode and SImode. 251 Relative to reg-reg move (2). 
*/ 252 {2, 4, 2}, /* cost of storing integer registers */ 253 2, /* cost of reg,reg fld/fst */ 254 {8, 8, 8}, /* cost of loading fp registers 255 in SFmode, DFmode and XFmode */ 256 {8, 8, 8}, /* cost of storing fp registers 257 in SFmode, DFmode and XFmode */ 258 2, /* cost of moving MMX register */ 259 {4, 8}, /* cost of loading MMX registers 260 in SImode and DImode */ 261 {4, 8}, /* cost of storing MMX registers 262 in SImode and DImode */ 263 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 264 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 265 in 32,64,128,256 and 512-bit */ 266 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 267 in 32,64,128,256 and 512-bit */ 268 3, 3, /* SSE->integer and integer->SSE moves */ 269 /* End of register allocator costs. */ 270 }, 271 272 COSTS_N_INSNS (1), /* cost of an add instruction */ 273 COSTS_N_INSNS (1), /* cost of a lea instruction */ 274 COSTS_N_INSNS (3), /* variable shift costs */ 275 COSTS_N_INSNS (2), /* constant shift costs */ 276 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ 277 COSTS_N_INSNS (12), /* HI */ 278 COSTS_N_INSNS (12), /* SI */ 279 COSTS_N_INSNS (12), /* DI */ 280 COSTS_N_INSNS (12)}, /* other */ 281 1, /* cost of multiply per each bit set */ 282 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ 283 COSTS_N_INSNS (40), /* HI */ 284 COSTS_N_INSNS (40), /* SI */ 285 COSTS_N_INSNS (40), /* DI */ 286 COSTS_N_INSNS (40)}, /* other */ 287 COSTS_N_INSNS (3), /* cost of movsx */ 288 COSTS_N_INSNS (2), /* cost of movzx */ 289 15, /* "large" insn */ 290 3, /* MOVE_RATIO */ 291 3, /* CLEAR_RATIO */ 292 {2, 4, 2}, /* cost of loading integer registers 293 in QImode, HImode and SImode. 294 Relative to reg-reg move (2). 
*/ 295 {2, 4, 2}, /* cost of storing integer registers */ 296 {4, 8, 16, 32, 64}, /* cost of loading SSE register 297 in 32bit, 64bit, 128bit, 256bit and 512bit */ 298 {4, 8, 16, 32, 64}, /* cost of storing SSE register 299 in 32bit, 64bit, 128bit, 256bit and 512bit */ 300 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 301 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 302 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 303 3, /* cost of moving SSE register to integer. */ 304 4, 4, /* Gather load static, per_elt. */ 305 4, 4, /* Gather store static, per_elt. */ 306 4, /* size of l1 cache. 486 has 8kB cache 307 shared for code and data, so 4kB is 308 not really precise. */ 309 4, /* size of l2 cache */ 310 0, /* size of prefetch block */ 311 0, /* number of parallel prefetches */ 312 1, /* Branch cost */ 313 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 314 COSTS_N_INSNS (16), /* cost of FMUL instruction. */ 315 COSTS_N_INSNS (73), /* cost of FDIV instruction. */ 316 COSTS_N_INSNS (3), /* cost of FABS instruction. */ 317 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 318 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ 319 320 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 321 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 322 COSTS_N_INSNS (16), /* cost of MULSS instruction. */ 323 COSTS_N_INSNS (16), /* cost of MULSD instruction. */ 324 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ 325 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ 326 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */ 327 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */ 328 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ 329 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ 330 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 331 i486_memcpy, 332 i486_memset, 333 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 334 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 335 "16", /* Loop alignment. 
*/ 336 "16", /* Jump alignment. */ 337 "0:0:8", /* Label alignment. */ 338 "16", /* Func alignment. */ 339 }; 340 341 static stringop_algs pentium_memcpy[2] = { 342 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 343 DUMMY_STRINGOP_ALGS}; 344 static stringop_algs pentium_memset[2] = { 345 {libcall, {{-1, rep_prefix_4_byte, false}}}, 346 DUMMY_STRINGOP_ALGS}; 347 348 static const 349 struct processor_costs pentium_cost = { 350 { 351 /* Start of register allocator costs. integer->integer move cost is 2. */ 352 6, /* cost for loading QImode using movzbl */ 353 {2, 4, 2}, /* cost of loading integer registers 354 in QImode, HImode and SImode. 355 Relative to reg-reg move (2). */ 356 {2, 4, 2}, /* cost of storing integer registers */ 357 2, /* cost of reg,reg fld/fst */ 358 {2, 2, 6}, /* cost of loading fp registers 359 in SFmode, DFmode and XFmode */ 360 {4, 4, 6}, /* cost of storing fp registers 361 in SFmode, DFmode and XFmode */ 362 8, /* cost of moving MMX register */ 363 {8, 8}, /* cost of loading MMX registers 364 in SImode and DImode */ 365 {8, 8}, /* cost of storing MMX registers 366 in SImode and DImode */ 367 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 368 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 369 in 32,64,128,256 and 512-bit */ 370 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 371 in 32,64,128,256 and 512-bit */ 372 3, 3, /* SSE->integer and integer->SSE moves */ 373 /* End of register allocator costs. 
*/ 374 }, 375 376 COSTS_N_INSNS (1), /* cost of an add instruction */ 377 COSTS_N_INSNS (1), /* cost of a lea instruction */ 378 COSTS_N_INSNS (4), /* variable shift costs */ 379 COSTS_N_INSNS (1), /* constant shift costs */ 380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 381 COSTS_N_INSNS (11), /* HI */ 382 COSTS_N_INSNS (11), /* SI */ 383 COSTS_N_INSNS (11), /* DI */ 384 COSTS_N_INSNS (11)}, /* other */ 385 0, /* cost of multiply per each bit set */ 386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 387 COSTS_N_INSNS (25), /* HI */ 388 COSTS_N_INSNS (25), /* SI */ 389 COSTS_N_INSNS (25), /* DI */ 390 COSTS_N_INSNS (25)}, /* other */ 391 COSTS_N_INSNS (3), /* cost of movsx */ 392 COSTS_N_INSNS (2), /* cost of movzx */ 393 8, /* "large" insn */ 394 6, /* MOVE_RATIO */ 395 6, /* CLEAR_RATIO */ 396 {2, 4, 2}, /* cost of loading integer registers 397 in QImode, HImode and SImode. 398 Relative to reg-reg move (2). */ 399 {2, 4, 2}, /* cost of storing integer registers */ 400 {4, 8, 16, 32, 64}, /* cost of loading SSE register 401 in 32bit, 64bit, 128bit, 256bit and 512bit */ 402 {4, 8, 16, 32, 64}, /* cost of storing SSE register 403 in 32bit, 64bit, 128bit, 256bit and 512bit */ 404 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 405 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 406 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 407 3, /* cost of moving SSE register to integer. */ 408 4, 4, /* Gather load static, per_elt. */ 409 4, 4, /* Gather store static, per_elt. */ 410 8, /* size of l1 cache. */ 411 8, /* size of l2 cache */ 412 0, /* size of prefetch block */ 413 0, /* number of parallel prefetches */ 414 2, /* Branch cost */ 415 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 416 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 417 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 418 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 419 COSTS_N_INSNS (1), /* cost of FCHS instruction. 
*/ 420 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 421 422 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 423 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 424 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 425 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 426 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 427 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 428 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 429 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ 430 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */ 431 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ 432 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 433 pentium_memcpy, 434 pentium_memset, 435 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 436 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 437 "16:8:8", /* Loop alignment. */ 438 "16:8:8", /* Jump alignment. */ 439 "0:0:8", /* Label alignment. */ 440 "16", /* Func alignment. */ 441 }; 442 443 static const 444 struct processor_costs lakemont_cost = { 445 { 446 /* Start of register allocator costs. integer->integer move cost is 2. */ 447 6, /* cost for loading QImode using movzbl */ 448 {2, 4, 2}, /* cost of loading integer registers 449 in QImode, HImode and SImode. 450 Relative to reg-reg move (2). 
*/ 451 {2, 4, 2}, /* cost of storing integer registers */ 452 2, /* cost of reg,reg fld/fst */ 453 {2, 2, 6}, /* cost of loading fp registers 454 in SFmode, DFmode and XFmode */ 455 {4, 4, 6}, /* cost of storing fp registers 456 in SFmode, DFmode and XFmode */ 457 8, /* cost of moving MMX register */ 458 {8, 8}, /* cost of loading MMX registers 459 in SImode and DImode */ 460 {8, 8}, /* cost of storing MMX registers 461 in SImode and DImode */ 462 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 463 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 464 in 32,64,128,256 and 512-bit */ 465 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 466 in 32,64,128,256 and 512-bit */ 467 3, 3, /* SSE->integer and integer->SSE moves */ 468 /* End of register allocator costs. */ 469 }, 470 471 COSTS_N_INSNS (1), /* cost of an add instruction */ 472 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 473 COSTS_N_INSNS (1), /* variable shift costs */ 474 COSTS_N_INSNS (1), /* constant shift costs */ 475 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 476 COSTS_N_INSNS (11), /* HI */ 477 COSTS_N_INSNS (11), /* SI */ 478 COSTS_N_INSNS (11), /* DI */ 479 COSTS_N_INSNS (11)}, /* other */ 480 0, /* cost of multiply per each bit set */ 481 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 482 COSTS_N_INSNS (25), /* HI */ 483 COSTS_N_INSNS (25), /* SI */ 484 COSTS_N_INSNS (25), /* DI */ 485 COSTS_N_INSNS (25)}, /* other */ 486 COSTS_N_INSNS (3), /* cost of movsx */ 487 COSTS_N_INSNS (2), /* cost of movzx */ 488 8, /* "large" insn */ 489 17, /* MOVE_RATIO */ 490 6, /* CLEAR_RATIO */ 491 {2, 4, 2}, /* cost of loading integer registers 492 in QImode, HImode and SImode. 493 Relative to reg-reg move (2). 
*/ 494 {2, 4, 2}, /* cost of storing integer registers */ 495 {4, 8, 16, 32, 64}, /* cost of loading SSE register 496 in 32bit, 64bit, 128bit, 256bit and 512bit */ 497 {4, 8, 16, 32, 64}, /* cost of storing SSE register 498 in 32bit, 64bit, 128bit, 256bit and 512bit */ 499 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 500 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 501 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 502 3, /* cost of moving SSE register to integer. */ 503 4, 4, /* Gather load static, per_elt. */ 504 4, 4, /* Gather store static, per_elt. */ 505 8, /* size of l1 cache. */ 506 8, /* size of l2 cache */ 507 0, /* size of prefetch block */ 508 0, /* number of parallel prefetches */ 509 2, /* Branch cost */ 510 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 511 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 512 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 513 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 514 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 515 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 516 517 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 518 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 519 COSTS_N_INSNS (5), /* cost of MULSS instruction. */ 520 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 521 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ 522 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ 523 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 524 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 525 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 526 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 527 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 528 pentium_memcpy, 529 pentium_memset, 530 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 531 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 532 "16:8:8", /* Loop alignment. */ 533 "16:8:8", /* Jump alignment. */ 534 "0:0:8", /* Label alignment. 
*/ 535 "16", /* Func alignment. */ 536 }; 537 538 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes 539 (we ensure the alignment). For small blocks inline loop is still a 540 noticeable win, for bigger blocks either rep movsl or rep movsb is 541 way to go. Rep movsb has apparently more expensive startup time in CPU, 542 but after 4K the difference is down in the noise. */ 543 static stringop_algs pentiumpro_memcpy[2] = { 544 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, 545 {8192, rep_prefix_4_byte, false}, 546 {-1, rep_prefix_1_byte, false}}}, 547 DUMMY_STRINGOP_ALGS}; 548 static stringop_algs pentiumpro_memset[2] = { 549 {rep_prefix_4_byte, {{1024, unrolled_loop, false}, 550 {8192, rep_prefix_4_byte, false}, 551 {-1, libcall, false}}}, 552 DUMMY_STRINGOP_ALGS}; 553 static const 554 struct processor_costs pentiumpro_cost = { 555 { 556 /* Start of register allocator costs. integer->integer move cost is 2. */ 557 2, /* cost for loading QImode using movzbl */ 558 {4, 4, 4}, /* cost of loading integer registers 559 in QImode, HImode and SImode. 560 Relative to reg-reg move (2). */ 561 {2, 2, 2}, /* cost of storing integer registers */ 562 2, /* cost of reg,reg fld/fst */ 563 {2, 2, 6}, /* cost of loading fp registers 564 in SFmode, DFmode and XFmode */ 565 {4, 4, 6}, /* cost of storing fp registers 566 in SFmode, DFmode and XFmode */ 567 2, /* cost of moving MMX register */ 568 {2, 2}, /* cost of loading MMX registers 569 in SImode and DImode */ 570 {2, 2}, /* cost of storing MMX registers 571 in SImode and DImode */ 572 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 573 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 574 in 32,64,128,256 and 512-bit */ 575 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 576 in 32,64,128,256 and 512-bit */ 577 3, 3, /* SSE->integer and integer->SSE moves */ 578 /* End of register allocator costs. 
*/ 579 }, 580 581 COSTS_N_INSNS (1), /* cost of an add instruction */ 582 COSTS_N_INSNS (1), /* cost of a lea instruction */ 583 COSTS_N_INSNS (1), /* variable shift costs */ 584 COSTS_N_INSNS (1), /* constant shift costs */ 585 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 586 COSTS_N_INSNS (4), /* HI */ 587 COSTS_N_INSNS (4), /* SI */ 588 COSTS_N_INSNS (4), /* DI */ 589 COSTS_N_INSNS (4)}, /* other */ 590 0, /* cost of multiply per each bit set */ 591 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ 592 COSTS_N_INSNS (17), /* HI */ 593 COSTS_N_INSNS (17), /* SI */ 594 COSTS_N_INSNS (17), /* DI */ 595 COSTS_N_INSNS (17)}, /* other */ 596 COSTS_N_INSNS (1), /* cost of movsx */ 597 COSTS_N_INSNS (1), /* cost of movzx */ 598 8, /* "large" insn */ 599 6, /* MOVE_RATIO */ 600 6, /* CLEAR_RATIO */ 601 {4, 4, 4}, /* cost of loading integer registers 602 in QImode, HImode and SImode. 603 Relative to reg-reg move (2). */ 604 {2, 2, 2}, /* cost of storing integer registers */ 605 {4, 8, 16, 32, 64}, /* cost of loading SSE register 606 in 32bit, 64bit, 128bit, 256bit and 512bit */ 607 {4, 8, 16, 32, 64}, /* cost of storing SSE register 608 in 32bit, 64bit, 128bit, 256bit and 512bit */ 609 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 610 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 611 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 612 3, /* cost of moving SSE register to integer. */ 613 4, 4, /* Gather load static, per_elt. */ 614 4, 4, /* Gather store static, per_elt. */ 615 8, /* size of l1 cache. */ 616 256, /* size of l2 cache */ 617 32, /* size of prefetch block */ 618 6, /* number of parallel prefetches */ 619 2, /* Branch cost */ 620 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 621 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 622 COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 623 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 624 COSTS_N_INSNS (2), /* cost of FCHS instruction. 
*/ 625 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 626 627 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 628 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 629 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 630 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 631 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 632 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 633 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 634 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ 635 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 636 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ 637 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 638 pentiumpro_memcpy, 639 pentiumpro_memset, 640 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 641 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 642 "16", /* Loop alignment. */ 643 "16:11:8", /* Jump alignment. */ 644 "0:0:8", /* Label alignment. */ 645 "16", /* Func alignment. */ 646 }; 647 648 static stringop_algs geode_memcpy[2] = { 649 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 650 DUMMY_STRINGOP_ALGS}; 651 static stringop_algs geode_memset[2] = { 652 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 653 DUMMY_STRINGOP_ALGS}; 654 static const 655 struct processor_costs geode_cost = { 656 { 657 /* Start of register allocator costs. integer->integer move cost is 2. */ 658 2, /* cost for loading QImode using movzbl */ 659 {2, 2, 2}, /* cost of loading integer registers 660 in QImode, HImode and SImode. 661 Relative to reg-reg move (2). 
*/ 662 {2, 2, 2}, /* cost of storing integer registers */ 663 2, /* cost of reg,reg fld/fst */ 664 {2, 2, 2}, /* cost of loading fp registers 665 in SFmode, DFmode and XFmode */ 666 {4, 6, 6}, /* cost of storing fp registers 667 in SFmode, DFmode and XFmode */ 668 2, /* cost of moving MMX register */ 669 {2, 2}, /* cost of loading MMX registers 670 in SImode and DImode */ 671 {2, 2}, /* cost of storing MMX registers 672 in SImode and DImode */ 673 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 674 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 675 in 32,64,128,256 and 512-bit */ 676 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 677 in 32,64,128,256 and 512-bit */ 678 6, 6, /* SSE->integer and integer->SSE moves */ 679 /* End of register allocator costs. */ 680 }, 681 682 COSTS_N_INSNS (1), /* cost of an add instruction */ 683 COSTS_N_INSNS (1), /* cost of a lea instruction */ 684 COSTS_N_INSNS (2), /* variable shift costs */ 685 COSTS_N_INSNS (1), /* constant shift costs */ 686 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 687 COSTS_N_INSNS (4), /* HI */ 688 COSTS_N_INSNS (7), /* SI */ 689 COSTS_N_INSNS (7), /* DI */ 690 COSTS_N_INSNS (7)}, /* other */ 691 0, /* cost of multiply per each bit set */ 692 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ 693 COSTS_N_INSNS (23), /* HI */ 694 COSTS_N_INSNS (39), /* SI */ 695 COSTS_N_INSNS (39), /* DI */ 696 COSTS_N_INSNS (39)}, /* other */ 697 COSTS_N_INSNS (1), /* cost of movsx */ 698 COSTS_N_INSNS (1), /* cost of movzx */ 699 8, /* "large" insn */ 700 4, /* MOVE_RATIO */ 701 4, /* CLEAR_RATIO */ 702 {2, 2, 2}, /* cost of loading integer registers 703 in QImode, HImode and SImode. 704 Relative to reg-reg move (2). 
*/ 705 {2, 2, 2}, /* cost of storing integer registers */ 706 {2, 2, 8, 16, 32}, /* cost of loading SSE register 707 in 32bit, 64bit, 128bit, 256bit and 512bit */ 708 {2, 2, 8, 16, 32}, /* cost of storing SSE register 709 in 32bit, 64bit, 128bit, 256bit and 512bit */ 710 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 711 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 712 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 713 6, /* cost of moving SSE register to integer. */ 714 2, 2, /* Gather load static, per_elt. */ 715 2, 2, /* Gather store static, per_elt. */ 716 64, /* size of l1 cache. */ 717 128, /* size of l2 cache. */ 718 32, /* size of prefetch block */ 719 1, /* number of parallel prefetches */ 720 1, /* Branch cost */ 721 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 722 COSTS_N_INSNS (11), /* cost of FMUL instruction. */ 723 COSTS_N_INSNS (47), /* cost of FDIV instruction. */ 724 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 725 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 726 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ 727 728 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 729 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 730 COSTS_N_INSNS (11), /* cost of MULSS instruction. */ 731 COSTS_N_INSNS (11), /* cost of MULSD instruction. */ 732 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ 733 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */ 734 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ 735 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ 736 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */ 737 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ 738 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 739 geode_memcpy, 740 geode_memset, 741 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 742 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 743 NULL, /* Loop alignment. */ 744 NULL, /* Jump alignment. */ 745 NULL, /* Label alignment. */ 746 NULL, /* Func alignment. 
*/ 747 }; 748 749 static stringop_algs k6_memcpy[2] = { 750 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 751 DUMMY_STRINGOP_ALGS}; 752 static stringop_algs k6_memset[2] = { 753 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 754 DUMMY_STRINGOP_ALGS}; 755 static const 756 struct processor_costs k6_cost = { 757 { 758 /* Start of register allocator costs. integer->integer move cost is 2. */ 759 3, /* cost for loading QImode using movzbl */ 760 {4, 5, 4}, /* cost of loading integer registers 761 in QImode, HImode and SImode. 762 Relative to reg-reg move (2). */ 763 {2, 3, 2}, /* cost of storing integer registers */ 764 4, /* cost of reg,reg fld/fst */ 765 {6, 6, 6}, /* cost of loading fp registers 766 in SFmode, DFmode and XFmode */ 767 {4, 4, 4}, /* cost of storing fp registers 768 in SFmode, DFmode and XFmode */ 769 2, /* cost of moving MMX register */ 770 {2, 2}, /* cost of loading MMX registers 771 in SImode and DImode */ 772 {2, 2}, /* cost of storing MMX registers 773 in SImode and DImode */ 774 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 775 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 776 in 32,64,128,256 and 512-bit */ 777 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 778 in 32,64,128,256 and 512-bit */ 779 6, 6, /* SSE->integer and integer->SSE moves */ 780 /* End of register allocator costs. 
*/ 781 }, 782 783 COSTS_N_INSNS (1), /* cost of an add instruction */ 784 COSTS_N_INSNS (2), /* cost of a lea instruction */ 785 COSTS_N_INSNS (1), /* variable shift costs */ 786 COSTS_N_INSNS (1), /* constant shift costs */ 787 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 788 COSTS_N_INSNS (3), /* HI */ 789 COSTS_N_INSNS (3), /* SI */ 790 COSTS_N_INSNS (3), /* DI */ 791 COSTS_N_INSNS (3)}, /* other */ 792 0, /* cost of multiply per each bit set */ 793 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 794 COSTS_N_INSNS (18), /* HI */ 795 COSTS_N_INSNS (18), /* SI */ 796 COSTS_N_INSNS (18), /* DI */ 797 COSTS_N_INSNS (18)}, /* other */ 798 COSTS_N_INSNS (2), /* cost of movsx */ 799 COSTS_N_INSNS (2), /* cost of movzx */ 800 8, /* "large" insn */ 801 4, /* MOVE_RATIO */ 802 4, /* CLEAR_RATIO */ 803 {4, 5, 4}, /* cost of loading integer registers 804 in QImode, HImode and SImode. 805 Relative to reg-reg move (2). */ 806 {2, 3, 2}, /* cost of storing integer registers */ 807 {2, 2, 8, 16, 32}, /* cost of loading SSE register 808 in 32bit, 64bit, 128bit, 256bit and 512bit */ 809 {2, 2, 8, 16, 32}, /* cost of storing SSE register 810 in 32bit, 64bit, 128bit, 256bit and 512bit */ 811 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 812 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 813 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 814 6, /* cost of moving SSE register to integer. */ 815 2, 2, /* Gather load static, per_elt. */ 816 2, 2, /* Gather store static, per_elt. */ 817 32, /* size of l1 cache. */ 818 32, /* size of l2 cache. Some models 819 have integrated l2 cache, but 820 optimizing for k6 is not important 821 enough to worry about that. */ 822 32, /* size of prefetch block */ 823 1, /* number of parallel prefetches */ 824 1, /* Branch cost */ 825 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ 826 COSTS_N_INSNS (2), /* cost of FMUL instruction. */ 827 COSTS_N_INSNS (56), /* cost of FDIV instruction. 
*/ 828 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 829 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 830 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 831 832 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 833 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 834 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 835 COSTS_N_INSNS (2), /* cost of MULSD instruction. */ 836 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 837 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 838 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */ 839 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ 840 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ 841 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ 842 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 843 k6_memcpy, 844 k6_memset, 845 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 846 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 847 "32:8:8", /* Loop alignment. */ 848 "32:8:8", /* Jump alignment. */ 849 "0:0:8", /* Label alignment. */ 850 "32", /* Func alignment. */ 851 }; 852 853 /* For some reason, Athlon deals better with REP prefix (relative to loops) 854 compared to K8. Alignment becomes important after 8 bytes for memcpy and 855 128 bytes for memset. */ 856 static stringop_algs athlon_memcpy[2] = { 857 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 858 DUMMY_STRINGOP_ALGS}; 859 static stringop_algs athlon_memset[2] = { 860 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 861 DUMMY_STRINGOP_ALGS}; 862 static const 863 struct processor_costs athlon_cost = { 864 { 865 /* Start of register allocator costs. integer->integer move cost is 2. */ 866 4, /* cost for loading QImode using movzbl */ 867 {3, 4, 3}, /* cost of loading integer registers 868 in QImode, HImode and SImode. 869 Relative to reg-reg move (2). 
*/ 870 {3, 4, 3}, /* cost of storing integer registers */ 871 4, /* cost of reg,reg fld/fst */ 872 {4, 4, 12}, /* cost of loading fp registers 873 in SFmode, DFmode and XFmode */ 874 {6, 6, 8}, /* cost of storing fp registers 875 in SFmode, DFmode and XFmode */ 876 2, /* cost of moving MMX register */ 877 {4, 4}, /* cost of loading MMX registers 878 in SImode and DImode */ 879 {4, 4}, /* cost of storing MMX registers 880 in SImode and DImode */ 881 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 882 {4, 4, 12, 12, 24}, /* cost of loading SSE registers 883 in 32,64,128,256 and 512-bit */ 884 {4, 4, 10, 10, 20}, /* cost of storing SSE registers 885 in 32,64,128,256 and 512-bit */ 886 5, 5, /* SSE->integer and integer->SSE moves */ 887 /* End of register allocator costs. */ 888 }, 889 890 COSTS_N_INSNS (1), /* cost of an add instruction */ 891 COSTS_N_INSNS (2), /* cost of a lea instruction */ 892 COSTS_N_INSNS (1), /* variable shift costs */ 893 COSTS_N_INSNS (1), /* constant shift costs */ 894 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ 895 COSTS_N_INSNS (5), /* HI */ 896 COSTS_N_INSNS (5), /* SI */ 897 COSTS_N_INSNS (5), /* DI */ 898 COSTS_N_INSNS (5)}, /* other */ 899 0, /* cost of multiply per each bit set */ 900 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 901 COSTS_N_INSNS (26), /* HI */ 902 COSTS_N_INSNS (42), /* SI */ 903 COSTS_N_INSNS (74), /* DI */ 904 COSTS_N_INSNS (74)}, /* other */ 905 COSTS_N_INSNS (1), /* cost of movsx */ 906 COSTS_N_INSNS (1), /* cost of movzx */ 907 8, /* "large" insn */ 908 9, /* MOVE_RATIO */ 909 6, /* CLEAR_RATIO */ 910 {3, 4, 3}, /* cost of loading integer registers 911 in QImode, HImode and SImode. 912 Relative to reg-reg move (2). 
*/ 913 {3, 4, 3}, /* cost of storing integer registers */ 914 {4, 4, 12, 12, 24}, /* cost of loading SSE register 915 in 32bit, 64bit, 128bit, 256bit and 512bit */ 916 {4, 4, 10, 10, 20}, /* cost of storing SSE register 917 in 32bit, 64bit, 128bit, 256bit and 512bit */ 918 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */ 919 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 920 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 921 5, /* cost of moving SSE register to integer. */ 922 4, 4, /* Gather load static, per_elt. */ 923 4, 4, /* Gather store static, per_elt. */ 924 64, /* size of l1 cache. */ 925 256, /* size of l2 cache. */ 926 64, /* size of prefetch block */ 927 6, /* number of parallel prefetches */ 928 5, /* Branch cost */ 929 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 930 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 931 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 932 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 933 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 934 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 935 936 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 937 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 938 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 939 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 940 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 941 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 942 /* 11-16 */ 943 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 944 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ 945 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 946 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ 947 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 948 athlon_memcpy, 949 athlon_memset, 950 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 951 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 952 "16:8:8", /* Loop alignment. */ 953 "16:8:8", /* Jump alignment. */ 954 "0:0:8", /* Label alignment. 
*/ 955 "16", /* Func alignment. */ 956 }; 957 958 /* K8 has optimized REP instruction for medium sized blocks, but for very 959 small blocks it is better to use loop. For large blocks, libcall can 960 do nontemporary accesses and beat inline considerably. */ 961 static stringop_algs k8_memcpy[2] = { 962 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 963 {-1, rep_prefix_4_byte, false}}}, 964 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 965 {-1, libcall, false}}}}; 966 static stringop_algs k8_memset[2] = { 967 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 968 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 969 {libcall, {{48, unrolled_loop, false}, 970 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 971 static const 972 struct processor_costs k8_cost = { 973 { 974 /* Start of register allocator costs. integer->integer move cost is 2. */ 975 4, /* cost for loading QImode using movzbl */ 976 {3, 4, 3}, /* cost of loading integer registers 977 in QImode, HImode and SImode. 978 Relative to reg-reg move (2). */ 979 {3, 4, 3}, /* cost of storing integer registers */ 980 4, /* cost of reg,reg fld/fst */ 981 {4, 4, 12}, /* cost of loading fp registers 982 in SFmode, DFmode and XFmode */ 983 {6, 6, 8}, /* cost of storing fp registers 984 in SFmode, DFmode and XFmode */ 985 2, /* cost of moving MMX register */ 986 {3, 3}, /* cost of loading MMX registers 987 in SImode and DImode */ 988 {4, 4}, /* cost of storing MMX registers 989 in SImode and DImode */ 990 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 991 {4, 3, 12, 12, 24}, /* cost of loading SSE registers 992 in 32,64,128,256 and 512-bit */ 993 {4, 4, 10, 10, 20}, /* cost of storing SSE registers 994 in 32,64,128,256 and 512-bit */ 995 5, 5, /* SSE->integer and integer->SSE moves */ 996 /* End of register allocator costs. 
*/ 997 }, 998 999 COSTS_N_INSNS (1), /* cost of an add instruction */ 1000 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1001 COSTS_N_INSNS (1), /* variable shift costs */ 1002 COSTS_N_INSNS (1), /* constant shift costs */ 1003 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1004 COSTS_N_INSNS (4), /* HI */ 1005 COSTS_N_INSNS (3), /* SI */ 1006 COSTS_N_INSNS (4), /* DI */ 1007 COSTS_N_INSNS (5)}, /* other */ 1008 0, /* cost of multiply per each bit set */ 1009 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 1010 COSTS_N_INSNS (26), /* HI */ 1011 COSTS_N_INSNS (42), /* SI */ 1012 COSTS_N_INSNS (74), /* DI */ 1013 COSTS_N_INSNS (74)}, /* other */ 1014 COSTS_N_INSNS (1), /* cost of movsx */ 1015 COSTS_N_INSNS (1), /* cost of movzx */ 1016 8, /* "large" insn */ 1017 9, /* MOVE_RATIO */ 1018 6, /* CLEAR_RATIO */ 1019 {3, 4, 3}, /* cost of loading integer registers 1020 in QImode, HImode and SImode. 1021 Relative to reg-reg move (2). */ 1022 {3, 4, 3}, /* cost of storing integer registers */ 1023 {4, 3, 12, 12, 24}, /* cost of loading SSE register 1024 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1025 {4, 4, 10, 10, 20}, /* cost of storing SSE register 1026 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1027 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */ 1028 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 1029 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1030 5, /* cost of moving SSE register to integer. */ 1031 4, 4, /* Gather load static, per_elt. */ 1032 4, 4, /* Gather store static, per_elt. */ 1033 64, /* size of l1 cache. */ 1034 512, /* size of l2 cache. */ 1035 64, /* size of prefetch block */ 1036 /* New AMD processors never drop prefetches; if they cannot be performed 1037 immediately, they are queued. We set number of simultaneous prefetches 1038 to a large constant to reflect this (it probably is not a good idea not 1039 to limit number of prefetches at all, as their execution also takes some 1040 time). 
*/ 1041 100, /* number of parallel prefetches */ 1042 3, /* Branch cost */ 1043 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1044 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1045 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1046 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1047 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1048 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1049 1050 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1051 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1052 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1053 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1054 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 1055 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 1056 /* 11-16 */ 1057 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 1058 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 1059 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 1060 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 1061 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1062 k8_memcpy, 1063 k8_memset, 1064 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1065 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1066 "16:8:8", /* Loop alignment. */ 1067 "16:8:8", /* Jump alignment. */ 1068 "0:0:8", /* Label alignment. */ 1069 "16", /* Func alignment. */ 1070 }; 1071 1072 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for 1073 very small blocks it is better to use loop. For large blocks, libcall can 1074 do nontemporary accesses and beat inline considerably. 
*/ 1075 static stringop_algs amdfam10_memcpy[2] = { 1076 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1077 {-1, rep_prefix_4_byte, false}}}, 1078 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1079 {-1, libcall, false}}}}; 1080 static stringop_algs amdfam10_memset[2] = { 1081 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1082 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1083 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1084 {-1, libcall, false}}}}; 1085 struct processor_costs amdfam10_cost = { 1086 { 1087 /* Start of register allocator costs. integer->integer move cost is 2. */ 1088 4, /* cost for loading QImode using movzbl */ 1089 {3, 4, 3}, /* cost of loading integer registers 1090 in QImode, HImode and SImode. 1091 Relative to reg-reg move (2). */ 1092 {3, 4, 3}, /* cost of storing integer registers */ 1093 4, /* cost of reg,reg fld/fst */ 1094 {4, 4, 12}, /* cost of loading fp registers 1095 in SFmode, DFmode and XFmode */ 1096 {6, 6, 8}, /* cost of storing fp registers 1097 in SFmode, DFmode and XFmode */ 1098 2, /* cost of moving MMX register */ 1099 {3, 3}, /* cost of loading MMX registers 1100 in SImode and DImode */ 1101 {4, 4}, /* cost of storing MMX registers 1102 in SImode and DImode */ 1103 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1104 {4, 4, 3, 6, 12}, /* cost of loading SSE registers 1105 in 32,64,128,256 and 512-bit */ 1106 {4, 4, 5, 10, 20}, /* cost of storing SSE registers 1107 in 32,64,128,256 and 512-bit */ 1108 3, 3, /* SSE->integer and integer->SSE moves */ 1109 1110 /* On K8: 1111 MOVD reg64, xmmreg Double FSTORE 4 1112 MOVD reg32, xmmreg Double FSTORE 4 1113 On AMDFAM10: 1114 MOVD reg64, xmmreg Double FADD 3 1115 1/1 1/1 1116 MOVD reg32, xmmreg Double FADD 3 1117 1/1 1/1 */ 1118 /* End of register allocator costs. 
*/ 1119 }, 1120 1121 COSTS_N_INSNS (1), /* cost of an add instruction */ 1122 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1123 COSTS_N_INSNS (1), /* variable shift costs */ 1124 COSTS_N_INSNS (1), /* constant shift costs */ 1125 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1126 COSTS_N_INSNS (4), /* HI */ 1127 COSTS_N_INSNS (3), /* SI */ 1128 COSTS_N_INSNS (4), /* DI */ 1129 COSTS_N_INSNS (5)}, /* other */ 1130 0, /* cost of multiply per each bit set */ 1131 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1132 COSTS_N_INSNS (35), /* HI */ 1133 COSTS_N_INSNS (51), /* SI */ 1134 COSTS_N_INSNS (83), /* DI */ 1135 COSTS_N_INSNS (83)}, /* other */ 1136 COSTS_N_INSNS (1), /* cost of movsx */ 1137 COSTS_N_INSNS (1), /* cost of movzx */ 1138 8, /* "large" insn */ 1139 9, /* MOVE_RATIO */ 1140 6, /* CLEAR_RATIO */ 1141 {3, 4, 3}, /* cost of loading integer registers 1142 in QImode, HImode and SImode. 1143 Relative to reg-reg move (2). */ 1144 {3, 4, 3}, /* cost of storing integer registers */ 1145 {4, 4, 3, 6, 12}, /* cost of loading SSE register 1146 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1147 {4, 4, 5, 10, 20}, /* cost of storing SSE register 1148 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1149 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ 1150 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 1151 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1152 3, /* cost of moving SSE register to integer. */ 1153 4, 4, /* Gather load static, per_elt. */ 1154 4, 4, /* Gather store static, per_elt. */ 1155 64, /* size of l1 cache. */ 1156 512, /* size of l2 cache. */ 1157 64, /* size of prefetch block */ 1158 /* New AMD processors never drop prefetches; if they cannot be performed 1159 immediately, they are queued. We set number of simultaneous prefetches 1160 to a large constant to reflect this (it probably is not a good idea not 1161 to limit number of prefetches at all, as their execution also takes some 1162 time). 
*/ 1163 100, /* number of parallel prefetches */ 1164 2, /* Branch cost */ 1165 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1166 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1167 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1168 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1169 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1170 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1171 1172 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1173 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1174 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1175 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1176 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 1177 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 1178 /* 11-16 */ 1179 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 1180 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 1181 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 1182 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 1183 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1184 amdfam10_memcpy, 1185 amdfam10_memset, 1186 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1187 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1188 "32:25:8", /* Loop alignment. */ 1189 "32:8:8", /* Jump alignment. */ 1190 "0:0:8", /* Label alignment. */ 1191 "32", /* Func alignment. */ 1192 }; 1193 1194 /* BDVER has optimized REP instruction for medium sized blocks, but for 1195 very small blocks it is better to use loop. For large blocks, libcall 1196 can do nontemporary accesses and beat inline considerably. 
*/ 1197 static stringop_algs bdver_memcpy[2] = { 1198 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1199 {-1, rep_prefix_4_byte, false}}}, 1200 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1201 {-1, libcall, false}}}}; 1202 static stringop_algs bdver_memset[2] = { 1203 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1204 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1205 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1206 {-1, libcall, false}}}}; 1207 1208 const struct processor_costs bdver_cost = { 1209 { 1210 /* Start of register allocator costs. integer->integer move cost is 2. */ 1211 8, /* cost for loading QImode using movzbl */ 1212 {8, 8, 8}, /* cost of loading integer registers 1213 in QImode, HImode and SImode. 1214 Relative to reg-reg move (2). */ 1215 {8, 8, 8}, /* cost of storing integer registers */ 1216 4, /* cost of reg,reg fld/fst */ 1217 {12, 12, 28}, /* cost of loading fp registers 1218 in SFmode, DFmode and XFmode */ 1219 {10, 10, 18}, /* cost of storing fp registers 1220 in SFmode, DFmode and XFmode */ 1221 4, /* cost of moving MMX register */ 1222 {12, 12}, /* cost of loading MMX registers 1223 in SImode and DImode */ 1224 {10, 10}, /* cost of storing MMX registers 1225 in SImode and DImode */ 1226 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1227 {12, 12, 10, 40, 60}, /* cost of loading SSE registers 1228 in 32,64,128,256 and 512-bit */ 1229 {10, 10, 10, 40, 60}, /* cost of storing SSE registers 1230 in 32,64,128,256 and 512-bit */ 1231 16, 20, /* SSE->integer and integer->SSE moves */ 1232 /* End of register allocator costs. 
*/ 1233 }, 1234 1235 COSTS_N_INSNS (1), /* cost of an add instruction */ 1236 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1237 COSTS_N_INSNS (1), /* variable shift costs */ 1238 COSTS_N_INSNS (1), /* constant shift costs */ 1239 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1240 COSTS_N_INSNS (4), /* HI */ 1241 COSTS_N_INSNS (4), /* SI */ 1242 COSTS_N_INSNS (6), /* DI */ 1243 COSTS_N_INSNS (6)}, /* other */ 1244 0, /* cost of multiply per each bit set */ 1245 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1246 COSTS_N_INSNS (35), /* HI */ 1247 COSTS_N_INSNS (51), /* SI */ 1248 COSTS_N_INSNS (83), /* DI */ 1249 COSTS_N_INSNS (83)}, /* other */ 1250 COSTS_N_INSNS (1), /* cost of movsx */ 1251 COSTS_N_INSNS (1), /* cost of movzx */ 1252 8, /* "large" insn */ 1253 9, /* MOVE_RATIO */ 1254 6, /* CLEAR_RATIO */ 1255 {8, 8, 8}, /* cost of loading integer registers 1256 in QImode, HImode and SImode. 1257 Relative to reg-reg move (2). */ 1258 {8, 8, 8}, /* cost of storing integer registers */ 1259 {12, 12, 10, 40, 60}, /* cost of loading SSE register 1260 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1261 {10, 10, 10, 40, 60}, /* cost of storing SSE register 1262 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1263 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ 1264 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 1265 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1266 16, /* cost of moving SSE register to integer. */ 1267 12, 12, /* Gather load static, per_elt. */ 1268 10, 10, /* Gather store static, per_elt. */ 1269 16, /* size of l1 cache. */ 1270 2048, /* size of l2 cache. */ 1271 64, /* size of prefetch block */ 1272 /* New AMD processors never drop prefetches; if they cannot be performed 1273 immediately, they are queued. We set number of simultaneous prefetches 1274 to a large constant to reflect this (it probably is not a good idea not 1275 to limit number of prefetches at all, as their execution also takes some 1276 time). 
*/ 1277 100, /* number of parallel prefetches */ 1278 2, /* Branch cost */ 1279 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1280 COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1281 COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1282 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1283 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1284 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1285 1286 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1287 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1288 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1289 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1290 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1291 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1292 /* 9-24 */ 1293 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1294 /* 9-27 */ 1295 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1296 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1297 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1298 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1299 bdver_memcpy, 1300 bdver_memset, 1301 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1302 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1303 "16:11:8", /* Loop alignment. */ 1304 "16:8:8", /* Jump alignment. */ 1305 "0:0:8", /* Label alignment. */ 1306 "11", /* Func alignment. */ 1307 }; 1308 1309 1310 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for 1311 very small blocks it is better to use loop. For large blocks, libcall 1312 can do nontemporary accesses and beat inline considerably. */ 1313 static stringop_algs znver1_memcpy[2] = { 1314 /* 32-bit tuning. */ 1315 {libcall, {{6, loop, false}, 1316 {14, unrolled_loop, false}, 1317 {-1, libcall, false}}}, 1318 /* 64-bit tuning. */ 1319 {libcall, {{16, loop, false}, 1320 {128, rep_prefix_8_byte, false}, 1321 {-1, libcall, false}}}}; 1322 static stringop_algs znver1_memset[2] = { 1323 /* 32-bit tuning. 
*/ 1324 {libcall, {{8, loop, false}, 1325 {24, unrolled_loop, false}, 1326 {128, rep_prefix_4_byte, false}, 1327 {-1, libcall, false}}}, 1328 /* 64-bit tuning. */ 1329 {libcall, {{48, unrolled_loop, false}, 1330 {128, rep_prefix_8_byte, false}, 1331 {-1, libcall, false}}}}; 1332 struct processor_costs znver1_cost = { 1333 { 1334 /* Start of register allocator costs. integer->integer move cost is 2. */ 1335 1336 /* reg-reg moves are done by renaming and thus they are even cheaper than 1337 1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond 1338 to doubles of latencies, we do not model this correctly. It does not 1339 seem to make practical difference to bump prices up even more. */ 1340 6, /* cost for loading QImode using 1341 movzbl. */ 1342 {6, 6, 6}, /* cost of loading integer registers 1343 in QImode, HImode and SImode. 1344 Relative to reg-reg move (2). */ 1345 {8, 8, 8}, /* cost of storing integer 1346 registers. */ 1347 2, /* cost of reg,reg fld/fst. */ 1348 {6, 6, 16}, /* cost of loading fp registers 1349 in SFmode, DFmode and XFmode. */ 1350 {8, 8, 16}, /* cost of storing fp registers 1351 in SFmode, DFmode and XFmode. */ 1352 2, /* cost of moving MMX register. */ 1353 {6, 6}, /* cost of loading MMX registers 1354 in SImode and DImode. */ 1355 {8, 8}, /* cost of storing MMX registers 1356 in SImode and DImode. */ 1357 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 1358 {6, 6, 6, 12, 24}, /* cost of loading SSE registers 1359 in 32,64,128,256 and 512-bit. */ 1360 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 1361 in 32,64,128,256 and 512-bit. */ 1362 6, 6, /* SSE->integer and integer->SSE moves. */ 1363 /* End of register allocator costs. */ 1364 }, 1365 1366 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1367 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1368 COSTS_N_INSNS (1), /* variable shift costs. */ 1369 COSTS_N_INSNS (1), /* constant shift costs. 
*/ 1370 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1371 COSTS_N_INSNS (3), /* HI. */ 1372 COSTS_N_INSNS (3), /* SI. */ 1373 COSTS_N_INSNS (3), /* DI. */ 1374 COSTS_N_INSNS (3)}, /* other. */ 1375 0, /* cost of multiply per each bit 1376 set. */ 1377 /* Depending on parameters, idiv can get faster on ryzen. This is upper 1378 bound. */ 1379 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1380 COSTS_N_INSNS (22), /* HI. */ 1381 COSTS_N_INSNS (30), /* SI. */ 1382 COSTS_N_INSNS (45), /* DI. */ 1383 COSTS_N_INSNS (45)}, /* other. */ 1384 COSTS_N_INSNS (1), /* cost of movsx. */ 1385 COSTS_N_INSNS (1), /* cost of movzx. */ 1386 8, /* "large" insn. */ 1387 9, /* MOVE_RATIO. */ 1388 6, /* CLEAR_RATIO */ 1389 {6, 6, 6}, /* cost of loading integer registers 1390 in QImode, HImode and SImode. 1391 Relative to reg-reg move (2). */ 1392 {8, 8, 8}, /* cost of storing integer 1393 registers. */ 1394 {6, 6, 6, 12, 24}, /* cost of loading SSE register 1395 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1396 {8, 8, 8, 16, 32}, /* cost of storing SSE register 1397 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1398 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ 1399 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 1400 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 1401 6, /* cost of moving SSE register to integer. */ 1402 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1403 throughput 12. Approx 9 uops do not depend on vector size and every load 1404 is 7 uops. */ 1405 18, 8, /* Gather load static, per_elt. */ 1406 18, 10, /* Gather store static, per_elt. */ 1407 32, /* size of l1 cache. */ 1408 512, /* size of l2 cache. */ 1409 64, /* size of prefetch block. */ 1410 /* New AMD processors never drop prefetches; if they cannot be performed 1411 immediately, they are queued. 
We set number of simultaneous prefetches 1412 to a large constant to reflect this (it probably is not a good idea not 1413 to limit number of prefetches at all, as their execution also takes some 1414 time). */ 1415 100, /* number of parallel prefetches. */ 1416 3, /* Branch cost. */ 1417 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1418 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1419 /* Latency of fdiv is 8-15. */ 1420 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1421 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1422 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1423 /* Latency of fsqrt is 4-10. */ 1424 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1425 1426 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1427 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1428 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1429 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1430 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1431 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1432 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1433 /* 9-13 */ 1434 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1435 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1436 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1437 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles 1438 and it can execute 2 integer additions and 2 multiplications thus 1439 reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests 1440 that 4 works better than 6 probably due to register pressure. 1441 1442 Integer vector operations are taken by FP unit and execute 3 vector 1443 plus/minus operations per cycle but only one multiply. This is adjusted 1444 in ix86_reassociation_width. */ 1445 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1446 znver1_memcpy, 1447 znver1_memset, 1448 COSTS_N_INSNS (4), /* cond_taken_branch_cost. 
*/ 1449 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1450 "16", /* Loop alignment. */ 1451 "16", /* Jump alignment. */ 1452 "0:0:8", /* Label alignment. */ 1453 "16", /* Func alignment. */ 1454 }; 1455 1456 /* ZNVER2 has optimized REP instruction for medium sized blocks, but for 1457 very small blocks it is better to use loop. For large blocks, libcall 1458 can do nontemporary accesses and beat inline considerably. */ 1459 static stringop_algs znver2_memcpy[2] = { 1460 /* 32-bit tuning. */ 1461 {libcall, {{6, loop, false}, 1462 {14, unrolled_loop, false}, 1463 {-1, libcall, false}}}, 1464 /* 64-bit tuning. */ 1465 {libcall, {{16, loop, false}, 1466 {64, rep_prefix_4_byte, false}, 1467 {-1, libcall, false}}}}; 1468 static stringop_algs znver2_memset[2] = { 1469 /* 32-bit tuning. */ 1470 {libcall, {{8, loop, false}, 1471 {24, unrolled_loop, false}, 1472 {128, rep_prefix_4_byte, false}, 1473 {-1, libcall, false}}}, 1474 /* 64-bit tuning. */ 1475 {libcall, {{24, rep_prefix_4_byte, false}, 1476 {128, rep_prefix_8_byte, false}, 1477 {-1, libcall, false}}}}; 1478 1479 struct processor_costs znver2_cost = { 1480 { 1481 /* Start of register allocator costs. integer->integer move cost is 2. */ 1482 1483 /* reg-reg moves are done by renaming and thus they are even cheaper than 1484 1 cycle. Because reg-reg move cost is 2 and following tables correspond 1485 to doubles of latencies, we do not model this correctly. It does not 1486 seem to make practical difference to bump prices up even more. */ 1487 6, /* cost for loading QImode using 1488 movzbl. */ 1489 {6, 6, 6}, /* cost of loading integer registers 1490 in QImode, HImode and SImode. 1491 Relative to reg-reg move (2). */ 1492 {8, 8, 8}, /* cost of storing integer 1493 registers. */ 1494 2, /* cost of reg,reg fld/fst. */ 1495 {6, 6, 16}, /* cost of loading fp registers 1496 in SFmode, DFmode and XFmode. */ 1497 {8, 8, 16}, /* cost of storing fp registers 1498 in SFmode, DFmode and XFmode. 
*/ 1499 2, /* cost of moving MMX register. */ 1500 {6, 6}, /* cost of loading MMX registers 1501 in SImode and DImode. */ 1502 {8, 8}, /* cost of storing MMX registers 1503 in SImode and DImode. */ 1504 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1505 register. */ 1506 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1507 in 32,64,128,256 and 512-bit. */ 1508 {8, 8, 8, 8, 16}, /* cost of storing SSE registers 1509 in 32,64,128,256 and 512-bit. */ 1510 6, 6, /* SSE->integer and integer->SSE 1511 moves. */ 1512 /* End of register allocator costs. */ 1513 }, 1514 1515 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1516 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1517 COSTS_N_INSNS (1), /* variable shift costs. */ 1518 COSTS_N_INSNS (1), /* constant shift costs. */ 1519 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1520 COSTS_N_INSNS (3), /* HI. */ 1521 COSTS_N_INSNS (3), /* SI. */ 1522 COSTS_N_INSNS (3), /* DI. */ 1523 COSTS_N_INSNS (3)}, /* other. */ 1524 0, /* cost of multiply per each bit 1525 set. */ 1526 /* Depending on parameters, idiv can get faster on ryzen. This is upper 1527 bound. */ 1528 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1529 COSTS_N_INSNS (22), /* HI. */ 1530 COSTS_N_INSNS (30), /* SI. */ 1531 COSTS_N_INSNS (45), /* DI. */ 1532 COSTS_N_INSNS (45)}, /* other. */ 1533 COSTS_N_INSNS (1), /* cost of movsx. */ 1534 COSTS_N_INSNS (1), /* cost of movzx. */ 1535 8, /* "large" insn. */ 1536 9, /* MOVE_RATIO. */ 1537 6, /* CLEAR_RATIO */ 1538 {6, 6, 6}, /* cost of loading integer registers 1539 in QImode, HImode and SImode. 1540 Relative to reg-reg move (2). */ 1541 {8, 8, 8}, /* cost of storing integer 1542 registers. */ 1543 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1544 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1545 {8, 8, 8, 8, 16}, /* cost of storing SSE register 1546 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1547 {6, 6, 6, 6, 12}, /* cost of unaligned loads. 
*/ 1548 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1549 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1550 register. */ 1551 6, /* cost of moving SSE register to integer. */ 1552 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1553 throughput 12. Approx 9 uops do not depend on vector size and every load 1554 is 7 uops. */ 1555 18, 8, /* Gather load static, per_elt. */ 1556 18, 10, /* Gather store static, per_elt. */ 1557 32, /* size of l1 cache. */ 1558 512, /* size of l2 cache. */ 1559 64, /* size of prefetch block. */ 1560 /* New AMD processors never drop prefetches; if they cannot be performed 1561 immediately, they are queued. We set number of simultaneous prefetches 1562 to a large constant to reflect this (it probably is not a good idea not 1563 to limit number of prefetches at all, as their execution also takes some 1564 time). */ 1565 100, /* number of parallel prefetches. */ 1566 3, /* Branch cost. */ 1567 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1568 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1569 /* Latency of fdiv is 8-15. */ 1570 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1571 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1572 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1573 /* Latency of fsqrt is 4-10. */ 1574 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1575 1576 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1577 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1578 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1579 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 1580 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1581 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1582 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1583 /* 9-13. */ 1584 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1585 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1586 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. 
*/ 1587 /* Zen can execute 4 integer operations per cycle. FP operations 1588 take 3 cycles and it can execute 2 integer additions and 2 1589 multiplications thus reassociation may make sense up to with of 6. 1590 SPEC2k6 bencharks suggests 1591 that 4 works better than 6 probably due to register pressure. 1592 1593 Integer vector operations are taken by FP unit and execute 3 vector 1594 plus/minus operations per cycle but only one multiply. This is adjusted 1595 in ix86_reassociation_width. */ 1596 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1597 znver2_memcpy, 1598 znver2_memset, 1599 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1600 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1601 "16", /* Loop alignment. */ 1602 "16", /* Jump alignment. */ 1603 "0:0:8", /* Label alignment. */ 1604 "16", /* Func alignment. */ 1605 }; 1606 1607 struct processor_costs znver3_cost = { 1608 { 1609 /* Start of register allocator costs. integer->integer move cost is 2. */ 1610 1611 /* reg-reg moves are done by renaming and thus they are even cheaper than 1612 1 cycle. Because reg-reg move cost is 2 and following tables correspond 1613 to doubles of latencies, we do not model this correctly. It does not 1614 seem to make practical difference to bump prices up even more. */ 1615 6, /* cost for loading QImode using 1616 movzbl. */ 1617 {6, 6, 6}, /* cost of loading integer registers 1618 in QImode, HImode and SImode. 1619 Relative to reg-reg move (2). */ 1620 {8, 8, 8}, /* cost of storing integer 1621 registers. */ 1622 2, /* cost of reg,reg fld/fst. */ 1623 {6, 6, 16}, /* cost of loading fp registers 1624 in SFmode, DFmode and XFmode. */ 1625 {8, 8, 16}, /* cost of storing fp registers 1626 in SFmode, DFmode and XFmode. */ 1627 2, /* cost of moving MMX register. */ 1628 {6, 6}, /* cost of loading MMX registers 1629 in SImode and DImode. */ 1630 {8, 8}, /* cost of storing MMX registers 1631 in SImode and DImode. 
*/ 1632 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1633 register. */ 1634 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1635 in 32,64,128,256 and 512-bit. */ 1636 {8, 8, 8, 8, 16}, /* cost of storing SSE registers 1637 in 32,64,128,256 and 512-bit. */ 1638 6, 6, /* SSE->integer and integer->SSE 1639 moves. */ 1640 /* End of register allocator costs. */ 1641 }, 1642 1643 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1644 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1645 COSTS_N_INSNS (1), /* variable shift costs. */ 1646 COSTS_N_INSNS (1), /* constant shift costs. */ 1647 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1648 COSTS_N_INSNS (3), /* HI. */ 1649 COSTS_N_INSNS (3), /* SI. */ 1650 COSTS_N_INSNS (3), /* DI. */ 1651 COSTS_N_INSNS (3)}, /* other. */ 1652 0, /* cost of multiply per each bit 1653 set. */ 1654 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ 1655 COSTS_N_INSNS (10), /* HI. */ 1656 COSTS_N_INSNS (12), /* SI. */ 1657 COSTS_N_INSNS (17), /* DI. */ 1658 COSTS_N_INSNS (17)}, /* other. */ 1659 COSTS_N_INSNS (1), /* cost of movsx. */ 1660 COSTS_N_INSNS (1), /* cost of movzx. */ 1661 8, /* "large" insn. */ 1662 9, /* MOVE_RATIO. */ 1663 6, /* CLEAR_RATIO */ 1664 {6, 6, 6}, /* cost of loading integer registers 1665 in QImode, HImode and SImode. 1666 Relative to reg-reg move (2). */ 1667 {8, 8, 8}, /* cost of storing integer 1668 registers. */ 1669 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1670 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1671 {8, 8, 8, 8, 16}, /* cost of storing SSE register 1672 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1673 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 1674 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1675 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1676 register. */ 1677 6, /* cost of moving SSE register to integer. */ 1678 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1679 throughput 12. 
Approx 9 uops do not depend on vector size and every load 1680 is 7 uops. */ 1681 18, 8, /* Gather load static, per_elt. */ 1682 18, 10, /* Gather store static, per_elt. */ 1683 32, /* size of l1 cache. */ 1684 512, /* size of l2 cache. */ 1685 64, /* size of prefetch block. */ 1686 /* New AMD processors never drop prefetches; if they cannot be performed 1687 immediately, they are queued. We set number of simultaneous prefetches 1688 to a large constant to reflect this (it probably is not a good idea not 1689 to limit number of prefetches at all, as their execution also takes some 1690 time). */ 1691 100, /* number of parallel prefetches. */ 1692 3, /* Branch cost. */ 1693 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1694 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1695 /* Latency of fdiv is 8-15. */ 1696 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1697 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1698 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1699 /* Latency of fsqrt is 4-10. */ 1700 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1701 1702 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1703 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1704 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1705 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 1706 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1707 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1708 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1709 /* 9-13. */ 1710 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1711 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1712 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1713 /* Zen can execute 4 integer operations per cycle. FP operations 1714 take 3 cycles and it can execute 2 integer additions and 2 1715 multiplications thus reassociation may make sense up to with of 6. 
1716 SPEC2k6 bencharks suggests 1717 that 4 works better than 6 probably due to register pressure. 1718 1719 Integer vector operations are taken by FP unit and execute 3 vector 1720 plus/minus operations per cycle but only one multiply. This is adjusted 1721 in ix86_reassociation_width. */ 1722 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1723 znver2_memcpy, 1724 znver2_memset, 1725 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1726 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1727 "16", /* Loop alignment. */ 1728 "16", /* Jump alignment. */ 1729 "0:0:8", /* Label alignment. */ 1730 "16", /* Func alignment. */ 1731 }; 1732 1733 /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ 1734 static stringop_algs skylake_memcpy[2] = { 1735 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 1736 {libcall, {{16, loop, false}, {512, unrolled_loop, false}, 1737 {-1, libcall, false}}}}; 1738 1739 static stringop_algs skylake_memset[2] = { 1740 {libcall, {{6, loop_1_byte, true}, 1741 {24, loop, true}, 1742 {8192, rep_prefix_4_byte, true}, 1743 {-1, libcall, false}}}, 1744 {libcall, {{24, loop, true}, {512, unrolled_loop, false}, 1745 {-1, libcall, false}}}}; 1746 1747 static const 1748 struct processor_costs skylake_cost = { 1749 { 1750 /* Start of register allocator costs. integer->integer move cost is 2. */ 1751 6, /* cost for loading QImode using movzbl */ 1752 {4, 4, 4}, /* cost of loading integer registers 1753 in QImode, HImode and SImode. 1754 Relative to reg-reg move (2). 
*/ 1755 {6, 6, 6}, /* cost of storing integer registers */ 1756 2, /* cost of reg,reg fld/fst */ 1757 {6, 6, 8}, /* cost of loading fp registers 1758 in SFmode, DFmode and XFmode */ 1759 {6, 6, 10}, /* cost of storing fp registers 1760 in SFmode, DFmode and XFmode */ 1761 2, /* cost of moving MMX register */ 1762 {6, 6}, /* cost of loading MMX registers 1763 in SImode and DImode */ 1764 {6, 6}, /* cost of storing MMX registers 1765 in SImode and DImode */ 1766 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 1767 {6, 6, 6, 10, 20}, /* cost of loading SSE registers 1768 in 32,64,128,256 and 512-bit */ 1769 {8, 8, 8, 12, 24}, /* cost of storing SSE registers 1770 in 32,64,128,256 and 512-bit */ 1771 6, 6, /* SSE->integer and integer->SSE moves */ 1772 /* End of register allocator costs. */ 1773 }, 1774 1775 COSTS_N_INSNS (1), /* cost of an add instruction */ 1776 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ 1777 COSTS_N_INSNS (1), /* variable shift costs */ 1778 COSTS_N_INSNS (1), /* constant shift costs */ 1779 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1780 COSTS_N_INSNS (4), /* HI */ 1781 COSTS_N_INSNS (3), /* SI */ 1782 COSTS_N_INSNS (3), /* DI */ 1783 COSTS_N_INSNS (3)}, /* other */ 1784 0, /* cost of multiply per each bit set */ 1785 /* Expanding div/mod currently doesn't consider parallelism. So the cost 1786 model is not realistic. We compensate by increasing the latencies a bit. */ 1787 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 1788 COSTS_N_INSNS (11), /* HI */ 1789 COSTS_N_INSNS (14), /* SI */ 1790 COSTS_N_INSNS (76), /* DI */ 1791 COSTS_N_INSNS (76)}, /* other */ 1792 COSTS_N_INSNS (1), /* cost of movsx */ 1793 COSTS_N_INSNS (0), /* cost of movzx */ 1794 8, /* "large" insn */ 1795 17, /* MOVE_RATIO */ 1796 6, /* CLEAR_RATIO */ 1797 {4, 4, 4}, /* cost of loading integer registers 1798 in QImode, HImode and SImode. 1799 Relative to reg-reg move (2). 
*/ 1800 {6, 6, 6}, /* cost of storing integer registers */ 1801 {6, 6, 6, 10, 20}, /* cost of loading SSE register 1802 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1803 {8, 8, 8, 12, 24}, /* cost of storing SSE register 1804 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1805 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 1806 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1807 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 1808 2, /* cost of moving SSE register to integer. */ 1809 20, 8, /* Gather load static, per_elt. */ 1810 22, 10, /* Gather store static, per_elt. */ 1811 64, /* size of l1 cache. */ 1812 512, /* size of l2 cache. */ 1813 64, /* size of prefetch block */ 1814 6, /* number of parallel prefetches */ 1815 3, /* Branch cost */ 1816 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 1817 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1818 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 1819 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1820 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1821 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ 1822 1823 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1824 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1825 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1826 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1827 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 1828 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 1829 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ 1830 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ 1831 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ 1832 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 1833 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 1834 skylake_memcpy, 1835 skylake_memset, 1836 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1837 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1838 "16:11:8", /* Loop alignment. */ 1839 "16:11:8", /* Jump alignment. 
*/ 1840 "0:0:8", /* Label alignment. */ 1841 "16", /* Func alignment. */ 1842 }; 1843 /* BTVER1 has optimized REP instruction for medium sized blocks, but for 1844 very small blocks it is better to use loop. For large blocks, libcall can 1845 do nontemporary accesses and beat inline considerably. */ 1846 static stringop_algs btver1_memcpy[2] = { 1847 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1848 {-1, rep_prefix_4_byte, false}}}, 1849 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1850 {-1, libcall, false}}}}; 1851 static stringop_algs btver1_memset[2] = { 1852 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1853 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1854 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1855 {-1, libcall, false}}}}; 1856 const struct processor_costs btver1_cost = { 1857 { 1858 /* Start of register allocator costs. integer->integer move cost is 2. */ 1859 8, /* cost for loading QImode using movzbl */ 1860 {6, 8, 6}, /* cost of loading integer registers 1861 in QImode, HImode and SImode. 1862 Relative to reg-reg move (2). */ 1863 {6, 8, 6}, /* cost of storing integer registers */ 1864 4, /* cost of reg,reg fld/fst */ 1865 {12, 12, 28}, /* cost of loading fp registers 1866 in SFmode, DFmode and XFmode */ 1867 {12, 12, 38}, /* cost of storing fp registers 1868 in SFmode, DFmode and XFmode */ 1869 4, /* cost of moving MMX register */ 1870 {10, 10}, /* cost of loading MMX registers 1871 in SImode and DImode */ 1872 {12, 12}, /* cost of storing MMX registers 1873 in SImode and DImode */ 1874 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1875 {10, 10, 12, 48, 96}, /* cost of loading SSE registers 1876 in 32,64,128,256 and 512-bit */ 1877 {10, 10, 12, 48, 96}, /* cost of storing SSE registers 1878 in 32,64,128,256 and 512-bit */ 1879 14, 14, /* SSE->integer and integer->SSE moves */ 1880 /* End of register allocator costs. 
*/ 1881 }, 1882 1883 COSTS_N_INSNS (1), /* cost of an add instruction */ 1884 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1885 COSTS_N_INSNS (1), /* variable shift costs */ 1886 COSTS_N_INSNS (1), /* constant shift costs */ 1887 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1888 COSTS_N_INSNS (4), /* HI */ 1889 COSTS_N_INSNS (3), /* SI */ 1890 COSTS_N_INSNS (4), /* DI */ 1891 COSTS_N_INSNS (5)}, /* other */ 1892 0, /* cost of multiply per each bit set */ 1893 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1894 COSTS_N_INSNS (35), /* HI */ 1895 COSTS_N_INSNS (51), /* SI */ 1896 COSTS_N_INSNS (83), /* DI */ 1897 COSTS_N_INSNS (83)}, /* other */ 1898 COSTS_N_INSNS (1), /* cost of movsx */ 1899 COSTS_N_INSNS (1), /* cost of movzx */ 1900 8, /* "large" insn */ 1901 9, /* MOVE_RATIO */ 1902 6, /* CLEAR_RATIO */ 1903 {6, 8, 6}, /* cost of loading integer registers 1904 in QImode, HImode and SImode. 1905 Relative to reg-reg move (2). */ 1906 {6, 8, 6}, /* cost of storing integer registers */ 1907 {10, 10, 12, 48, 96}, /* cost of loading SSE register 1908 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1909 {10, 10, 12, 48, 96}, /* cost of storing SSE register 1910 in 32bit, 64bit, 128bit, 256bit and 512bit */ 1911 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ 1912 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 1913 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1914 14, /* cost of moving SSE register to integer. */ 1915 10, 10, /* Gather load static, per_elt. */ 1916 10, 10, /* Gather store static, per_elt. */ 1917 32, /* size of l1 cache. */ 1918 512, /* size of l2 cache. */ 1919 64, /* size of prefetch block */ 1920 100, /* number of parallel prefetches */ 1921 2, /* Branch cost */ 1922 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1923 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1924 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1925 COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ 1926 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1927 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1928 1929 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1930 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1931 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 1932 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1933 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1934 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1935 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1936 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 1937 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 1938 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ 1939 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1940 btver1_memcpy, 1941 btver1_memset, 1942 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1943 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1944 "16:11:8", /* Loop alignment. */ 1945 "16:8:8", /* Jump alignment. */ 1946 "0:0:8", /* Label alignment. */ 1947 "11", /* Func alignment. */ 1948 }; 1949 1950 static stringop_algs btver2_memcpy[2] = { 1951 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1952 {-1, rep_prefix_4_byte, false}}}, 1953 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1954 {-1, libcall, false}}}}; 1955 static stringop_algs btver2_memset[2] = { 1956 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1957 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1958 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1959 {-1, libcall, false}}}}; 1960 const struct processor_costs btver2_cost = { 1961 { 1962 /* Start of register allocator costs. integer->integer move cost is 2. */ 1963 8, /* cost for loading QImode using movzbl */ 1964 {8, 8, 6}, /* cost of loading integer registers 1965 in QImode, HImode and SImode. 1966 Relative to reg-reg move (2). 
*/ 1967 {8, 8, 6}, /* cost of storing integer registers */ 1968 4, /* cost of reg,reg fld/fst */ 1969 {12, 12, 28}, /* cost of loading fp registers 1970 in SFmode, DFmode and XFmode */ 1971 {12, 12, 38}, /* cost of storing fp registers 1972 in SFmode, DFmode and XFmode */ 1973 4, /* cost of moving MMX register */ 1974 {10, 10}, /* cost of loading MMX registers 1975 in SImode and DImode */ 1976 {12, 12}, /* cost of storing MMX registers 1977 in SImode and DImode */ 1978 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1979 {10, 10, 12, 48, 96}, /* cost of loading SSE registers 1980 in 32,64,128,256 and 512-bit */ 1981 {10, 10, 12, 48, 96}, /* cost of storing SSE registers 1982 in 32,64,128,256 and 512-bit */ 1983 14, 14, /* SSE->integer and integer->SSE moves */ 1984 /* End of register allocator costs. */ 1985 }, 1986 1987 COSTS_N_INSNS (1), /* cost of an add instruction */ 1988 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1989 COSTS_N_INSNS (1), /* variable shift costs */ 1990 COSTS_N_INSNS (1), /* constant shift costs */ 1991 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1992 COSTS_N_INSNS (4), /* HI */ 1993 COSTS_N_INSNS (3), /* SI */ 1994 COSTS_N_INSNS (4), /* DI */ 1995 COSTS_N_INSNS (5)}, /* other */ 1996 0, /* cost of multiply per each bit set */ 1997 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1998 COSTS_N_INSNS (35), /* HI */ 1999 COSTS_N_INSNS (51), /* SI */ 2000 COSTS_N_INSNS (83), /* DI */ 2001 COSTS_N_INSNS (83)}, /* other */ 2002 COSTS_N_INSNS (1), /* cost of movsx */ 2003 COSTS_N_INSNS (1), /* cost of movzx */ 2004 8, /* "large" insn */ 2005 9, /* MOVE_RATIO */ 2006 6, /* CLEAR_RATIO */ 2007 {8, 8, 6}, /* cost of loading integer registers 2008 in QImode, HImode and SImode. 2009 Relative to reg-reg move (2). 
*/ 2010 {8, 8, 6}, /* cost of storing integer registers */ 2011 {10, 10, 12, 48, 96}, /* cost of loading SSE register 2012 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2013 {10, 10, 12, 48, 96}, /* cost of storing SSE register 2014 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2015 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ 2016 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2017 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2018 14, /* cost of moving SSE register to integer. */ 2019 10, 10, /* Gather load static, per_elt. */ 2020 10, 10, /* Gather store static, per_elt. */ 2021 32, /* size of l1 cache. */ 2022 2048, /* size of l2 cache. */ 2023 64, /* size of prefetch block */ 2024 100, /* number of parallel prefetches */ 2025 2, /* Branch cost */ 2026 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 2027 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 2028 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 2029 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 2030 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 2031 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 2032 2033 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2034 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2035 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 2036 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 2037 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2038 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2039 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2040 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ 2041 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ 2042 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ 2043 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2044 btver2_memcpy, 2045 btver2_memset, 2046 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 2047 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2048 "16:11:8", /* Loop alignment. */ 2049 "16:8:8", /* Jump alignment. 
*/ 2050 "0:0:8", /* Label alignment. */ 2051 "11", /* Func alignment. */ 2052 }; 2053 2054 static stringop_algs pentium4_memcpy[2] = { 2055 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 2056 DUMMY_STRINGOP_ALGS}; 2057 static stringop_algs pentium4_memset[2] = { 2058 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 2059 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2060 DUMMY_STRINGOP_ALGS}; 2061 2062 static const 2063 struct processor_costs pentium4_cost = { 2064 { 2065 /* Start of register allocator costs. integer->integer move cost is 2. */ 2066 5, /* cost for loading QImode using movzbl */ 2067 {4, 5, 4}, /* cost of loading integer registers 2068 in QImode, HImode and SImode. 2069 Relative to reg-reg move (2). */ 2070 {2, 3, 2}, /* cost of storing integer registers */ 2071 12, /* cost of reg,reg fld/fst */ 2072 {14, 14, 14}, /* cost of loading fp registers 2073 in SFmode, DFmode and XFmode */ 2074 {14, 14, 14}, /* cost of storing fp registers 2075 in SFmode, DFmode and XFmode */ 2076 12, /* cost of moving MMX register */ 2077 {16, 16}, /* cost of loading MMX registers 2078 in SImode and DImode */ 2079 {16, 16}, /* cost of storing MMX registers 2080 in SImode and DImode */ 2081 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 2082 {16, 16, 16, 32, 64}, /* cost of loading SSE registers 2083 in 32,64,128,256 and 512-bit */ 2084 {16, 16, 16, 32, 64}, /* cost of storing SSE registers 2085 in 32,64,128,256 and 512-bit */ 2086 20, 12, /* SSE->integer and integer->SSE moves */ 2087 /* End of register allocator costs. 
*/ 2088 }, 2089 2090 COSTS_N_INSNS (1), /* cost of an add instruction */ 2091 COSTS_N_INSNS (3), /* cost of a lea instruction */ 2092 COSTS_N_INSNS (4), /* variable shift costs */ 2093 COSTS_N_INSNS (4), /* constant shift costs */ 2094 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ 2095 COSTS_N_INSNS (15), /* HI */ 2096 COSTS_N_INSNS (15), /* SI */ 2097 COSTS_N_INSNS (15), /* DI */ 2098 COSTS_N_INSNS (15)}, /* other */ 2099 0, /* cost of multiply per each bit set */ 2100 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ 2101 COSTS_N_INSNS (56), /* HI */ 2102 COSTS_N_INSNS (56), /* SI */ 2103 COSTS_N_INSNS (56), /* DI */ 2104 COSTS_N_INSNS (56)}, /* other */ 2105 COSTS_N_INSNS (1), /* cost of movsx */ 2106 COSTS_N_INSNS (1), /* cost of movzx */ 2107 16, /* "large" insn */ 2108 6, /* MOVE_RATIO */ 2109 6, /* CLEAR_RATIO */ 2110 {4, 5, 4}, /* cost of loading integer registers 2111 in QImode, HImode and SImode. 2112 Relative to reg-reg move (2). */ 2113 {2, 3, 2}, /* cost of storing integer registers */ 2114 {16, 16, 16, 32, 64}, /* cost of loading SSE register 2115 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2116 {16, 16, 16, 32, 64}, /* cost of storing SSE register 2117 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2118 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ 2119 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 2120 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 2121 20, /* cost of moving SSE register to integer. */ 2122 16, 16, /* Gather load static, per_elt. */ 2123 16, 16, /* Gather store static, per_elt. */ 2124 8, /* size of l1 cache. */ 2125 256, /* size of l2 cache. */ 2126 64, /* size of prefetch block */ 2127 6, /* number of parallel prefetches */ 2128 2, /* Branch cost */ 2129 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 2130 COSTS_N_INSNS (7), /* cost of FMUL instruction. */ 2131 COSTS_N_INSNS (43), /* cost of FDIV instruction. */ 2132 COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ 2133 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 2134 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ 2135 2136 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 2137 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 2138 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 2139 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 2140 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2141 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2142 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ 2143 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ 2144 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ 2145 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ 2146 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2147 pentium4_memcpy, 2148 pentium4_memset, 2149 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2150 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2151 NULL, /* Loop alignment. */ 2152 NULL, /* Jump alignment. */ 2153 NULL, /* Label alignment. */ 2154 NULL, /* Func alignment. */ 2155 }; 2156 /* nocona_memcpy and nocona_memset: string-operation strategy tables that nocona_cost below refers to by name. Each has two entries; presumably one per target word size, as only the second entry uses rep_prefix_8_byte -- TODO confirm against the stringop_algs declaration. */ 2157 static stringop_algs nocona_memcpy[2] = { 2158 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 2159 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, 2160 {100000, unrolled_loop, false}, {-1, libcall, false}}}}; 2161 2162 static stringop_algs nocona_memset[2] = { 2163 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 2164 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2165 {libcall, {{24, loop, false}, {64, unrolled_loop, false}, 2166 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2167 /* nocona_cost: positional initializer for struct processor_costs (declared outside this chunk). The comment following each entry names the field it fills. */ 2168 static const 2169 struct processor_costs nocona_cost = { 2170 { 2171 /* Start of register allocator costs. integer->integer move cost is 2. */ 2172 4, /* cost for loading QImode using movzbl */ 2173 {4, 4, 4}, /* cost of loading integer registers 2174 in QImode, HImode and SImode. 2175 Relative to reg-reg move (2). 
*/ 2176 {4, 4, 4}, /* cost of storing integer registers */ 2177 12, /* cost of reg,reg fld/fst */ 2178 {14, 14, 14}, /* cost of loading fp registers 2179 in SFmode, DFmode and XFmode */ 2180 {14, 14, 14}, /* cost of storing fp registers 2181 in SFmode, DFmode and XFmode */ 2182 14, /* cost of moving MMX register */ 2183 {12, 12}, /* cost of loading MMX registers 2184 in SImode and DImode */ 2185 {12, 12}, /* cost of storing MMX registers 2186 in SImode and DImode */ 2187 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 2188 {12, 12, 12, 24, 48}, /* cost of loading SSE registers 2189 in 32,64,128,256 and 512-bit */ 2190 {12, 12, 12, 24, 48}, /* cost of storing SSE registers 2191 in 32,64,128,256 and 512-bit */ 2192 20, 12, /* SSE->integer and integer->SSE moves */ 2193 /* End of register allocator costs. */ 2194 }, 2195 2196 COSTS_N_INSNS (1), /* cost of an add instruction */ 2197 COSTS_N_INSNS (1), /* cost of a lea instruction */ 2198 COSTS_N_INSNS (1), /* variable shift costs */ 2199 COSTS_N_INSNS (1), /* constant shift costs */ 2200 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ 2201 COSTS_N_INSNS (10), /* HI */ 2202 COSTS_N_INSNS (10), /* SI */ 2203 COSTS_N_INSNS (10), /* DI */ 2204 COSTS_N_INSNS (10)}, /* other */ 2205 0, /* cost of multiply per each bit set */ 2206 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ 2207 COSTS_N_INSNS (66), /* HI */ 2208 COSTS_N_INSNS (66), /* SI */ 2209 COSTS_N_INSNS (66), /* DI */ 2210 COSTS_N_INSNS (66)}, /* other */ 2211 COSTS_N_INSNS (1), /* cost of movsx */ 2212 COSTS_N_INSNS (1), /* cost of movzx */ 2213 16, /* "large" insn */ 2214 17, /* MOVE_RATIO */ 2215 6, /* CLEAR_RATIO */ 2216 {4, 4, 4}, /* cost of loading integer registers 2217 in QImode, HImode and SImode. 2218 Relative to reg-reg move (2). 
*/ 2219 {4, 4, 4}, /* cost of storing integer registers */ 2220 {12, 12, 12, 24, 48}, /* cost of loading SSE register 2221 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2222 {12, 12, 12, 24, 48}, /* cost of storing SSE register 2223 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2224 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ 2225 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 2226 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 2227 20, /* cost of moving SSE register to integer. */ 2228 12, 12, /* Gather load static, per_elt. */ 2229 12, 12, /* Gather store static, per_elt. */ 2230 8, /* size of l1 cache. */ 2231 1024, /* size of l2 cache. */ 2232 64, /* size of prefetch block */ 2233 8, /* number of parallel prefetches */ 2234 1, /* Branch cost */ 2235 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 2236 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2237 COSTS_N_INSNS (40), /* cost of FDIV instruction. */ 2238 COSTS_N_INSNS (3), /* cost of FABS instruction. */ 2239 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 2240 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ 2241 2242 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 2243 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 2244 COSTS_N_INSNS (7), /* cost of MULSS instruction. */ 2245 COSTS_N_INSNS (7), /* cost of MULSD instruction. */ 2246 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 2247 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 2248 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ 2249 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ 2250 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ 2251 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ 2252 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2253 nocona_memcpy, 2254 nocona_memset, 2255 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2256 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2257 NULL, /* Loop alignment. */ 2258 NULL, /* Jump alignment. 
*/ 2259 NULL, /* Label alignment. */ 2260 NULL, /* Func alignment. */ 2261 }; 2262 /* atom_memcpy and atom_memset: string-operation strategy tables that atom_cost below refers to by name. Each has two entries; presumably one per target word size, as only the second entry uses rep_prefix_8_byte -- TODO confirm against the stringop_algs declaration. */ 2263 static stringop_algs atom_memcpy[2] = { 2264 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2265 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2266 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2267 static stringop_algs atom_memset[2] = { 2268 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2269 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2270 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2271 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; /* atom_cost: positional initializer for struct processor_costs (declared outside this chunk). The comment following each entry names the field it fills. */ 2272 static const 2273 struct processor_costs atom_cost = { 2274 { 2275 /* Start of register allocator costs. integer->integer move cost is 2. */ 2276 6, /* cost for loading QImode using movzbl */ 2277 {6, 6, 6}, /* cost of loading integer registers 2278 in QImode, HImode and SImode. 2279 Relative to reg-reg move (2). */ 2280 {6, 6, 6}, /* cost of storing integer registers */ 2281 4, /* cost of reg,reg fld/fst */ 2282 {6, 6, 18}, /* cost of loading fp registers 2283 in SFmode, DFmode and XFmode */ 2284 {14, 14, 24}, /* cost of storing fp registers 2285 in SFmode, DFmode and XFmode */ 2286 2, /* cost of moving MMX register */ 2287 {8, 8}, /* cost of loading MMX registers 2288 in SImode and DImode */ 2289 {10, 10}, /* cost of storing MMX registers 2290 in SImode and DImode */ 2291 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2292 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2293 in 32,64,128,256 and 512-bit */ 2294 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2295 in 32,64,128,256 and 512-bit */ 2296 8, 6, /* SSE->integer and integer->SSE moves */ 2297 /* End of register allocator costs. 
*/ 2298 }, 2299 2300 COSTS_N_INSNS (1), /* cost of an add instruction */ 2301 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2302 COSTS_N_INSNS (1), /* variable shift costs */ 2303 COSTS_N_INSNS (1), /* constant shift costs */ 2304 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2305 COSTS_N_INSNS (4), /* HI */ 2306 COSTS_N_INSNS (3), /* SI */ 2307 COSTS_N_INSNS (4), /* DI */ 2308 COSTS_N_INSNS (2)}, /* other */ 2309 0, /* cost of multiply per each bit set */ 2310 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2311 COSTS_N_INSNS (26), /* HI */ 2312 COSTS_N_INSNS (42), /* SI */ 2313 COSTS_N_INSNS (74), /* DI */ 2314 COSTS_N_INSNS (74)}, /* other */ 2315 COSTS_N_INSNS (1), /* cost of movsx */ 2316 COSTS_N_INSNS (1), /* cost of movzx */ 2317 8, /* "large" insn */ 2318 17, /* MOVE_RATIO */ 2319 6, /* CLEAR_RATIO */ 2320 {6, 6, 6}, /* cost of loading integer registers 2321 in QImode, HImode and SImode. 2322 Relative to reg-reg move (2). */ 2323 {6, 6, 6}, /* cost of storing integer registers */ 2324 {8, 8, 8, 16, 32}, /* cost of loading SSE register 2325 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2326 {8, 8, 8, 16, 32}, /* cost of storing SSE register 2327 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2328 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2329 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2330 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2331 8, /* cost of moving SSE register to integer. */ 2332 8, 8, /* Gather load static, per_elt. */ 2333 8, 8, /* Gather store static, per_elt. */ 2334 32, /* size of l1 cache. */ 2335 256, /* size of l2 cache. */ 2336 64, /* size of prefetch block */ 2337 6, /* number of parallel prefetches */ 2338 3, /* Branch cost */ 2339 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2340 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2341 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2342 COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 2343 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2344 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2345 2346 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2347 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 2348 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2349 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2350 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2351 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2352 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 2353 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 2354 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 2355 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 2356 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2357 atom_memcpy, 2358 atom_memset, 2359 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2360 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2361 "16", /* Loop alignment. */ 2362 "16:8:8", /* Jump alignment. */ 2363 "0:0:8", /* Label alignment. */ 2364 "16", /* Func alignment. */ 2365 }; 2366 /* slm_memcpy and slm_memset: string-operation strategy tables that slm_cost below refers to by name. Each has two entries; presumably one per target word size, as only the second entry uses rep_prefix_8_byte -- TODO confirm against the stringop_algs declaration. */ 2367 static stringop_algs slm_memcpy[2] = { 2368 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2369 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2370 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2371 static stringop_algs slm_memset[2] = { 2372 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2373 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2374 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2375 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; /* slm_cost: positional initializer for struct processor_costs (declared outside this chunk). The comment following each entry names the field it fills. */ 2376 static const 2377 struct processor_costs slm_cost = { 2378 { 2379 /* Start of register allocator costs. integer->integer move cost is 2. */ 2380 8, /* cost for loading QImode using movzbl */ 2381 {8, 8, 8}, /* cost of loading integer registers 2382 in QImode, HImode and SImode. 2383 Relative to reg-reg move (2). 
*/ 2384 {6, 6, 6}, /* cost of storing integer registers */ 2385 2, /* cost of reg,reg fld/fst */ 2386 {8, 8, 18}, /* cost of loading fp registers 2387 in SFmode, DFmode and XFmode */ 2388 {6, 6, 18}, /* cost of storing fp registers 2389 in SFmode, DFmode and XFmode */ 2390 2, /* cost of moving MMX register */ 2391 {8, 8}, /* cost of loading MMX registers 2392 in SImode and DImode */ 2393 {6, 6}, /* cost of storing MMX registers 2394 in SImode and DImode */ 2395 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2396 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2397 in 32,64,128,256 and 512-bit */ 2398 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2399 in 32,64,128,256 and 512-bit */ 2400 8, 6, /* SSE->integer and integer->SSE moves */ 2401 /* End of register allocator costs. */ 2402 }, 2403 2404 COSTS_N_INSNS (1), /* cost of an add instruction */ 2405 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2406 COSTS_N_INSNS (1), /* variable shift costs */ 2407 COSTS_N_INSNS (1), /* constant shift costs */ 2408 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2409 COSTS_N_INSNS (3), /* HI */ 2410 COSTS_N_INSNS (3), /* SI */ 2411 COSTS_N_INSNS (4), /* DI */ 2412 COSTS_N_INSNS (2)}, /* other */ 2413 0, /* cost of multiply per each bit set */ 2414 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2415 COSTS_N_INSNS (26), /* HI */ 2416 COSTS_N_INSNS (42), /* SI */ 2417 COSTS_N_INSNS (74), /* DI */ 2418 COSTS_N_INSNS (74)}, /* other */ 2419 COSTS_N_INSNS (1), /* cost of movsx */ 2420 COSTS_N_INSNS (1), /* cost of movzx */ 2421 8, /* "large" insn */ 2422 17, /* MOVE_RATIO */ 2423 6, /* CLEAR_RATIO */ 2424 {8, 8, 8}, /* cost of loading integer registers 2425 in QImode, HImode and SImode. 2426 Relative to reg-reg move (2). 
*/ 2427 {6, 6, 6}, /* cost of storing integer registers */ 2428 {8, 8, 8, 16, 32}, /* cost of loading SSE register 2429 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2430 {8, 8, 8, 16, 32}, /* cost of storing SSE register 2431 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2432 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2433 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2434 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2435 8, /* cost of moving SSE register to integer. */ 2436 8, 8, /* Gather load static, per_elt. */ 2437 8, 8, /* Gather store static, per_elt. */ 2438 32, /* size of l1 cache. */ 2439 256, /* size of l2 cache. */ 2440 64, /* size of prefetch block */ 2441 6, /* number of parallel prefetches */ 2442 3, /* Branch cost */ 2443 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2444 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2445 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2446 COSTS_N_INSNS (8), /* cost of FABS instruction. */ 2447 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2448 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2449 2450 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2451 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2452 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2453 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2454 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2455 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2456 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 2457 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */ 2458 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ 2459 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ 2460 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2461 slm_memcpy, 2462 slm_memset, 2463 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2464 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2465 "16", /* Loop alignment. */ 2466 "16:8:8", /* Jump alignment. */ 2467 "0:0:8", /* Label alignment. 
*/ 2468 "16", /* Func alignment. */ 2469 }; 2470 /* intel_memcpy and intel_memset: string-operation strategy tables that intel_cost below refers to by name. Each has two entries; presumably one per target word size, as only the second entry uses rep_prefix_8_byte -- TODO confirm against the stringop_algs declaration. */ 2471 static stringop_algs intel_memcpy[2] = { 2472 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2473 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2474 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2475 static stringop_algs intel_memset[2] = { 2476 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2477 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2478 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2479 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; /* intel_cost: positional initializer for struct processor_costs (declared outside this chunk). The comment following each entry names the field it fills. */ 2480 static const 2481 struct processor_costs intel_cost = { 2482 { 2483 /* Start of register allocator costs. integer->integer move cost is 2. */ 2484 6, /* cost for loading QImode using movzbl */ 2485 {4, 4, 4}, /* cost of loading integer registers 2486 in QImode, HImode and SImode. 2487 Relative to reg-reg move (2). */ 2488 {6, 6, 6}, /* cost of storing integer registers */ 2489 2, /* cost of reg,reg fld/fst */ 2490 {6, 6, 8}, /* cost of loading fp registers 2491 in SFmode, DFmode and XFmode */ 2492 {6, 6, 10}, /* cost of storing fp registers 2493 in SFmode, DFmode and XFmode */ 2494 2, /* cost of moving MMX register */ 2495 {6, 6}, /* cost of loading MMX registers 2496 in SImode and DImode */ 2497 {6, 6}, /* cost of storing MMX registers 2498 in SImode and DImode */ 2499 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 2500 {6, 6, 6, 6, 6}, /* cost of loading SSE registers 2501 in 32,64,128,256 and 512-bit */ 2502 {6, 6, 6, 6, 6}, /* cost of storing SSE registers 2503 in 32,64,128,256 and 512-bit */ 2504 4, 4, /* SSE->integer and integer->SSE moves */ 2505 /* End of register allocator costs. 
*/ 2506 }, 2507 2508 COSTS_N_INSNS (1), /* cost of an add instruction */ 2509 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2510 COSTS_N_INSNS (1), /* variable shift costs */ 2511 COSTS_N_INSNS (1), /* constant shift costs */ 2512 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2513 COSTS_N_INSNS (3), /* HI */ 2514 COSTS_N_INSNS (3), /* SI */ 2515 COSTS_N_INSNS (4), /* DI */ 2516 COSTS_N_INSNS (2)}, /* other */ 2517 0, /* cost of multiply per each bit set */ 2518 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2519 COSTS_N_INSNS (26), /* HI */ 2520 COSTS_N_INSNS (42), /* SI */ 2521 COSTS_N_INSNS (74), /* DI */ 2522 COSTS_N_INSNS (74)}, /* other */ 2523 COSTS_N_INSNS (1), /* cost of movsx */ 2524 COSTS_N_INSNS (1), /* cost of movzx */ 2525 8, /* "large" insn */ 2526 17, /* MOVE_RATIO */ 2527 6, /* CLEAR_RATIO */ 2528 {4, 4, 4}, /* cost of loading integer registers 2529 in QImode, HImode and SImode. 2530 Relative to reg-reg move (2). */ 2531 {6, 6, 6}, /* cost of storing integer registers */ 2532 {6, 6, 6, 6, 6}, /* cost of loading SSE register 2533 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2534 {6, 6, 6, 6, 6}, /* cost of storing SSE register 2535 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2536 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2537 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ 2538 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 2539 4, /* cost of moving SSE register to integer. */ 2540 6, 6, /* Gather load static, per_elt. */ 2541 6, 6, /* Gather store static, per_elt. */ 2542 32, /* size of l1 cache. */ 2543 256, /* size of l2 cache. */ 2544 64, /* size of prefetch block */ 2545 6, /* number of parallel prefetches */ 2546 3, /* Branch cost */ 2547 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2548 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2549 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2550 COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 2551 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2552 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2553 2554 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2555 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 2556 COSTS_N_INSNS (8), /* cost of MULSS instruction. */ 2557 COSTS_N_INSNS (8), /* cost of MULSD instruction. */ 2558 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2559 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2560 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ 2561 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 2562 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ 2563 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ 2564 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2565 intel_memcpy, 2566 intel_memset, 2567 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2568 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2569 "16", /* Loop alignment. */ 2570 "16:8:8", /* Jump alignment. */ 2571 "0:0:8", /* Label alignment. */ 2572 "16", /* Func alignment. */ 2573 }; 2574 2575 /* Generic should produce code tuned for Core-i7 (and newer chips) 2576 and btver1 (and newer chips). */ 2577 /* generic_memcpy and generic_memset: string-operation strategy tables that generic_cost below refers to by name. Each has two entries; presumably one per target word size, as only the second entry uses rep_prefix_8_byte -- TODO confirm against the stringop_algs declaration. */ 2578 static stringop_algs generic_memcpy[2] = { 2579 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2580 {-1, libcall, false}}}, 2581 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2582 {-1, libcall, false}}}}; 2583 static stringop_algs generic_memset[2] = { 2584 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2585 {-1, libcall, false}}}, 2586 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2587 {-1, libcall, false}}}}; /* generic_cost: positional initializer for struct processor_costs (declared outside this chunk). The comment following each entry names the field it fills. */ 2588 static const 2589 struct processor_costs generic_cost = { 2590 { 2591 /* Start of register allocator costs. integer->integer move cost is 2. */ 2592 6, /* cost for loading QImode using movzbl */ 2593 {6, 6, 6}, /* cost of loading integer registers 2594 in QImode, HImode and SImode. 2595 Relative to reg-reg move (2). 
*/ 2596 {6, 6, 6}, /* cost of storing integer registers */ 2597 4, /* cost of reg,reg fld/fst */ 2598 {6, 6, 12}, /* cost of loading fp registers 2599 in SFmode, DFmode and XFmode */ 2600 {6, 6, 12}, /* cost of storing fp registers 2601 in SFmode, DFmode and XFmode */ 2602 2, /* cost of moving MMX register */ 2603 {6, 6}, /* cost of loading MMX registers 2604 in SImode and DImode */ 2605 {6, 6}, /* cost of storing MMX registers 2606 in SImode and DImode */ 2607 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 2608 {6, 6, 6, 10, 15}, /* cost of loading SSE registers 2609 in 32,64,128,256 and 512-bit */ 2610 {6, 6, 6, 10, 15}, /* cost of storing SSE registers 2611 in 32,64,128,256 and 512-bit */ 2612 6, 6, /* SSE->integer and integer->SSE moves */ 2613 /* End of register allocator costs. */ 2614 }, 2615 2616 COSTS_N_INSNS (1), /* cost of an add instruction */ 2617 /* Setting cost to 2 makes our current implementation of synth_mult result in 2618 use of unnecessary temporary registers causing regression on several 2619 SPECfp benchmarks. */ 2620 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2621 COSTS_N_INSNS (1), /* variable shift costs */ 2622 COSTS_N_INSNS (1), /* constant shift costs */ 2623 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2624 COSTS_N_INSNS (4), /* HI */ 2625 COSTS_N_INSNS (3), /* SI */ 2626 COSTS_N_INSNS (4), /* DI */ 2627 COSTS_N_INSNS (4)}, /* other */ 2628 0, /* cost of multiply per each bit set */ 2629 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ 2630 COSTS_N_INSNS (22), /* HI */ 2631 COSTS_N_INSNS (30), /* SI */ 2632 COSTS_N_INSNS (74), /* DI */ 2633 COSTS_N_INSNS (74)}, /* other */ 2634 COSTS_N_INSNS (1), /* cost of movsx */ 2635 COSTS_N_INSNS (1), /* cost of movzx */ 2636 8, /* "large" insn */ 2637 17, /* MOVE_RATIO */ 2638 6, /* CLEAR_RATIO */ 2639 {6, 6, 6}, /* cost of loading integer registers 2640 in QImode, HImode and SImode. 2641 Relative to reg-reg move (2). 
*/ 2642 {6, 6, 6}, /* cost of storing integer registers */ 2643 {6, 6, 6, 10, 15}, /* cost of loading SSE register 2644 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2645 {6, 6, 6, 10, 15}, /* cost of storing SSE register 2646 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2647 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ 2648 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ 2649 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 2650 6, /* cost of moving SSE register to integer. */ 2651 18, 6, /* Gather load static, per_elt. */ 2652 18, 6, /* Gather store static, per_elt. */ 2653 32, /* size of l1 cache. */ 2654 512, /* size of l2 cache. */ 2655 64, /* size of prefetch block */ 2656 6, /* number of parallel prefetches */ 2657 /* Benchmarks show large regressions on K8 sixtrack benchmark when this 2658 value is increased to perhaps more appropriate value of 5. */ 2659 3, /* Branch cost */ 2660 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2661 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2662 COSTS_N_INSNS (17), /* cost of FDIV instruction. */ 2663 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2664 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2665 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ 2666 2667 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2668 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2669 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2670 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2671 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2672 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2673 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2674 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 2675 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 2676 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 2677 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ 2678 generic_memcpy, 2679 generic_memset, 2680 COSTS_N_INSNS (4), /* cond_taken_branch_cost. 
*/ 2681 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 2682 "16:11:8", /* Loop alignment. */ 2683 "16:11:8", /* Jump alignment. */ 2684 "0:0:8", /* Label alignment. */ 2685 "16", /* Func alignment. */ 2686 }; 2687 2688 /* core_cost should produce code tuned for Core family of CPUs. */ /* core_memcpy and core_memset: string-operation strategy tables that core_cost below refers to by name. Each has two entries; presumably one per target word size, as only the second entry uses rep_prefix_8_byte -- TODO confirm against the stringop_algs declaration. */ 2689 static stringop_algs core_memcpy[2] = { 2690 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 2691 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, 2692 {-1, libcall, false}}}}; 2693 static stringop_algs core_memset[2] = { 2694 {libcall, {{6, loop_1_byte, true}, 2695 {24, loop, true}, 2696 {8192, rep_prefix_4_byte, true}, 2697 {-1, libcall, false}}}, 2698 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, 2699 {-1, libcall, false}}}}; 2700 /* core_cost: positional initializer for struct processor_costs (declared outside this chunk). The comment following each entry names the field it fills. */ 2701 static const 2702 struct processor_costs core_cost = { 2703 { 2704 /* Start of register allocator costs. integer->integer move cost is 2. */ 2705 6, /* cost for loading QImode using movzbl */ 2706 {4, 4, 4}, /* cost of loading integer registers 2707 in QImode, HImode and SImode. 2708 Relative to reg-reg move (2). */ 2709 {6, 6, 6}, /* cost of storing integer registers */ 2710 2, /* cost of reg,reg fld/fst */ 2711 {6, 6, 8}, /* cost of loading fp registers 2712 in SFmode, DFmode and XFmode */ 2713 {6, 6, 10}, /* cost of storing fp registers 2714 in SFmode, DFmode and XFmode */ 2715 2, /* cost of moving MMX register */ 2716 {6, 6}, /* cost of loading MMX registers 2717 in SImode and DImode */ 2718 {6, 6}, /* cost of storing MMX registers 2719 in SImode and DImode */ 2720 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2721 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 2722 in 32,64,128,256 and 512-bit */ 2723 {6, 6, 6, 6, 12}, /* cost of storing SSE registers 2724 in 32,64,128,256 and 512-bit */ 2725 6, 6, /* SSE->integer and integer->SSE moves */ 2726 /* End of register allocator costs. 
*/ 2727 }, 2728 2729 COSTS_N_INSNS (1), /* cost of an add instruction */ 2730 /* On all chips taken into consideration lea is 2 cycles and more. With 2731 this cost however our current implementation of synth_mult results in 2732 use of unnecessary temporary registers causing regression on several 2733 SPECfp benchmarks. */ 2734 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2735 COSTS_N_INSNS (1), /* variable shift costs */ 2736 COSTS_N_INSNS (1), /* constant shift costs */ 2737 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2738 COSTS_N_INSNS (4), /* HI */ 2739 COSTS_N_INSNS (3), /* SI */ 2740 /* Here we tune for Sandybridge or newer. */ 2741 COSTS_N_INSNS (3), /* DI */ 2742 COSTS_N_INSNS (3)}, /* other */ 2743 0, /* cost of multiply per each bit set */ 2744 /* Expanding div/mod currently doesn't consider parallelism. So the cost 2745 model is not realistic. We compensate by increasing the latencies a bit. */ 2746 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 2747 COSTS_N_INSNS (11), /* HI */ 2748 COSTS_N_INSNS (14), /* SI */ 2749 COSTS_N_INSNS (81), /* DI */ 2750 COSTS_N_INSNS (81)}, /* other */ 2751 COSTS_N_INSNS (1), /* cost of movsx */ 2752 COSTS_N_INSNS (1), /* cost of movzx */ 2753 8, /* "large" insn */ 2754 17, /* MOVE_RATIO */ 2755 6, /* CLEAR_RATIO */ 2756 {4, 4, 4}, /* cost of loading integer registers 2757 in QImode, HImode and SImode. 2758 Relative to reg-reg move (2). */ 2759 {6, 6, 6}, /* cost of storing integer registers */ 2760 {6, 6, 6, 6, 12}, /* cost of loading SSE register 2761 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2762 {6, 6, 6, 6, 12}, /* cost of storing SSE register 2763 in 32bit, 64bit, 128bit, 256bit and 512bit */ 2764 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 2765 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 2766 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2767 2, /* cost of moving SSE register to integer. 
*/ 2768 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, 2769 rec. throughput 6. 2770 So 5 uops statically and one uop per load. NOTE(review): VGATHERDPD is named twice with different figures; one occurrence presumably refers to a different gather form -- confirm. */ 2771 10, 6, /* Gather load static, per_elt. */ 2772 10, 6, /* Gather store static, per_elt. */ 2773 64, /* size of l1 cache. */ 2774 512, /* size of l2 cache. */ 2775 64, /* size of prefetch block */ 2776 6, /* number of parallel prefetches */ 2777 /* FIXME perhaps more appropriate value is 5. */ 2778 3, /* Branch cost */ 2779 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2780 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2781 /* 10-24 */ 2782 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 2783 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2784 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2785 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ 2786 2787 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2788 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2789 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2790 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2791 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2792 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2793 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 2794 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ 2795 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ 2796 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ 2797 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2798 core_memcpy, 2799 core_memset, 2800 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2801 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2802 "16:11:8", /* Loop alignment. */ 2803 "16:11:8", /* Jump alignment. */ 2804 "0:0:8", /* Label alignment. */ 2805 "16", /* Func alignment. */ 2806 }; 2807 2808