1 /* Costs of operations of individual x86 CPUs. 2 Copyright (C) 1988-2019 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GCC is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 Under Section 7 of GPL version 3, you are granted additional 17 permissions described in the GCC Runtime Library Exception, version 18 3.1, as published by the Free Software Foundation. 19 20 You should have received a copy of the GNU General Public License and 21 a copy of the GCC Runtime Library Exception along with this program; 22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 <http://www.gnu.org/licenses/>. */ 24 /* Processor costs (relative to an add) */ 25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. 
*/ 26 #define COSTS_N_BYTES(N) ((N) * 2) 27 28 /* Filler stringop_algs initializer: libcall as the leading algorithm and in its single {-1, libcall, false} entry. The per-CPU tables below use it as their second array element. */ #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} 29 30 /* String-operation tables referenced by ix86_size_cost; both elements of each array select rep_prefix_1_byte throughout. */ static stringop_algs ix86_size_memcpy[2] = { 31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 33 static stringop_algs ix86_size_memset[2] = { 34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 36 37 const 38 struct processor_costs ix86_size_cost = {/* costs for tuning for size; unlike the per-CPU tables, most entries are written with COSTS_N_BYTES */ 39 COSTS_N_BYTES (2), /* cost of an add instruction */ 40 COSTS_N_BYTES (3), /* cost of a lea instruction */ 41 COSTS_N_BYTES (2), /* variable shift costs */ 42 COSTS_N_BYTES (3), /* constant shift costs */ 43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ 44 COSTS_N_BYTES (3), /* HI */ 45 COSTS_N_BYTES (3), /* SI */ 46 COSTS_N_BYTES (3), /* DI */ 47 COSTS_N_BYTES (5)}, /* other */ 48 0, /* cost of multiply per each bit set */ 49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ 50 COSTS_N_BYTES (3), /* HI */ 51 COSTS_N_BYTES (3), /* SI */ 52 COSTS_N_BYTES (3), /* DI */ 53 COSTS_N_BYTES (5)}, /* other */ 54 COSTS_N_BYTES (3), /* cost of movsx */ 55 COSTS_N_BYTES (3), /* cost of movzx */ 56 0, /* "large" insn */ 57 2, /* MOVE_RATIO */ 58 59 /* All move costs are relative to integer->integer move times 2. */ 60 2, /* cost for loading QImode using movzbl */ 61 {2, 2, 2}, /* cost of loading integer registers 62 in QImode, HImode and SImode. 63 Relative to reg-reg move (2). 
*/ 64 {2, 2, 2}, /* cost of storing integer registers */ 65 2, /* cost of reg,reg fld/fst */ 66 {2, 2, 2}, /* cost of loading fp registers 67 in SFmode, DFmode and XFmode */ 68 {2, 2, 2}, /* cost of storing fp registers 69 in SFmode, DFmode and XFmode */ 70 3, /* cost of moving MMX register */ 71 {3, 3}, /* cost of loading MMX registers 72 in SImode and DImode */ 73 {3, 3}, /* cost of storing MMX registers 74 in SImode and DImode */ 75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers 77 in 32,64,128,256 and 512-bit */ 78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load 79 in 32,64,128,256 and 512-bit (five entries, as for the aligned table) */ 80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers 81 in 32,64,128,256 and 512-bit */ 82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store 83 in 32,64,128,256 and 512-bit (five entries, as for the aligned table) */ 84 3, 3, /* SSE->integer and integer->SSE moves */ 85 5, 0, /* Gather load static, per_elt. */ 86 5, 0, /* Gather store static, per_elt. */ 87 0, /* size of l1 cache */ 88 0, /* size of l2 cache */ 89 0, /* size of prefetch block */ 90 0, /* number of parallel prefetches */ 91 2, /* Branch cost */ 92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ 93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */ 94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */ 95 COSTS_N_BYTES (2), /* cost of FABS instruction. */ 96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */ 97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ 98 99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ 100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */ 102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */ 103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ 104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ 105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ 106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ 107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. 
*/ 108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ 109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 110 ix86_size_memcpy, 111 ix86_size_memset, /* the two stringop_algs tables defined before this initializer */ 112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ 113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ 114 NULL, /* Loop alignment. */ 115 NULL, /* Jump alignment. */ 116 NULL, /* Label alignment. */ 117 NULL, /* Func alignment. */ 118 }; 119 120 /* Processor costs (relative to an add) */ /* 386 string-operation tables: element 0 selects rep_prefix_1_byte, element 1 is the DUMMY_STRINGOP_ALGS filler. */ 121 static stringop_algs i386_memcpy[2] = { 122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 123 DUMMY_STRINGOP_ALGS}; 124 static stringop_algs i386_memset[2] = { 125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 126 DUMMY_STRINGOP_ALGS}; 127 128 static const 129 struct processor_costs i386_cost = { /* 386 specific costs */ 130 COSTS_N_INSNS (1), /* cost of an add instruction */ 131 COSTS_N_INSNS (1), /* cost of a lea instruction */ 132 COSTS_N_INSNS (3), /* variable shift costs */ 133 COSTS_N_INSNS (2), /* constant shift costs */ 134 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ 135 COSTS_N_INSNS (6), /* HI */ 136 COSTS_N_INSNS (6), /* SI */ 137 COSTS_N_INSNS (6), /* DI */ 138 COSTS_N_INSNS (6)}, /* other */ 139 COSTS_N_INSNS (1), /* cost of multiply per each bit set */ 140 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ 141 COSTS_N_INSNS (23), /* HI */ 142 COSTS_N_INSNS (23), /* SI */ 143 COSTS_N_INSNS (23), /* DI */ 144 COSTS_N_INSNS (23)}, /* other */ 145 COSTS_N_INSNS (3), /* cost of movsx */ 146 COSTS_N_INSNS (2), /* cost of movzx */ 147 15, /* "large" insn */ 148 3, /* MOVE_RATIO */ 149 150 /* All move costs are relative to integer->integer move times 2 and thus 151 they are latency*2. */ 152 4, /* cost for loading QImode using movzbl */ 153 {2, 4, 2}, /* cost of loading integer registers 154 in QImode, HImode and SImode. 155 Relative to reg-reg move (2). 
*/ 156 {2, 4, 2}, /* cost of storing integer registers */ 157 2, /* cost of reg,reg fld/fst */ 158 {8, 8, 8}, /* cost of loading fp registers 159 in SFmode, DFmode and XFmode */ 160 {8, 8, 8}, /* cost of storing fp registers 161 in SFmode, DFmode and XFmode */ 162 2, /* cost of moving MMX register */ 163 {4, 8}, /* cost of loading MMX registers 164 in SImode and DImode */ 165 {4, 8}, /* cost of storing MMX registers 166 in SImode and DImode */ 167 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 168 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 169 in 32,64,128,256 and 512-bit */ 170 {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 171 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 172 in 32,64,128,256 and 512-bit */ 173 {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 174 3, 3, /* SSE->integer and integer->SSE moves */ 175 4, 4, /* Gather load static, per_elt. */ 176 4, 4, /* Gather store static, per_elt. */ 177 0, /* size of l1 cache */ 178 0, /* size of l2 cache */ 179 0, /* size of prefetch block */ 180 0, /* number of parallel prefetches */ 181 1, /* Branch cost */ 182 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ 183 COSTS_N_INSNS (27), /* cost of FMUL instruction. */ 184 COSTS_N_INSNS (88), /* cost of FDIV instruction. */ 185 COSTS_N_INSNS (22), /* cost of FABS instruction. */ 186 COSTS_N_INSNS (24), /* cost of FCHS instruction. */ 187 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ 188 189 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 190 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ 191 COSTS_N_INSNS (27), /* cost of MULSS instruction. */ 192 COSTS_N_INSNS (27), /* cost of MULSD instruction. */ 193 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ 194 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ 195 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ 196 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ 197 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. 
*/ 198 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ 199 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 200 i386_memcpy, 201 i386_memset, /* the two stringop_algs tables defined before this initializer */ 202 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 203 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 204 "4", /* Loop alignment. */ 205 "4", /* Jump alignment. */ 206 NULL, /* Label alignment. */ 207 "4", /* Func alignment. */ 208 }; 209 210 /* 486 string-operation tables: element 0 selects rep_prefix_4_byte, element 1 is the DUMMY_STRINGOP_ALGS filler. */ static stringop_algs i486_memcpy[2] = { 211 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 212 DUMMY_STRINGOP_ALGS}; 213 static stringop_algs i486_memset[2] = { 214 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 215 DUMMY_STRINGOP_ALGS}; 216 217 static const 218 struct processor_costs i486_cost = { /* 486 specific costs */ 219 COSTS_N_INSNS (1), /* cost of an add instruction */ 220 COSTS_N_INSNS (1), /* cost of a lea instruction */ 221 COSTS_N_INSNS (3), /* variable shift costs */ 222 COSTS_N_INSNS (2), /* constant shift costs */ 223 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ 224 COSTS_N_INSNS (12), /* HI */ 225 COSTS_N_INSNS (12), /* SI */ 226 COSTS_N_INSNS (12), /* DI */ 227 COSTS_N_INSNS (12)}, /* other */ 228 1, /* cost of multiply per each bit set. NOTE(review): a bare 1 here, whereas i386_cost wraps this field in COSTS_N_INSNS -- confirm this is intended. */ 229 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ 230 COSTS_N_INSNS (40), /* HI */ 231 COSTS_N_INSNS (40), /* SI */ 232 COSTS_N_INSNS (40), /* DI */ 233 COSTS_N_INSNS (40)}, /* other */ 234 COSTS_N_INSNS (3), /* cost of movsx */ 235 COSTS_N_INSNS (2), /* cost of movzx */ 236 15, /* "large" insn */ 237 3, /* MOVE_RATIO */ 238 239 /* All move costs are relative to integer->integer move times 2 and thus 240 they are latency*2. */ 241 4, /* cost for loading QImode using movzbl */ 242 {2, 4, 2}, /* cost of loading integer registers 243 in QImode, HImode and SImode. 244 Relative to reg-reg move (2). 
*/ 245 {2, 4, 2}, /* cost of storing integer registers */ 246 2, /* cost of reg,reg fld/fst */ 247 {8, 8, 8}, /* cost of loading fp registers 248 in SFmode, DFmode and XFmode */ 249 {8, 8, 8}, /* cost of storing fp registers 250 in SFmode, DFmode and XFmode */ 251 2, /* cost of moving MMX register */ 252 {4, 8}, /* cost of loading MMX registers 253 in SImode and DImode */ 254 {4, 8}, /* cost of storing MMX registers 255 in SImode and DImode */ 256 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 257 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 258 in 32,64,128,256 and 512-bit */ 259 {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 260 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 261 in 32,64,128,256 and 512-bit */ 262 {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 263 3, 3, /* SSE->integer and integer->SSE moves */ 264 4, 4, /* Gather load static, per_elt. */ 265 4, 4, /* Gather store static, per_elt. */ 266 4, /* size of l1 cache. 486 has 8kB cache 267 shared for code and data, so 4kB is 268 not really precise. */ 269 4, /* size of l2 cache */ 270 0, /* size of prefetch block */ 271 0, /* number of parallel prefetches */ 272 1, /* Branch cost */ 273 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 274 COSTS_N_INSNS (16), /* cost of FMUL instruction. */ 275 COSTS_N_INSNS (73), /* cost of FDIV instruction. */ 276 COSTS_N_INSNS (3), /* cost of FABS instruction. */ 277 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 278 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ 279 280 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 281 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 282 COSTS_N_INSNS (16), /* cost of MULSS instruction. */ 283 COSTS_N_INSNS (16), /* cost of MULSD instruction. */ 284 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ 285 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ 286 COSTS_N_INSNS (73), /* cost of DIVSS instruction. NOTE(review): the next entry (DIVSD) is 74 although FDIV and DIVSS are both 73 -- confirm this is intended. */ 287 COSTS_N_INSNS (74), /* cost of DIVSD instruction. 
*/ 288 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ 289 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ 290 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 291 i486_memcpy, 292 i486_memset, /* the two stringop_algs tables defined before this initializer */ 293 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 294 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 295 "16", /* Loop alignment. */ 296 "16", /* Jump alignment. */ 297 "0:0:8", /* Label alignment. */ 298 "16", /* Func alignment. */ 299 }; 300 301 /* Pentium string-operation tables (also named by lakemont_cost further down); element 1 of each is the DUMMY_STRINGOP_ALGS filler. */ static stringop_algs pentium_memcpy[2] = { 302 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 303 DUMMY_STRINGOP_ALGS}; 304 static stringop_algs pentium_memset[2] = { 305 {libcall, {{-1, rep_prefix_4_byte, false}}}, 306 DUMMY_STRINGOP_ALGS}; 307 308 static const 309 struct processor_costs pentium_cost = { /* Pentium specific costs */ 310 COSTS_N_INSNS (1), /* cost of an add instruction */ 311 COSTS_N_INSNS (1), /* cost of a lea instruction */ 312 COSTS_N_INSNS (4), /* variable shift costs */ 313 COSTS_N_INSNS (1), /* constant shift costs */ 314 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 315 COSTS_N_INSNS (11), /* HI */ 316 COSTS_N_INSNS (11), /* SI */ 317 COSTS_N_INSNS (11), /* DI */ 318 COSTS_N_INSNS (11)}, /* other */ 319 0, /* cost of multiply per each bit set */ 320 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 321 COSTS_N_INSNS (25), /* HI */ 322 COSTS_N_INSNS (25), /* SI */ 323 COSTS_N_INSNS (25), /* DI */ 324 COSTS_N_INSNS (25)}, /* other */ 325 COSTS_N_INSNS (3), /* cost of movsx */ 326 COSTS_N_INSNS (2), /* cost of movzx */ 327 8, /* "large" insn */ 328 6, /* MOVE_RATIO */ 329 330 /* All move costs are relative to integer->integer move times 2 and thus 331 they are latency*2. */ 332 6, /* cost for loading QImode using movzbl */ 333 {2, 4, 2}, /* cost of loading integer registers 334 in QImode, HImode and SImode. 335 Relative to reg-reg move (2). 
*/ 336 {2, 4, 2}, /* cost of storing integer registers */ 337 2, /* cost of reg,reg fld/fst */ 338 {2, 2, 6}, /* cost of loading fp registers 339 in SFmode, DFmode and XFmode */ 340 {4, 4, 6}, /* cost of storing fp registers 341 in SFmode, DFmode and XFmode */ 342 8, /* cost of moving MMX register */ 343 {8, 8}, /* cost of loading MMX registers 344 in SImode and DImode */ 345 {8, 8}, /* cost of storing MMX registers 346 in SImode and DImode */ 347 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 348 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 349 in 32,64,128,256 and 512-bit */ 350 {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 351 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 352 in 32,64,128,256 and 512-bit */ 353 {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 354 3, 3, /* SSE->integer and integer->SSE moves */ 355 4, 4, /* Gather load static, per_elt. */ 356 4, 4, /* Gather store static, per_elt. */ 357 8, /* size of l1 cache. */ 358 8, /* size of l2 cache */ 359 0, /* size of prefetch block */ 360 0, /* number of parallel prefetches */ 361 2, /* Branch cost */ 362 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 363 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 364 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 365 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 366 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 367 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 368 369 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 370 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 371 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 372 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 373 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 374 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 375 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 376 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ 377 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. 
*/ 378 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ 379 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 380 pentium_memcpy, 381 pentium_memset, /* the two stringop_algs tables defined before this initializer */ 382 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 383 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 384 "16:8:8", /* Loop alignment. */ 385 "16:8:8", /* Jump alignment. */ 386 "0:0:8", /* Label alignment. */ 387 "16", /* Func alignment. */ 388 }; 389 390 /* Lakemont costs. This table has no stringop_algs arrays of its own; further down it names pentium_memcpy and pentium_memset. */ static const 391 struct processor_costs lakemont_cost = { 392 COSTS_N_INSNS (1), /* cost of an add instruction */ 393 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; one unit above the add cost on the line before */ 394 COSTS_N_INSNS (1), /* variable shift costs */ 395 COSTS_N_INSNS (1), /* constant shift costs */ 396 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 397 COSTS_N_INSNS (11), /* HI */ 398 COSTS_N_INSNS (11), /* SI */ 399 COSTS_N_INSNS (11), /* DI */ 400 COSTS_N_INSNS (11)}, /* other */ 401 0, /* cost of multiply per each bit set */ 402 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 403 COSTS_N_INSNS (25), /* HI */ 404 COSTS_N_INSNS (25), /* SI */ 405 COSTS_N_INSNS (25), /* DI */ 406 COSTS_N_INSNS (25)}, /* other */ 407 COSTS_N_INSNS (3), /* cost of movsx */ 408 COSTS_N_INSNS (2), /* cost of movzx */ 409 8, /* "large" insn */ 410 17, /* MOVE_RATIO */ 411 412 /* All move costs are relative to integer->integer move times 2 and thus 413 they are latency*2. */ 414 6, /* cost for loading QImode using movzbl */ 415 {2, 4, 2}, /* cost of loading integer registers 416 in QImode, HImode and SImode. 417 Relative to reg-reg move (2). 
*/ 418 {2, 4, 2}, /* cost of storing integer registers */ 419 2, /* cost of reg,reg fld/fst */ 420 {2, 2, 6}, /* cost of loading fp registers 421 in SFmode, DFmode and XFmode */ 422 {4, 4, 6}, /* cost of storing fp registers 423 in SFmode, DFmode and XFmode */ 424 8, /* cost of moving MMX register */ 425 {8, 8}, /* cost of loading MMX registers 426 in SImode and DImode */ 427 {8, 8}, /* cost of storing MMX registers 428 in SImode and DImode */ 429 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 430 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 431 in 32,64,128,256 and 512-bit */ 432 {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 433 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 434 in 32,64,128,256 and 512-bit */ 435 {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 436 3, 3, /* SSE->integer and integer->SSE moves */ 437 4, 4, /* Gather load static, per_elt. */ 438 4, 4, /* Gather store static, per_elt. */ 439 8, /* size of l1 cache. */ 440 8, /* size of l2 cache */ 441 0, /* size of prefetch block */ 442 0, /* number of parallel prefetches */ 443 2, /* Branch cost */ 444 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 445 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 446 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 447 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 448 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 449 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 450 451 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 452 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 453 COSTS_N_INSNS (5), /* cost of MULSS instruction. */ 454 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 455 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ 456 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ 457 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 458 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 459 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. 
*/ 460 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 461 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 462 pentium_memcpy, 463 pentium_memset, /* the same stringop_algs tables that pentium_cost names */ 464 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 465 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 466 "16:8:8", /* Loop alignment. */ 467 "16:8:8", /* Jump alignment. */ 468 "0:0:8", /* Label alignment. */ 469 "16", /* Func alignment. */ 470 }; 471 472 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes 473 (we ensure the alignment). For small blocks an inline loop is still a 474 noticeable win; for bigger blocks either rep movsl or rep movsb is the 475 way to go. Rep movsb apparently has a more expensive startup time in the CPU, 476 but after 4K the difference is down in the noise. */ 477 static stringop_algs pentiumpro_memcpy[2] = { 478 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, 479 {8192, rep_prefix_4_byte, false}, 480 {-1, rep_prefix_1_byte, false}}}, 481 DUMMY_STRINGOP_ALGS}; 482 static stringop_algs pentiumpro_memset[2] = { 483 {rep_prefix_4_byte, {{1024, unrolled_loop, false}, 484 {8192, rep_prefix_4_byte, false}, 485 {-1, libcall, false}}}, 486 DUMMY_STRINGOP_ALGS}; 487 static const 488 struct processor_costs pentiumpro_cost = { /* PentiumPro specific costs */ 489 COSTS_N_INSNS (1), /* cost of an add instruction */ 490 COSTS_N_INSNS (1), /* cost of a lea instruction */ 491 COSTS_N_INSNS (1), /* variable shift costs */ 492 COSTS_N_INSNS (1), /* constant shift costs */ 493 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 494 COSTS_N_INSNS (4), /* HI */ 495 COSTS_N_INSNS (4), /* SI */ 496 COSTS_N_INSNS (4), /* DI */ 497 COSTS_N_INSNS (4)}, /* other */ 498 0, /* cost of multiply per each bit set */ 499 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ 500 COSTS_N_INSNS (17), /* HI */ 501 COSTS_N_INSNS (17), /* SI */ 502 COSTS_N_INSNS (17), /* DI */ 503 COSTS_N_INSNS (17)}, /* other */ 504 COSTS_N_INSNS (1), /* cost of movsx */ 505 COSTS_N_INSNS (1), /* cost of movzx */ 506 
8, /* "large" insn */ 507 6, /* MOVE_RATIO */ 508 509 /* All move costs are relative to integer->integer move times 2 and thus 510 they are latency*2. */ 511 2, /* cost for loading QImode using movzbl */ 512 {4, 4, 4}, /* cost of loading integer registers 513 in QImode, HImode and SImode. 514 Relative to reg-reg move (2). */ 515 {2, 2, 2}, /* cost of storing integer registers */ 516 2, /* cost of reg,reg fld/fst */ 517 {2, 2, 6}, /* cost of loading fp registers 518 in SFmode, DFmode and XFmode */ 519 {4, 4, 6}, /* cost of storing fp registers 520 in SFmode, DFmode and XFmode */ 521 2, /* cost of moving MMX register */ 522 {2, 2}, /* cost of loading MMX registers 523 in SImode and DImode */ 524 {2, 2}, /* cost of storing MMX registers 525 in SImode and DImode */ 526 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 527 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 528 in 32,64,128,256 and 512-bit */ 529 {4, 8, 16, 32, 64}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 530 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 531 in 32,64,128,256 and 512-bit */ 532 {4, 8, 16, 32, 64}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 533 3, 3, /* SSE->integer and integer->SSE moves */ 534 4, 4, /* Gather load static, per_elt. */ 535 4, 4, /* Gather store static, per_elt. */ 536 8, /* size of l1 cache. */ 537 256, /* size of l2 cache */ 538 32, /* size of prefetch block */ 539 6, /* number of parallel prefetches */ 540 2, /* Branch cost */ 541 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 542 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 543 COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 544 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 545 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 546 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 547 548 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 549 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 550 COSTS_N_INSNS (4), /* cost of MULSS instruction. 
*/ 551 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 552 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 553 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 554 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 555 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ 556 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 557 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ 558 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 559 pentiumpro_memcpy, 560 pentiumpro_memset, /* the two stringop_algs tables defined before this initializer */ 561 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 562 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 563 "16", /* Loop alignment. */ 564 "16:11:8", /* Jump alignment. */ 565 "0:0:8", /* Label alignment. */ 566 "16", /* Func alignment. */ 567 }; 568 569 /* Geode string-operation tables; memcpy and memset use identical initializers, and element 1 of each is the DUMMY_STRINGOP_ALGS filler. */ static stringop_algs geode_memcpy[2] = { 570 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 571 DUMMY_STRINGOP_ALGS}; 572 static stringop_algs geode_memset[2] = { 573 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 574 DUMMY_STRINGOP_ALGS}; 575 static const 576 struct processor_costs geode_cost = { /* Geode specific costs */ 577 COSTS_N_INSNS (1), /* cost of an add instruction */ 578 COSTS_N_INSNS (1), /* cost of a lea instruction */ 579 COSTS_N_INSNS (2), /* variable shift costs */ 580 COSTS_N_INSNS (1), /* constant shift costs */ 581 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 582 COSTS_N_INSNS (4), /* HI */ 583 COSTS_N_INSNS (7), /* SI */ 584 COSTS_N_INSNS (7), /* DI */ 585 COSTS_N_INSNS (7)}, /* other */ 586 0, /* cost of multiply per each bit set */ 587 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ 588 COSTS_N_INSNS (23), /* HI */ 589 COSTS_N_INSNS (39), /* SI */ 590 COSTS_N_INSNS (39), /* DI */ 591 COSTS_N_INSNS (39)}, /* other */ 592 COSTS_N_INSNS (1), /* cost of movsx */ 593 COSTS_N_INSNS (1), /* cost of movzx */ 594 8, /* "large" insn */ 595 4, /* MOVE_RATIO */ 596 597 /* All move costs are relative to integer->integer move times 2 and thus 598 they are latency*2. 
*/ 599 2, /* cost for loading QImode using movzbl */ 600 {2, 2, 2}, /* cost of loading integer registers 601 in QImode, HImode and SImode. 602 Relative to reg-reg move (2). */ 603 {2, 2, 2}, /* cost of storing integer registers */ 604 2, /* cost of reg,reg fld/fst */ 605 {2, 2, 2}, /* cost of loading fp registers 606 in SFmode, DFmode and XFmode */ 607 {4, 6, 6}, /* cost of storing fp registers 608 in SFmode, DFmode and XFmode */ 609 610 2, /* cost of moving MMX register */ 611 {2, 2}, /* cost of loading MMX registers 612 in SImode and DImode */ 613 {2, 2}, /* cost of storing MMX registers 614 in SImode and DImode */ 615 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 616 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 617 in 32,64,128,256 and 512-bit */ 618 {2, 2, 8, 16, 32}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 619 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 620 in 32,64,128,256 and 512-bit */ 621 {2, 2, 8, 16, 32}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 622 6, 6, /* SSE->integer and integer->SSE moves */ 623 2, 2, /* Gather load static, per_elt. */ 624 2, 2, /* Gather store static, per_elt. */ 625 64, /* size of l1 cache. */ 626 128, /* size of l2 cache. */ 627 32, /* size of prefetch block */ 628 1, /* number of parallel prefetches */ 629 1, /* Branch cost */ 630 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 631 COSTS_N_INSNS (11), /* cost of FMUL instruction. */ 632 COSTS_N_INSNS (47), /* cost of FDIV instruction. */ 633 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 634 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 635 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ 636 637 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 638 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 639 COSTS_N_INSNS (11), /* cost of MULSS instruction. */ 640 COSTS_N_INSNS (11), /* cost of MULSD instruction. */ 641 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ 642 COSTS_N_INSNS (17), /* cost of FMA SD instruction. 
*/ 643 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ 644 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ 645 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */ 646 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ 647 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 648 geode_memcpy, 649 geode_memset, /* the two stringop_algs tables defined before this initializer */ 650 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 651 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 652 NULL, /* Loop alignment. */ 653 NULL, /* Jump alignment. */ 654 NULL, /* Label alignment. */ 655 NULL, /* Func alignment. */ 656 }; 657 658 /* K6 string-operation tables; memcpy and memset use identical initializers, and element 1 of each is the DUMMY_STRINGOP_ALGS filler. */ static stringop_algs k6_memcpy[2] = { 659 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 660 DUMMY_STRINGOP_ALGS}; 661 static stringop_algs k6_memset[2] = { 662 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 663 DUMMY_STRINGOP_ALGS}; 664 static const 665 struct processor_costs k6_cost = { /* K6 specific costs */ 666 COSTS_N_INSNS (1), /* cost of an add instruction */ 667 COSTS_N_INSNS (2), /* cost of a lea instruction */ 668 COSTS_N_INSNS (1), /* variable shift costs */ 669 COSTS_N_INSNS (1), /* constant shift costs */ 670 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 671 COSTS_N_INSNS (3), /* HI */ 672 COSTS_N_INSNS (3), /* SI */ 673 COSTS_N_INSNS (3), /* DI */ 674 COSTS_N_INSNS (3)}, /* other */ 675 0, /* cost of multiply per each bit set */ 676 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 677 COSTS_N_INSNS (18), /* HI */ 678 COSTS_N_INSNS (18), /* SI */ 679 COSTS_N_INSNS (18), /* DI */ 680 COSTS_N_INSNS (18)}, /* other */ 681 COSTS_N_INSNS (2), /* cost of movsx */ 682 COSTS_N_INSNS (2), /* cost of movzx */ 683 8, /* "large" insn */ 684 4, /* MOVE_RATIO */ 685 686 /* All move costs are relative to integer->integer move times 2 and thus 687 they are latency*2. */ 688 3, /* cost for loading QImode using movzbl */ 689 {4, 5, 4}, /* cost of loading integer registers 690 in QImode, HImode and SImode. 691 Relative to reg-reg move (2). 
*/ 692 {2, 3, 2}, /* cost of storing integer registers */ 693 4, /* cost of reg,reg fld/fst */ 694 {6, 6, 6}, /* cost of loading fp registers 695 in SFmode, DFmode and XFmode */ 696 {4, 4, 4}, /* cost of storing fp registers 697 in SFmode, DFmode and XFmode */ 698 2, /* cost of moving MMX register */ 699 {2, 2}, /* cost of loading MMX registers 700 in SImode and DImode */ 701 {2, 2}, /* cost of storing MMX registers 702 in SImode and DImode */ 703 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 704 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 705 in 32,64,128,256 and 512-bit */ 706 {2, 2, 8, 16, 32}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 707 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 708 in 32,64,128,256 and 512-bit */ 709 {2, 2, 8, 16, 32}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 710 6, 6, /* SSE->integer and integer->SSE moves */ 711 2, 2, /* Gather load static, per_elt. */ 712 2, 2, /* Gather store static, per_elt. */ 713 32, /* size of l1 cache. */ 714 32, /* size of l2 cache. Some models 715 have integrated l2 cache, but 716 optimizing for k6 is not important 717 enough to worry about that. */ 718 32, /* size of prefetch block */ 719 1, /* number of parallel prefetches */ 720 1, /* Branch cost */ 721 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ 722 COSTS_N_INSNS (2), /* cost of FMUL instruction. */ 723 COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 724 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 725 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 726 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 727 728 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 729 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 730 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 731 COSTS_N_INSNS (2), /* cost of MULSD instruction. */ 732 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 733 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 734 COSTS_N_INSNS (56), /* cost of DIVSS instruction. 
*/ 735 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ 736 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ 737 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ 738 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 739 k6_memcpy, 740 k6_memset, /* the two stringop_algs tables defined before this initializer */ 741 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 742 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 743 "32:8:8", /* Loop alignment. */ 744 "32:8:8", /* Jump alignment. */ 745 "0:0:8", /* Label alignment. */ 746 "32", /* Func alignment. */ 747 }; 748 749 /* For some reason, Athlon deals better with REP prefix (relative to loops) 750 compared to K8. Alignment becomes important after 8 bytes for memcpy and 751 128 bytes for memset. */ 752 static stringop_algs athlon_memcpy[2] = { 753 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 754 DUMMY_STRINGOP_ALGS}; 755 static stringop_algs athlon_memset[2] = { 756 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 757 DUMMY_STRINGOP_ALGS}; 758 static const 759 struct processor_costs athlon_cost = { /* Athlon specific costs */ 760 COSTS_N_INSNS (1), /* cost of an add instruction */ 761 COSTS_N_INSNS (2), /* cost of a lea instruction */ 762 COSTS_N_INSNS (1), /* variable shift costs */ 763 COSTS_N_INSNS (1), /* constant shift costs */ 764 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ 765 COSTS_N_INSNS (5), /* HI */ 766 COSTS_N_INSNS (5), /* SI */ 767 COSTS_N_INSNS (5), /* DI */ 768 COSTS_N_INSNS (5)}, /* other */ 769 0, /* cost of multiply per each bit set */ 770 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 771 COSTS_N_INSNS (26), /* HI */ 772 COSTS_N_INSNS (42), /* SI */ 773 COSTS_N_INSNS (74), /* DI */ 774 COSTS_N_INSNS (74)}, /* other */ 775 COSTS_N_INSNS (1), /* cost of movsx */ 776 COSTS_N_INSNS (1), /* cost of movzx */ 777 8, /* "large" insn */ 778 9, /* MOVE_RATIO */ 779 780 /* All move costs are relative to integer->integer move times 2 and thus 781 they are latency*2. 
*/ 782 4, /* cost for loading QImode using movzbl */ 783 {3, 4, 3}, /* cost of loading integer registers 784 in QImode, HImode and SImode. 785 Relative to reg-reg move (2). */ 786 {3, 4, 3}, /* cost of storing integer registers */ 787 4, /* cost of reg,reg fld/fst */ 788 {4, 4, 12}, /* cost of loading fp registers 789 in SFmode, DFmode and XFmode */ 790 {6, 6, 8}, /* cost of storing fp registers 791 in SFmode, DFmode and XFmode */ 792 2, /* cost of moving MMX register */ 793 {4, 4}, /* cost of loading MMX registers 794 in SImode and DImode */ 795 {4, 4}, /* cost of storing MMX registers 796 in SImode and DImode */ 797 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 798 {4, 4, 12, 12, 24}, /* cost of loading SSE registers 799 in 32,64,128,256 and 512-bit */ 800 {4, 4, 12, 12, 24}, /* cost of unaligned SSE loads; five entries, presumably the same sizes as the aligned-load table */ 801 {4, 4, 10, 10, 20}, /* cost of storing SSE registers 802 in 32,64,128,256 and 512-bit */ 803 {4, 4, 10, 10, 20}, /* cost of unaligned SSE stores; five entries, presumably the same sizes as the aligned-store table */ 804 5, 5, /* SSE->integer and integer->SSE moves */ 805 4, 4, /* Gather load static, per_elt. */ 806 4, 4, /* Gather store static, per_elt. */ 807 64, /* size of l1 cache. */ 808 256, /* size of l2 cache. */ 809 64, /* size of prefetch block */ 810 6, /* number of parallel prefetches */ 811 5, /* Branch cost */ 812 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 813 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 814 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 815 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 816 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 817 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 818 819 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 820 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 821 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 822 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 823 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 824 COSTS_N_INSNS (8), /* cost of FMA SD instruction. 
*/ 825 /* 11-16 */ 826 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 827 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ 828 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 829 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ 830 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 831 athlon_memcpy, 832 athlon_memset, 833 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 834 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 835 "16:8:8", /* Loop alignment. */ 836 "16:8:8", /* Jump alignment. */ 837 "0:0:8", /* Label alignment. */ 838 "16", /* Func alignment. */ 839 }; 840 841 /* K8 has optimized REP instruction for medium sized blocks, but for very 842 small blocks it is better to use a loop. For large blocks, libcall can 843 do non-temporal accesses and beat inline considerably. */ 844 static stringop_algs k8_memcpy[2] = { 845 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 846 {-1, rep_prefix_4_byte, false}}}, 847 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 848 {-1, libcall, false}}}}; 849 static stringop_algs k8_memset[2] = { 850 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 851 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 852 {libcall, {{48, unrolled_loop, false}, 853 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 854 /* Operation costs for K8. The initializer is positional; each entry is labelled by its trailing comment. */ static const 855 struct processor_costs k8_cost = { 856 COSTS_N_INSNS (1), /* cost of an add instruction */ 857 COSTS_N_INSNS (2), /* cost of a lea instruction */ 858 COSTS_N_INSNS (1), /* variable shift costs */ 859 COSTS_N_INSNS (1), /* constant shift costs */ 860 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 861 COSTS_N_INSNS (4), /* HI */ 862 COSTS_N_INSNS (3), /* SI */ 863 COSTS_N_INSNS (4), /* DI */ 864 COSTS_N_INSNS (5)}, /* other */ 865 0, /* cost of multiply per each bit set */ 866 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 867 COSTS_N_INSNS (26), /* HI */ 868 COSTS_N_INSNS (42), /* SI */ 869 COSTS_N_INSNS (74), /* DI */ 870 
COSTS_N_INSNS (74)}, /* other */ 871 COSTS_N_INSNS (1), /* cost of movsx */ 872 COSTS_N_INSNS (1), /* cost of movzx */ 873 8, /* "large" insn */ 874 9, /* MOVE_RATIO */ 875 876 /* All move costs are relative to integer->integer move times 2 and thus 877 they are latency*2. */ 878 4, /* cost for loading QImode using movzbl */ 879 {3, 4, 3}, /* cost of loading integer registers 880 in QImode, HImode and SImode. 881 Relative to reg-reg move (2). */ 882 {3, 4, 3}, /* cost of storing integer registers */ 883 4, /* cost of reg,reg fld/fst */ 884 {4, 4, 12}, /* cost of loading fp registers 885 in SFmode, DFmode and XFmode */ 886 {6, 6, 8}, /* cost of storing fp registers 887 in SFmode, DFmode and XFmode */ 888 2, /* cost of moving MMX register */ 889 {3, 3}, /* cost of loading MMX registers 890 in SImode and DImode */ 891 {4, 4}, /* cost of storing MMX registers 892 in SImode and DImode */ 893 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 894 {4, 3, 12, 12, 24}, /* cost of loading SSE registers 895 in 32,64,128,256 and 512-bit */ 896 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */ 897 {4, 4, 10, 10, 20}, /* cost of storing SSE registers 898 in 32,64,128,256 and 512-bit */ 899 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 900 5, 5, /* SSE->integer and integer->SSE moves */ 901 4, 4, /* Gather load static, per_elt. */ 902 4, 4, /* Gather store static, per_elt. */ 903 64, /* size of l1 cache. */ 904 512, /* size of l2 cache. */ 905 64, /* size of prefetch block */ 906 /* New AMD processors never drop prefetches; if they cannot be performed 907 immediately, they are queued. We set number of simultaneous prefetches 908 to a large constant to reflect this (it probably is not a good idea not 909 to limit number of prefetches at all, as their execution also takes some 910 time). */ 911 100, /* number of parallel prefetches */ 912 3, /* Branch cost */ 913 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 914 COSTS_N_INSNS (4), /* cost of FMUL instruction. 
*/ 915 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 916 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 917 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 918 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 919 920 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 921 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 922 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 923 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 924 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 925 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 926 /* 11-16 */ 927 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 928 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 929 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 930 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 931 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 932 k8_memcpy, 933 k8_memset, 934 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 935 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 936 "16:8:8", /* Loop alignment. */ 937 "16:8:8", /* Jump alignment. */ 938 "0:0:8", /* Label alignment. */ 939 "16", /* Func alignment. */ 940 }; 941 942 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for 943 very small blocks it is better to use loop. For large blocks, libcall can 944 do nontemporary accesses and beat inline considerably. 
*/ 945 static stringop_algs amdfam10_memcpy[2] = { 946 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 947 {-1, rep_prefix_4_byte, false}}}, 948 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 949 {-1, libcall, false}}}}; 950 static stringop_algs amdfam10_memset[2] = { 951 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 952 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 953 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 954 {-1, libcall, false}}}}; 955 struct processor_costs amdfam10_cost = { 956 COSTS_N_INSNS (1), /* cost of an add instruction */ 957 COSTS_N_INSNS (2), /* cost of a lea instruction */ 958 COSTS_N_INSNS (1), /* variable shift costs */ 959 COSTS_N_INSNS (1), /* constant shift costs */ 960 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 961 COSTS_N_INSNS (4), /* HI */ 962 COSTS_N_INSNS (3), /* SI */ 963 COSTS_N_INSNS (4), /* DI */ 964 COSTS_N_INSNS (5)}, /* other */ 965 0, /* cost of multiply per each bit set */ 966 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 967 COSTS_N_INSNS (35), /* HI */ 968 COSTS_N_INSNS (51), /* SI */ 969 COSTS_N_INSNS (83), /* DI */ 970 COSTS_N_INSNS (83)}, /* other */ 971 COSTS_N_INSNS (1), /* cost of movsx */ 972 COSTS_N_INSNS (1), /* cost of movzx */ 973 8, /* "large" insn */ 974 9, /* MOVE_RATIO */ 975 976 /* All move costs are relative to integer->integer move times 2 and thus 977 they are latency*2. */ 978 4, /* cost for loading QImode using movzbl */ 979 {3, 4, 3}, /* cost of loading integer registers 980 in QImode, HImode and SImode. 981 Relative to reg-reg move (2). 
*/ 982 {3, 4, 3}, /* cost of storing integer registers */ 983 4, /* cost of reg,reg fld/fst */ 984 {4, 4, 12}, /* cost of loading fp registers 985 in SFmode, DFmode and XFmode */ 986 {6, 6, 8}, /* cost of storing fp registers 987 in SFmode, DFmode and XFmode */ 988 2, /* cost of moving MMX register */ 989 {3, 3}, /* cost of loading MMX registers 990 in SImode and DImode */ 991 {4, 4}, /* cost of storing MMX registers 992 in SImode and DImode */ 993 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 994 {4, 4, 3, 6, 12}, /* cost of loading SSE registers 995 in 32,64,128,256 and 512-bit */ 996 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ 997 {4, 4, 5, 10, 20}, /* cost of storing SSE registers 998 in 32,64,128,256 and 512-bit */ 999 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 1000 3, 3, /* SSE->integer and integer->SSE moves */ 1001 /* On K8: 1002 MOVD reg64, xmmreg Double FSTORE 4 1003 MOVD reg32, xmmreg Double FSTORE 4 1004 On AMDFAM10: 1005 MOVD reg64, xmmreg Double FADD 3 1006 1/1 1/1 1007 MOVD reg32, xmmreg Double FADD 3 1008 1/1 1/1 */ 1009 4, 4, /* Gather load static, per_elt. */ 1010 4, 4, /* Gather store static, per_elt. */ 1011 64, /* size of l1 cache. */ 1012 512, /* size of l2 cache. */ 1013 64, /* size of prefetch block */ 1014 /* New AMD processors never drop prefetches; if they cannot be performed 1015 immediately, they are queued. We set number of simultaneous prefetches 1016 to a large constant to reflect this (it probably is not a good idea not 1017 to limit number of prefetches at all, as their execution also takes some 1018 time). */ 1019 100, /* number of parallel prefetches */ 1020 2, /* Branch cost */ 1021 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1022 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1023 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1024 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1025 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1026 COSTS_N_INSNS (35), /* cost of FSQRT instruction. 
*/ 1027 1028 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1029 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1030 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1031 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1032 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 1033 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 1034 /* 11-16 */ 1035 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 1036 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 1037 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 1038 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 1039 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1040 amdfam10_memcpy, 1041 amdfam10_memset, 1042 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1043 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1044 "32:25:8", /* Loop alignment. */ 1045 "32:8:8", /* Jump alignment. */ 1046 "0:0:8", /* Label alignment. */ 1047 "32", /* Func alignment. */ 1048 }; 1049 1050 /* BDVER has optimized REP instruction for medium sized blocks, but for 1051 very small blocks it is better to use loop. For large blocks, libcall 1052 can do nontemporary accesses and beat inline considerably. 
*/ 1053 static stringop_algs bdver_memcpy[2] = { 1054 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1055 {-1, rep_prefix_4_byte, false}}}, 1056 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1057 {-1, libcall, false}}}}; 1058 static stringop_algs bdver_memset[2] = { 1059 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1060 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1061 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1062 {-1, libcall, false}}}}; 1063 1064 const struct processor_costs bdver_cost = { 1065 COSTS_N_INSNS (1), /* cost of an add instruction */ 1066 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1067 COSTS_N_INSNS (1), /* variable shift costs */ 1068 COSTS_N_INSNS (1), /* constant shift costs */ 1069 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1070 COSTS_N_INSNS (4), /* HI */ 1071 COSTS_N_INSNS (4), /* SI */ 1072 COSTS_N_INSNS (6), /* DI */ 1073 COSTS_N_INSNS (6)}, /* other */ 1074 0, /* cost of multiply per each bit set */ 1075 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1076 COSTS_N_INSNS (35), /* HI */ 1077 COSTS_N_INSNS (51), /* SI */ 1078 COSTS_N_INSNS (83), /* DI */ 1079 COSTS_N_INSNS (83)}, /* other */ 1080 COSTS_N_INSNS (1), /* cost of movsx */ 1081 COSTS_N_INSNS (1), /* cost of movzx */ 1082 8, /* "large" insn */ 1083 9, /* MOVE_RATIO */ 1084 1085 /* All move costs are relative to integer->integer move times 2 and thus 1086 they are latency*2. */ 1087 8, /* cost for loading QImode using movzbl */ 1088 {8, 8, 8}, /* cost of loading integer registers 1089 in QImode, HImode and SImode. 1090 Relative to reg-reg move (2). 
*/ 1091 {8, 8, 8}, /* cost of storing integer registers */ 1092 4, /* cost of reg,reg fld/fst */ 1093 {12, 12, 28}, /* cost of loading fp registers 1094 in SFmode, DFmode and XFmode */ 1095 {10, 10, 18}, /* cost of storing fp registers 1096 in SFmode, DFmode and XFmode */ 1097 4, /* cost of moving MMX register */ 1098 {12, 12}, /* cost of loading MMX registers 1099 in SImode and DImode */ 1100 {10, 10}, /* cost of storing MMX registers 1101 in SImode and DImode */ 1102 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1103 {12, 12, 10, 40, 60}, /* cost of loading SSE registers 1104 in 32,64,128,256 and 512-bit */ 1105 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ 1106 {10, 10, 10, 40, 60}, /* cost of storing SSE registers 1107 in 32,64,128,256 and 512-bit */ 1108 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 1109 16, 20, /* SSE->integer and integer->SSE moves */ 1110 12, 12, /* Gather load static, per_elt. */ 1111 10, 10, /* Gather store static, per_elt. */ 1112 16, /* size of l1 cache. */ 1113 2048, /* size of l2 cache. */ 1114 64, /* size of prefetch block */ 1115 /* New AMD processors never drop prefetches; if they cannot be performed 1116 immediately, they are queued. We set number of simultaneous prefetches 1117 to a large constant to reflect this (it probably is not a good idea not 1118 to limit number of prefetches at all, as their execution also takes some 1119 time). */ 1120 100, /* number of parallel prefetches */ 1121 2, /* Branch cost */ 1122 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1123 COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1124 COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1125 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1126 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1127 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1128 1129 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1130 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. 
*/ 1131 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1132 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1133 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1134 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1135 /* 9-24 */ 1136 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1137 /* 9-27 */ 1138 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1139 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1140 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1141 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1142 bdver_memcpy, 1143 bdver_memset, 1144 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1145 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1146 "16:11:8", /* Loop alignment. */ 1147 "16:8:8", /* Jump alignment. */ 1148 "0:0:8", /* Label alignment. */ 1149 "11", /* Func alignment. */ 1150 }; 1151 1152 1153 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for 1154 very small blocks it is better to use a loop. For large blocks, libcall 1155 can do non-temporal accesses and beat inline considerably. */ 1156 static stringop_algs znver1_memcpy[2] = { 1157 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1158 {-1, rep_prefix_4_byte, false}}}, 1159 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1160 {-1, libcall, false}}}}; 1161 static stringop_algs znver1_memset[2] = { 1162 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1163 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1164 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1165 {-1, libcall, false}}}}; 1166 /* Operation costs for ZNVER1. The initializer is positional; each entry is labelled by its trailing comment. */ struct processor_costs znver1_cost = { 1167 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1168 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1169 COSTS_N_INSNS (1), /* variable shift costs. */ 1170 COSTS_N_INSNS (1), /* constant shift costs. */ 1171 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1172 COSTS_N_INSNS (3), /* HI. 
*/ 1173 COSTS_N_INSNS (3), /* SI. */ 1174 COSTS_N_INSNS (3), /* DI. */ 1175 COSTS_N_INSNS (3)}, /* other. */ 1176 0, /* cost of multiply per each bit 1177 set. */ 1178 /* Depending on parameters, idiv can get faster on ryzen. This is an upper 1179 bound. */ 1180 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1181 COSTS_N_INSNS (22), /* HI. */ 1182 COSTS_N_INSNS (30), /* SI. */ 1183 COSTS_N_INSNS (45), /* DI. */ 1184 COSTS_N_INSNS (45)}, /* other. */ 1185 COSTS_N_INSNS (1), /* cost of movsx. */ 1186 COSTS_N_INSNS (1), /* cost of movzx. */ 1187 8, /* "large" insn. */ 1188 9, /* MOVE_RATIO. */ 1189 1190 /* All move costs are relative to integer->integer move times 2 and thus 1191 they are latency*2. */ 1192 1193 /* reg-reg moves are done by renaming and thus they are even cheaper than 1194 1 cycle. Because reg-reg move cost is 2 and the following tables correspond 1195 to doubles of latencies, we do not model this correctly. It does not 1196 seem to make a practical difference to bump prices up even more. */ 1197 6, /* cost for loading QImode using 1198 movzbl. */ 1199 {6, 6, 6}, /* cost of loading integer registers 1200 in QImode, HImode and SImode. 1201 Relative to reg-reg move (2). */ 1202 {8, 8, 8}, /* cost of storing integer 1203 registers. */ 1204 2, /* cost of reg,reg fld/fst. */ 1205 {6, 6, 16}, /* cost of loading fp registers 1206 in SFmode, DFmode and XFmode. */ 1207 {8, 8, 16}, /* cost of storing fp registers 1208 in SFmode, DFmode and XFmode. */ 1209 2, /* cost of moving MMX register. */ 1210 {6, 6}, /* cost of loading MMX registers 1211 in SImode and DImode. */ 1212 {8, 8}, /* cost of storing MMX registers 1213 in SImode and DImode. */ 1214 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 1215 {6, 6, 6, 12, 24}, /* cost of loading SSE registers 1216 in 32,64,128,256 and 512-bit. */ 1217 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ 1218 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 1219 in 32,64,128,256 and 512-bit. 
*/ 1220 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 1221 6, 6, /* SSE->integer and integer->SSE moves. */ 1222 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1223 throughput 12. Approx 9 uops do not depend on vector size and every load 1224 is 7 uops. NOTE(review): the same mnemonic is quoted twice with different figures; presumably two vector widths or variants are meant -- confirm. */ 1225 18, 8, /* Gather load static, per_elt. */ 1226 18, 10, /* Gather store static, per_elt. */ 1227 32, /* size of l1 cache. */ 1228 512, /* size of l2 cache. */ 1229 64, /* size of prefetch block. */ 1230 /* New AMD processors never drop prefetches; if they cannot be performed 1231 immediately, they are queued. We set number of simultaneous prefetches 1232 to a large constant to reflect this (it probably is not a good idea not 1233 to limit number of prefetches at all, as their execution also takes some 1234 time). */ 1235 100, /* number of parallel prefetches. */ 1236 3, /* Branch cost. */ 1237 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1238 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1239 /* Latency of fdiv is 8-15. */ 1240 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1241 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1242 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1243 /* Latency of fsqrt is 4-10. */ 1244 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1245 1246 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1247 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1248 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1249 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1250 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1251 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1252 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1253 /* 9-13 */ 1254 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1255 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1256 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1257 /* Zen can execute 4 integer operations per cycle. 
FP operations take 3 cycles 1258 and it can execute 2 integer additions and 2 multiplications thus 1259 reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest 1260 that 4 works better than 6 probably due to register pressure. 1261 1262 Integer vector operations are taken by FP unit and execute 3 vector 1263 plus/minus operations per cycle but only one multiply. This is adjusted 1264 in ix86_reassociation_width. */ 1265 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1266 znver1_memcpy, 1267 znver1_memset, 1268 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1269 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1270 "16", /* Loop alignment. */ 1271 "16", /* Jump alignment. */ 1272 "0:0:8", /* Label alignment. */ 1273 "16", /* Func alignment. */ 1274 }; 1275 1276 /* ZNVER2 has optimized REP instruction for medium sized blocks, but for 1277 very small blocks it is better to use a loop. For large blocks, libcall 1278 can do non-temporal accesses and beat inline considerably. */ 1279 static stringop_algs znver2_memcpy[2] = { 1280 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1281 {-1, rep_prefix_4_byte, false}}}, 1282 {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false}, 1283 {-1, libcall, false}}}}; 1284 static stringop_algs znver2_memset[2] = { 1285 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1286 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1287 {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false}, 1288 {-1, libcall, false}}}}; 1289 1290 /* Operation costs for ZNVER2. The initializer is positional; each entry is labelled by its trailing comment. */ struct processor_costs znver2_cost = { 1291 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1292 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1293 COSTS_N_INSNS (1), /* variable shift costs. */ 1294 COSTS_N_INSNS (1), /* constant shift costs. */ 1295 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1296 COSTS_N_INSNS (3), /* HI. */ 1297 COSTS_N_INSNS (3), /* SI. */ 1298 COSTS_N_INSNS (3), /* DI. 
*/ 1299 COSTS_N_INSNS (3)}, /* other. */ 1300 0, /* cost of multiply per each bit 1301 set. */ 1302 /* Depending on parameters, idiv can get faster on ryzen. This is upper 1303 bound. */ 1304 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1305 COSTS_N_INSNS (22), /* HI. */ 1306 COSTS_N_INSNS (30), /* SI. */ 1307 COSTS_N_INSNS (45), /* DI. */ 1308 COSTS_N_INSNS (45)}, /* other. */ 1309 COSTS_N_INSNS (1), /* cost of movsx. */ 1310 COSTS_N_INSNS (1), /* cost of movzx. */ 1311 8, /* "large" insn. */ 1312 9, /* MOVE_RATIO. */ 1313 1314 /* All move costs are relative to integer->integer move times 2 and thus 1315 they are latency*2. */ 1316 1317 /* reg-reg moves are done by renaming and thus they are even cheaper than 1318 1 cycle. Because reg-reg move cost is 2 and following tables correspond 1319 to doubles of latencies, we do not model this correctly. It does not 1320 seem to make practical difference to bump prices up even more. */ 1321 6, /* cost for loading QImode using 1322 movzbl. */ 1323 {6, 6, 6}, /* cost of loading integer registers 1324 in QImode, HImode and SImode. 1325 Relative to reg-reg move (2). */ 1326 {8, 8, 8}, /* cost of storing integer 1327 registers. */ 1328 2, /* cost of reg,reg fld/fst. */ 1329 {6, 6, 16}, /* cost of loading fp registers 1330 in SFmode, DFmode and XFmode. */ 1331 {8, 8, 16}, /* cost of storing fp registers 1332 in SFmode, DFmode and XFmode. */ 1333 2, /* cost of moving MMX register. */ 1334 {6, 6}, /* cost of loading MMX registers 1335 in SImode and DImode. */ 1336 {8, 8}, /* cost of storing MMX registers 1337 in SImode and DImode. */ 1338 2, 2, 3, /* cost of moving XMM,YMM,ZMM 1339 register. */ 1340 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 1341 in 32,64,128,256 and 512-bit. */ 1342 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 1343 {8, 8, 8, 8, 16}, /* cost of storing SSE registers 1344 in 32,64,128,256 and 512-bit. */ 1345 {8, 8, 8, 8, 16}, /* cost of unaligned stores. 
*/ 1346 6, 6, /* SSE->integer and integer->SSE 1347 moves. */ 1348 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1349 throughput 12. Approx 9 uops do not depend on vector size and every load 1350 is 7 uops. NOTE(review): the same mnemonic is quoted twice with different figures; presumably two vector widths or variants are meant -- confirm. */ 1351 18, 8, /* Gather load static, per_elt. */ 1352 18, 10, /* Gather store static, per_elt. */ 1353 32, /* size of l1 cache. */ 1354 512, /* size of l2 cache. */ 1355 64, /* size of prefetch block. */ 1356 /* New AMD processors never drop prefetches; if they cannot be performed 1357 immediately, they are queued. We set number of simultaneous prefetches 1358 to a large constant to reflect this (it probably is not a good idea not 1359 to limit number of prefetches at all, as their execution also takes some 1360 time). */ 1361 100, /* number of parallel prefetches. */ 1362 3, /* Branch cost. */ 1363 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1364 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1365 /* Latency of fdiv is 8-15. */ 1366 COSTS_N_INSNS (15), /* cost of FDIV instruction. */ 1367 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1368 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1369 /* Latency of fsqrt is 4-10. */ 1370 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1371 1372 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1373 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1374 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1375 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 1376 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1377 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1378 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1379 /* 9-13. */ 1380 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1381 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1382 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1383 /* Zen can execute 4 integer operations per cycle. 
FP operations 1384 take 3 cycles and it can execute 2 integer additions and 2 1385 multiplications thus reassociation may make sense up to width of 6. 1386 SPEC2k6 benchmarks suggest 1387 that 4 works better than 6 probably due to register pressure. 1388 1389 Integer vector operations are taken by FP unit and execute 3 vector 1390 plus/minus operations per cycle but only one multiply. This is adjusted 1391 in ix86_reassociation_width. */ 1392 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1393 znver2_memcpy, 1394 znver2_memset, 1395 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1396 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1397 "16", /* Loop alignment. */ 1398 "16", /* Jump alignment. */ 1399 "0:0:8", /* Label alignment. */ 1400 "16", /* Func alignment. */ 1401 }; 1402 1403 /* skylake_cost should produce code tuned for Skylake family of CPUs. */ 1404 static stringop_algs skylake_memcpy[2] = { 1405 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 1406 {libcall, {{16, loop, false}, {512, unrolled_loop, false}, 1407 {-1, libcall, false}}}}; 1408 1409 static stringop_algs skylake_memset[2] = { 1410 {libcall, {{6, loop_1_byte, true}, 1411 {24, loop, true}, 1412 {8192, rep_prefix_4_byte, true}, 1413 {-1, libcall, false}}}, 1414 {libcall, {{24, loop, true}, {512, unrolled_loop, false}, 1415 {-1, libcall, false}}}}; 1416 1417 static const 1418 struct processor_costs skylake_cost = { 1419 COSTS_N_INSNS (1), /* cost of an add instruction */ 1420 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ 1421 COSTS_N_INSNS (1), /* variable shift costs */ 1422 COSTS_N_INSNS (1), /* constant shift costs */ 1423 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1424 COSTS_N_INSNS (4), /* HI */ 1425 COSTS_N_INSNS (3), /* SI */ 1426 COSTS_N_INSNS (3), /* DI */ 1427 COSTS_N_INSNS (3)}, /* other */ 1428 0, /* cost of multiply per each bit set */ 1429 /* Expanding div/mod currently doesn't consider parallelism. 
So the cost 1430 model is not realistic. We compensate by increasing the latencies a bit. */ 1431 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 1432 COSTS_N_INSNS (11), /* HI */ 1433 COSTS_N_INSNS (14), /* SI */ 1434 COSTS_N_INSNS (76), /* DI */ 1435 COSTS_N_INSNS (76)}, /* other */ 1436 COSTS_N_INSNS (1), /* cost of movsx */ 1437 COSTS_N_INSNS (0), /* cost of movzx */ 1438 8, /* "large" insn */ 1439 17, /* MOVE_RATIO */ 1440 1441 6, /* cost for loading QImode using movzbl */ 1442 {4, 4, 4}, /* cost of loading integer registers 1443 in QImode, HImode and SImode. 1444 Relative to reg-reg move (2). */ 1445 {6, 6, 3}, /* cost of storing integer registers */ 1446 2, /* cost of reg,reg fld/fst */ 1447 {6, 6, 8}, /* cost of loading fp registers 1448 in SFmode, DFmode and XFmode */ 1449 {6, 6, 10}, /* cost of storing fp registers 1450 in SFmode, DFmode and XFmode */ 1451 2, /* cost of moving MMX register */ 1452 {6, 6}, /* cost of loading MMX registers 1453 in SImode and DImode */ 1454 {6, 6}, /* cost of storing MMX registers 1455 in SImode and DImode */ 1456 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 1457 {6, 6, 6, 10, 20}, /* cost of loading SSE registers 1458 in 32,64,128,256 and 512-bit */ 1459 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 1460 {8, 8, 8, 12, 24}, /* cost of storing SSE registers 1461 in 32,64,128,256 and 512-bit */ 1462 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1463 2, 2, /* SSE->integer and integer->SSE moves */ 1464 20, 8, /* Gather load static, per_elt. */ 1465 22, 10, /* Gather store static, per_elt. */ 1466 64, /* size of l1 cache. */ 1467 512, /* size of l2 cache. */ 1468 64, /* size of prefetch block */ 1469 6, /* number of parallel prefetches */ 1470 3, /* Branch cost */ 1471 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 1472 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1473 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 1474 COSTS_N_INSNS (1), /* cost of FABS instruction. 
*/ 1475 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1476 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ 1477 1478 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1479 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1480 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1481 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1482 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 1483 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 1484 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ 1485 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ 1486 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ 1487 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 1488 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 1489 skylake_memcpy, 1490 skylake_memset, 1491 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1492 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1493 "16:11:8", /* Loop alignment. */ 1494 "16:11:8", /* Jump alignment. */ 1495 "0:0:8", /* Label alignment. */ 1496 "16", /* Func alignment. */ 1497 }; 1498 /* BTVER1 has optimized REP instruction for medium sized blocks, but for 1499 very small blocks it is better to use a loop. For large blocks, libcall can 1500 do nontemporal accesses and beat inline considerably. 
*/ 1501 static stringop_algs btver1_memcpy[2] = { 1502 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1503 {-1, rep_prefix_4_byte, false}}}, 1504 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1505 {-1, libcall, false}}}}; 1506 /* memset counterpart of btver1_memcpy; btver1_cost installs it right after btver1_memcpy. */ static stringop_algs btver1_memset[2] = { 1507 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1508 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1509 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1510 {-1, libcall, false}}}}; 1511 /* Operation costs for btver1. The initializer is positional, so the trailing comment on each entry is the only label for that field. */ const struct processor_costs btver1_cost = { 1512 COSTS_N_INSNS (1), /* cost of an add instruction */ 1513 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1514 COSTS_N_INSNS (1), /* variable shift costs */ 1515 COSTS_N_INSNS (1), /* constant shift costs */ 1516 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1517 COSTS_N_INSNS (4), /* HI */ 1518 COSTS_N_INSNS (3), /* SI */ 1519 COSTS_N_INSNS (4), /* DI */ 1520 COSTS_N_INSNS (5)}, /* other */ 1521 0, /* cost of multiply per each bit set */ 1522 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1523 COSTS_N_INSNS (35), /* HI */ 1524 COSTS_N_INSNS (51), /* SI */ 1525 COSTS_N_INSNS (83), /* DI */ 1526 COSTS_N_INSNS (83)}, /* other */ 1527 COSTS_N_INSNS (1), /* cost of movsx */ 1528 COSTS_N_INSNS (1), /* cost of movzx */ 1529 8, /* "large" insn */ 1530 9, /* MOVE_RATIO */ 1531 1532 /* All move costs are relative to integer->integer move times 2 and thus 1533 they are latency*2. */ 1534 8, /* cost for loading QImode using movzbl */ 1535 {6, 8, 6}, /* cost of loading integer registers 1536 in QImode, HImode and SImode. 1537 Relative to reg-reg move (2). 
*/ 1538 {6, 8, 6}, /* cost of storing integer registers */ 1539 4, /* cost of reg,reg fld/fst */ 1540 {12, 12, 28}, /* cost of loading fp registers 1541 in SFmode, DFmode and XFmode */ 1542 {12, 12, 38}, /* cost of storing fp registers 1543 in SFmode, DFmode and XFmode */ 1544 4, /* cost of moving MMX register */ 1545 {10, 10}, /* cost of loading MMX registers 1546 in SImode and DImode */ 1547 {12, 12}, /* cost of storing MMX registers 1548 in SImode and DImode */ 1549 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1550 {10, 10, 12, 48, 96}, /* cost of loading SSE registers 1551 in 32,64,128,256 and 512-bit */ 1552 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ 1553 {10, 10, 12, 48, 96}, /* cost of storing SSE registers 1554 in 32,64,128,256 and 512-bit */ 1555 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 1556 14, 14, /* SSE->integer and integer->SSE moves */ 1557 10, 10, /* Gather load static, per_elt. */ 1558 10, 10, /* Gather store static, per_elt. */ 1559 32, /* size of l1 cache. */ 1560 512, /* size of l2 cache. */ 1561 64, /* size of prefetch block */ 1562 100, /* number of parallel prefetches */ 1563 2, /* Branch cost */ 1564 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1565 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1566 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1567 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1568 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1569 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1570 1571 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1572 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1573 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 1574 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1575 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1576 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1577 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1578 COSTS_N_INSNS (17), /* cost of DIVSD instruction. 
*/ 1579 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 1580 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ 1581 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1582 btver1_memcpy, 1583 btver1_memset, 1584 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1585 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1586 "16:11:8", /* Loop alignment. */ 1587 "16:8:8", /* Jump alignment. */ 1588 "0:0:8", /* Label alignment. */ 1589 "11", /* Func alignment. */ 1590 }; 1591 1592 /* String-operation tables for btver2; both hold the same entries as their btver1 counterparts. */ static stringop_algs btver2_memcpy[2] = { 1593 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1594 {-1, rep_prefix_4_byte, false}}}, 1595 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1596 {-1, libcall, false}}}}; 1597 static stringop_algs btver2_memset[2] = { 1598 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1599 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1600 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1601 {-1, libcall, false}}}}; 1602 /* Operation costs for btver2 (positional initializer; each field is labelled by its trailing comment). */ const struct processor_costs btver2_cost = { 1603 COSTS_N_INSNS (1), /* cost of an add instruction */ 1604 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1605 COSTS_N_INSNS (1), /* variable shift costs */ 1606 COSTS_N_INSNS (1), /* constant shift costs */ 1607 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1608 COSTS_N_INSNS (4), /* HI */ 1609 COSTS_N_INSNS (3), /* SI */ 1610 COSTS_N_INSNS (4), /* DI */ 1611 COSTS_N_INSNS (5)}, /* other */ 1612 0, /* cost of multiply per each bit set */ 1613 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1614 COSTS_N_INSNS (35), /* HI */ 1615 COSTS_N_INSNS (51), /* SI */ 1616 COSTS_N_INSNS (83), /* DI */ 1617 COSTS_N_INSNS (83)}, /* other */ 1618 COSTS_N_INSNS (1), /* cost of movsx */ 1619 COSTS_N_INSNS (1), /* cost of movzx */ 1620 8, /* "large" insn */ 1621 9, /* MOVE_RATIO */ 1622 1623 /* All move costs are relative to integer->integer move times 2 and thus 1624 they are latency*2. 
*/ 1625 8, /* cost for loading QImode using movzbl */ 1626 {8, 8, 6}, /* cost of loading integer registers 1627 in QImode, HImode and SImode. 1628 Relative to reg-reg move (2). */ 1629 {8, 8, 6}, /* cost of storing integer registers */ 1630 4, /* cost of reg,reg fld/fst */ 1631 {12, 12, 28}, /* cost of loading fp registers 1632 in SFmode, DFmode and XFmode */ 1633 {12, 12, 38}, /* cost of storing fp registers 1634 in SFmode, DFmode and XFmode */ 1635 4, /* cost of moving MMX register */ 1636 {10, 10}, /* cost of loading MMX registers 1637 in SImode and DImode */ 1638 {12, 12}, /* cost of storing MMX registers 1639 in SImode and DImode */ 1640 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1641 {10, 10, 12, 48, 96}, /* cost of loading SSE registers 1642 in 32,64,128,256 and 512-bit */ 1643 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ 1644 {10, 10, 12, 48, 96}, /* cost of storing SSE registers 1645 in 32,64,128,256 and 512-bit */ 1646 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 1647 14, 14, /* SSE->integer and integer->SSE moves */ 1648 10, 10, /* Gather load static, per_elt. */ 1649 10, 10, /* Gather store static, per_elt. */ 1650 32, /* size of l1 cache. */ 1651 2048, /* size of l2 cache. */ 1652 64, /* size of prefetch block */ 1653 100, /* number of parallel prefetches */ 1654 2, /* Branch cost */ 1655 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1656 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1657 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1658 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1659 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1660 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1661 1662 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1663 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1664 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 1665 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1666 COSTS_N_INSNS (5), /* cost of FMA SS instruction. 
*/ 1667 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1668 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1669 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ 1670 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ 1671 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ 1672 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1673 btver2_memcpy, 1674 btver2_memset, 1675 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1676 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1677 "16:11:8", /* Loop alignment. */ 1678 "16:8:8", /* Jump alignment. */ 1679 "0:0:8", /* Label alignment. */ 1680 "11", /* Func alignment. */ 1681 }; 1682 1683 /* String-operation tables for pentium4. The second element of each is DUMMY_STRINGOP_ALGS, i.e. a plain libcall entry. NOTE(review): the two elements look like the 32-bit and 64-bit variants - confirm against the stringop_algs declaration. */ static stringop_algs pentium4_memcpy[2] = { 1684 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 1685 DUMMY_STRINGOP_ALGS}; 1686 static stringop_algs pentium4_memset[2] = { 1687 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 1688 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1689 DUMMY_STRINGOP_ALGS}; 1690 1691 /* Operation costs for pentium4 (positional initializer; each field is labelled by its trailing comment). */ static const 1692 struct processor_costs pentium4_cost = { 1693 COSTS_N_INSNS (1), /* cost of an add instruction */ 1694 COSTS_N_INSNS (3), /* cost of a lea instruction */ 1695 COSTS_N_INSNS (4), /* variable shift costs */ 1696 COSTS_N_INSNS (4), /* constant shift costs */ 1697 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ 1698 COSTS_N_INSNS (15), /* HI */ 1699 COSTS_N_INSNS (15), /* SI */ 1700 COSTS_N_INSNS (15), /* DI */ 1701 COSTS_N_INSNS (15)}, /* other */ 1702 0, /* cost of multiply per each bit set */ 1703 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ 1704 COSTS_N_INSNS (56), /* HI */ 1705 COSTS_N_INSNS (56), /* SI */ 1706 COSTS_N_INSNS (56), /* DI */ 1707 COSTS_N_INSNS (56)}, /* other */ 1708 COSTS_N_INSNS (1), /* cost of movsx */ 1709 COSTS_N_INSNS (1), /* cost of movzx */ 1710 16, /* "large" insn */ 1711 6, /* MOVE_RATIO */ 1712 1713 /* All move costs are relative to integer->integer move times 2 and thus 1714 they are latency*2. 
*/ 1715 5, /* cost for loading QImode using movzbl */ 1716 {4, 5, 4}, /* cost of loading integer registers 1717 in QImode, HImode and SImode. 1718 Relative to reg-reg move (2). */ 1719 {2, 3, 2}, /* cost of storing integer registers */ 1720 12, /* cost of reg,reg fld/fst */ 1721 {14, 14, 14}, /* cost of loading fp registers 1722 in SFmode, DFmode and XFmode */ 1723 {14, 14, 14}, /* cost of storing fp registers 1724 in SFmode, DFmode and XFmode */ 1725 12, /* cost of moving MMX register */ 1726 {16, 16}, /* cost of loading MMX registers 1727 in SImode and DImode */ 1728 {16, 16}, /* cost of storing MMX registers 1729 in SImode and DImode */ 1730 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 1731 {16, 16, 16, 32, 64}, /* cost of loading SSE registers 1732 in 32,64,128,256 and 512-bit */ 1733 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ 1734 {16, 16, 16, 32, 64}, /* cost of storing SSE registers 1735 in 32,64,128,256 and 512-bit */ 1736 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 1737 20, 12, /* SSE->integer and integer->SSE moves */ 1738 16, 16, /* Gather load static, per_elt. */ 1739 16, 16, /* Gather store static, per_elt. */ 1740 8, /* size of l1 cache. */ 1741 256, /* size of l2 cache. */ 1742 64, /* size of prefetch block */ 1743 6, /* number of parallel prefetches */ 1744 2, /* Branch cost */ 1745 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1746 COSTS_N_INSNS (7), /* cost of FMUL instruction. */ 1747 COSTS_N_INSNS (43), /* cost of FDIV instruction. */ 1748 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1749 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1750 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ 1751 1752 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1753 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1754 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1755 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1756 COSTS_N_INSNS (6), /* cost of FMA SS instruction. 
*/ 1757 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1758 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ 1759 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ 1760 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ 1761 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ 1762 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1763 pentium4_memcpy, 1764 pentium4_memset, 1765 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1766 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1767 NULL, /* Loop alignment. */ 1768 NULL, /* Jump alignment. */ 1769 NULL, /* Label alignment. */ 1770 NULL, /* Func alignment. */ 1771 }; 1772 1773 /* String-operation tables for nocona. The first element of each matches the corresponding pentium4 table; the second is a real strategy list instead of DUMMY_STRINGOP_ALGS. */ static stringop_algs nocona_memcpy[2] = { 1774 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 1775 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, 1776 {100000, unrolled_loop, false}, {-1, libcall, false}}}}; 1777 1778 static stringop_algs nocona_memset[2] = { 1779 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 1780 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1781 {libcall, {{24, loop, false}, {64, unrolled_loop, false}, 1782 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1783 1784 /* Operation costs for nocona (positional initializer; each field is labelled by its trailing comment). */ static const 1785 struct processor_costs nocona_cost = { 1786 COSTS_N_INSNS (1), /* cost of an add instruction */ 1787 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1788 COSTS_N_INSNS (1), /* variable shift costs */ 1789 COSTS_N_INSNS (1), /* constant shift costs */ 1790 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ 1791 COSTS_N_INSNS (10), /* HI */ 1792 COSTS_N_INSNS (10), /* SI */ 1793 COSTS_N_INSNS (10), /* DI */ 1794 COSTS_N_INSNS (10)}, /* other */ 1795 0, /* cost of multiply per each bit set */ 1796 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ 1797 COSTS_N_INSNS (66), /* HI */ 1798 COSTS_N_INSNS (66), /* SI */ 1799 COSTS_N_INSNS (66), /* DI */ 1800 COSTS_N_INSNS (66)}, /* other */ 1801 COSTS_N_INSNS (1), /* cost of movsx */ 1802 
COSTS_N_INSNS (1), /* cost of movzx */ 1803 16, /* "large" insn */ 1804 17, /* MOVE_RATIO */ 1805 1806 /* All move costs are relative to integer->integer move times 2 and thus 1807 they are latency*2. */ 1808 4, /* cost for loading QImode using movzbl */ 1809 {4, 4, 4}, /* cost of loading integer registers 1810 in QImode, HImode and SImode. 1811 Relative to reg-reg move (2). */ 1812 {4, 4, 4}, /* cost of storing integer registers */ 1813 12, /* cost of reg,reg fld/fst */ 1814 {14, 14, 14}, /* cost of loading fp registers 1815 in SFmode, DFmode and XFmode */ 1816 {14, 14, 14}, /* cost of storing fp registers 1817 in SFmode, DFmode and XFmode */ 1818 14, /* cost of moving MMX register */ 1819 {12, 12}, /* cost of loading MMX registers 1820 in SImode and DImode */ 1821 {12, 12}, /* cost of storing MMX registers 1822 in SImode and DImode */ 1823 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 1824 {12, 12, 12, 24, 48}, /* cost of loading SSE registers 1825 in 32,64,128,256 and 512-bit */ 1826 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ 1827 {12, 12, 12, 24, 48}, /* cost of storing SSE registers 1828 in 32,64,128,256 and 512-bit */ 1829 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 1830 20, 12, /* SSE->integer and integer->SSE moves */ 1831 12, 12, /* Gather load static, per_elt. */ 1832 12, 12, /* Gather store static, per_elt. */ 1833 8, /* size of l1 cache. */ 1834 1024, /* size of l2 cache. */ 1835 64, /* size of prefetch block */ 1836 8, /* number of parallel prefetches */ 1837 1, /* Branch cost */ 1838 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1839 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 1840 COSTS_N_INSNS (40), /* cost of FDIV instruction. */ 1841 COSTS_N_INSNS (3), /* cost of FABS instruction. */ 1842 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 1843 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ 1844 1845 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. 
*/ 1846 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 1847 COSTS_N_INSNS (7), /* cost of MULSS instruction. */ 1848 COSTS_N_INSNS (7), /* cost of MULSD instruction. */ 1849 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 1850 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 1851 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ 1852 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ 1853 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ 1854 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ 1855 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1856 nocona_memcpy, 1857 nocona_memset, 1858 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1859 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1860 NULL, /* Loop alignment. */ 1861 NULL, /* Jump alignment. */ 1862 NULL, /* Label alignment. */ 1863 NULL, /* Func alignment. */ 1864 }; 1865 1866 /* String-operation tables for atom; atom_cost installs them in its memcpy and memset slots. */ static stringop_algs atom_memcpy[2] = { 1867 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 1868 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 1869 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1870 static stringop_algs atom_memset[2] = { 1871 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 1872 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1873 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 1874 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1875 /* Operation costs for atom (positional initializer; each field is labelled by its trailing comment). */ static const 1876 struct processor_costs atom_cost = { 1877 COSTS_N_INSNS (1), /* cost of an add instruction */ 1878 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 1879 COSTS_N_INSNS (1), /* variable shift costs */ 1880 COSTS_N_INSNS (1), /* constant shift costs */ 1881 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1882 COSTS_N_INSNS (4), /* HI */ 1883 COSTS_N_INSNS (3), /* SI */ 1884 COSTS_N_INSNS (4), /* DI */ 1885 COSTS_N_INSNS (2)}, /* other */ 1886 0, /* cost of multiply per each bit set */ 1887 {COSTS_N_INSNS (18), /* cost of a 
divide/mod for QI */ 1888 COSTS_N_INSNS (26), /* HI */ 1889 COSTS_N_INSNS (42), /* SI */ 1890 COSTS_N_INSNS (74), /* DI */ 1891 COSTS_N_INSNS (74)}, /* other */ 1892 COSTS_N_INSNS (1), /* cost of movsx */ 1893 COSTS_N_INSNS (1), /* cost of movzx */ 1894 8, /* "large" insn */ 1895 17, /* MOVE_RATIO */ 1896 1897 /* All move costs are relative to integer->integer move times 2 and thus 1898 they are latency*2. */ 1899 6, /* cost for loading QImode using movzbl */ 1900 {6, 6, 6}, /* cost of loading integer registers 1901 in QImode, HImode and SImode. 1902 Relative to reg-reg move (2). */ 1903 {6, 6, 6}, /* cost of storing integer registers */ 1904 4, /* cost of reg,reg fld/fst */ 1905 {6, 6, 18}, /* cost of loading fp registers 1906 in SFmode, DFmode and XFmode */ 1907 {14, 14, 24}, /* cost of storing fp registers 1908 in SFmode, DFmode and XFmode */ 1909 2, /* cost of moving MMX register */ 1910 {8, 8}, /* cost of loading MMX registers 1911 in SImode and DImode */ 1912 {10, 10}, /* cost of storing MMX registers 1913 in SImode and DImode */ 1914 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1915 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 1916 in 32,64,128,256 and 512-bit */ 1917 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 1918 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 1919 in 32,64,128,256 and 512-bit */ 1920 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 1921 8, 6, /* SSE->integer and integer->SSE moves */ 1922 8, 8, /* Gather load static, per_elt. */ 1923 8, 8, /* Gather store static, per_elt. */ 1924 32, /* size of l1 cache. */ 1925 256, /* size of l2 cache. */ 1926 64, /* size of prefetch block */ 1927 6, /* number of parallel prefetches */ 1928 3, /* Branch cost */ 1929 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 1930 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 1931 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 1932 COSTS_N_INSNS (8), /* cost of FABS instruction. 
*/ 1933 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 1934 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 1935 1936 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1937 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 1938 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1939 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 1940 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1941 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1942 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 1943 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 1944 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 1945 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 1946 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 1947 atom_memcpy, 1948 atom_memset, 1949 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1950 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1951 "16", /* Loop alignment. */ 1952 "16:8:8", /* Jump alignment. */ 1953 "0:0:8", /* Label alignment. */ 1954 "16", /* Func alignment. 
*/ 1955 }; 1956 1957 /* String-operation tables for slm; both hold the same entries as atom_memcpy and atom_memset. */ static stringop_algs slm_memcpy[2] = { 1958 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 1959 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 1960 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1961 static stringop_algs slm_memset[2] = { 1962 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 1963 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1964 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 1965 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1966 /* Operation costs for slm (positional initializer; each field is labelled by its trailing comment). */ static const 1967 struct processor_costs slm_cost = { 1968 COSTS_N_INSNS (1), /* cost of an add instruction */ 1969 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 1970 COSTS_N_INSNS (1), /* variable shift costs */ 1971 COSTS_N_INSNS (1), /* constant shift costs */ 1972 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1973 COSTS_N_INSNS (3), /* HI */ 1974 COSTS_N_INSNS (3), /* SI */ 1975 COSTS_N_INSNS (4), /* DI */ 1976 COSTS_N_INSNS (2)}, /* other */ 1977 0, /* cost of multiply per each bit set */ 1978 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 1979 COSTS_N_INSNS (26), /* HI */ 1980 COSTS_N_INSNS (42), /* SI */ 1981 COSTS_N_INSNS (74), /* DI */ 1982 COSTS_N_INSNS (74)}, /* other */ 1983 COSTS_N_INSNS (1), /* cost of movsx */ 1984 COSTS_N_INSNS (1), /* cost of movzx */ 1985 8, /* "large" insn */ 1986 17, /* MOVE_RATIO */ 1987 1988 /* All move costs are relative to integer->integer move times 2 and thus 1989 they are latency*2. */ 1990 8, /* cost for loading QImode using movzbl */ 1991 {8, 8, 8}, /* cost of loading integer registers 1992 in QImode, HImode and SImode. 1993 Relative to reg-reg move (2). 
*/ 1994 {6, 6, 6}, /* cost of storing integer registers */ 1995 2, /* cost of reg,reg fld/fst */ 1996 {8, 8, 18}, /* cost of loading fp registers 1997 in SFmode, DFmode and XFmode */ 1998 {6, 6, 18}, /* cost of storing fp registers 1999 in SFmode, DFmode and XFmode */ 2000 2, /* cost of moving MMX register */ 2001 {8, 8}, /* cost of loading MMX registers 2002 in SImode and DImode */ 2003 {6, 6}, /* cost of storing MMX registers 2004 in SImode and DImode */ 2005 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2006 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2007 in 32,64,128,256 and 512-bit */ 2008 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2009 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2010 in 32,64,128,256 and 512-bit */ 2011 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2012 8, 6, /* SSE->integer and integer->SSE moves */ 2013 8, 8, /* Gather load static, per_elt. */ 2014 8, 8, /* Gather store static, per_elt. */ 2015 32, /* size of l1 cache. */ 2016 256, /* size of l2 cache. */ 2017 64, /* size of prefetch block */ 2018 6, /* number of parallel prefetches */ 2019 3, /* Branch cost */ 2020 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2021 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2022 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2023 COSTS_N_INSNS (8), /* cost of FABS instruction. */ 2024 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2025 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2026 2027 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2028 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2029 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2030 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2031 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2032 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2033 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 2034 COSTS_N_INSNS (69), /* cost of DIVSD instruction. 
*/ 2035 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ 2036 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ 2037 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2038 slm_memcpy, 2039 slm_memset, 2040 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2041 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2042 "16", /* Loop alignment. */ 2043 "16:8:8", /* Jump alignment. */ 2044 "0:0:8", /* Label alignment. */ 2045 "16", /* Func alignment. */ 2046 }; 2047 2048 /* String-operation tables for intel; both hold the same entries as atom_memcpy and atom_memset. */ static stringop_algs intel_memcpy[2] = { 2049 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2050 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2051 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2052 static stringop_algs intel_memset[2] = { 2053 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2054 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2055 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2056 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2057 /* Operation costs for intel (positional initializer; each field is labelled by its trailing comment). */ static const 2058 struct processor_costs intel_cost = { 2059 COSTS_N_INSNS (1), /* cost of an add instruction */ 2060 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2061 COSTS_N_INSNS (1), /* variable shift costs */ 2062 COSTS_N_INSNS (1), /* constant shift costs */ 2063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2064 COSTS_N_INSNS (3), /* HI */ 2065 COSTS_N_INSNS (3), /* SI */ 2066 COSTS_N_INSNS (4), /* DI */ 2067 COSTS_N_INSNS (2)}, /* other */ 2068 0, /* cost of multiply per each bit set */ 2069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2070 COSTS_N_INSNS (26), /* HI */ 2071 COSTS_N_INSNS (42), /* SI */ 2072 COSTS_N_INSNS (74), /* DI */ 2073 COSTS_N_INSNS (74)}, /* other */ 2074 COSTS_N_INSNS (1), /* cost of movsx */ 2075 COSTS_N_INSNS (1), /* cost of movzx */ 2076 8, /* "large" insn */ 2077 17, /* MOVE_RATIO */ 2078 2079 /* All move costs are relative to integer->integer move times 2 and thus 2080 they are latency*2. 
*/ 2081 6, /* cost for loading QImode using movzbl */ 2082 {4, 4, 4}, /* cost of loading integer registers 2083 in QImode, HImode and SImode. 2084 Relative to reg-reg move (2). */ 2085 {6, 6, 6}, /* cost of storing integer registers */ 2086 2, /* cost of reg,reg fld/fst */ 2087 {6, 6, 8}, /* cost of loading fp registers 2088 in SFmode, DFmode and XFmode */ 2089 {6, 6, 10}, /* cost of storing fp registers 2090 in SFmode, DFmode and XFmode */ 2091 2, /* cost of moving MMX register */ 2092 {6, 6}, /* cost of loading MMX registers 2093 in SImode and DImode */ 2094 {6, 6}, /* cost of storing MMX registers 2095 in SImode and DImode */ 2096 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 2097 {6, 6, 6, 6, 6}, /* cost of loading SSE registers 2098 in 32,64,128,256 and 512-bit */ 2099 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2100 {6, 6, 6, 6, 6}, /* cost of storing SSE registers 2101 in 32,64,128,256 and 512-bit */ 2102 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ 2103 4, 4, /* SSE->integer and integer->SSE moves */ 2104 6, 6, /* Gather load static, per_elt. */ 2105 6, 6, /* Gather store static, per_elt. */ 2106 32, /* size of l1 cache. */ 2107 256, /* size of l2 cache. */ 2108 64, /* size of prefetch block */ 2109 6, /* number of parallel prefetches */ 2110 3, /* Branch cost */ 2111 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2112 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2113 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2114 COSTS_N_INSNS (8), /* cost of FABS instruction. */ 2115 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2116 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2117 2118 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2119 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 2120 COSTS_N_INSNS (8), /* cost of MULSS instruction. */ 2121 COSTS_N_INSNS (8), /* cost of MULSD instruction. */ 2122 COSTS_N_INSNS (6), /* cost of FMA SS instruction. 
*/ 2123 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2124 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ 2125 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 2126 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ 2127 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ 2128 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2129 intel_memcpy, 2130 intel_memset, 2131 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2132 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2133 "16", /* Loop alignment. */ 2134 "16:8:8", /* Jump alignment. */ 2135 "0:0:8", /* Label alignment. */ 2136 "16", /* Func alignment. */ 2137 }; 2138 2139 /* Generic should produce code tuned for Core-i7 (and newer chips) 2140 and btver1 (and newer chips). */ 2141 2142 /* String-operation tables for generic tuning; memcpy and memset use the same entries, differing only in rep_prefix_4_byte versus rep_prefix_8_byte between the two elements. */ static stringop_algs generic_memcpy[2] = { 2143 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2144 {-1, libcall, false}}}, 2145 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2146 {-1, libcall, false}}}}; 2147 static stringop_algs generic_memset[2] = { 2148 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2149 {-1, libcall, false}}}, 2150 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2151 {-1, libcall, false}}}}; 2152 /* Operation costs for generic tuning (positional initializer; each field is labelled by its trailing comment). */ static const 2153 struct processor_costs generic_cost = { 2154 COSTS_N_INSNS (1), /* cost of an add instruction */ 2155 /* Setting cost to 2 makes our current implementation of synth_mult result in 2156 use of unnecessary temporary registers causing regression on several 2157 SPECfp benchmarks. 
*/ 2158 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2159 COSTS_N_INSNS (1), /* variable shift costs */ 2160 COSTS_N_INSNS (1), /* constant shift costs */ 2161 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2162 COSTS_N_INSNS (4), /* HI */ 2163 COSTS_N_INSNS (3), /* SI */ 2164 COSTS_N_INSNS (4), /* DI */ 2165 COSTS_N_INSNS (4)}, /* other */ 2166 0, /* cost of multiply per each bit set */ 2167 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ 2168 COSTS_N_INSNS (22), /* HI */ 2169 COSTS_N_INSNS (30), /* SI */ 2170 COSTS_N_INSNS (74), /* DI */ 2171 COSTS_N_INSNS (74)}, /* other */ 2172 COSTS_N_INSNS (1), /* cost of movsx */ 2173 COSTS_N_INSNS (1), /* cost of movzx */ 2174 8, /* "large" insn */ 2175 17, /* MOVE_RATIO */ 2176 2177 /* All move costs are relative to integer->integer move times 2 and thus 2178 they are latency*2. */ 2179 6, /* cost for loading QImode using movzbl */ 2180 {6, 6, 6}, /* cost of loading integer registers 2181 in QImode, HImode and SImode. 2182 Relative to reg-reg move (2). */ 2183 {6, 6, 6}, /* cost of storing integer registers */ 2184 4, /* cost of reg,reg fld/fst */ 2185 {6, 6, 12}, /* cost of loading fp registers 2186 in SFmode, DFmode and XFmode */ 2187 {6, 6, 12}, /* cost of storing fp registers 2188 in SFmode, DFmode and XFmode */ 2189 2, /* cost of moving MMX register */ 2190 {6, 6}, /* cost of loading MMX registers 2191 in SImode and DImode */ 2192 {6, 6}, /* cost of storing MMX registers 2193 in SImode and DImode */ 2194 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 2195 {6, 6, 6, 10, 15}, /* cost of loading SSE registers 2196 in 32,64,128,256 and 512-bit */ 2197 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ 2198 {6, 6, 6, 10, 15}, /* cost of storing SSE registers 2199 in 32,64,128,256 and 512-bit */ 2200 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ 2201 6, 6, /* SSE->integer and integer->SSE moves */ 2202 18, 6, /* Gather load static, per_elt. 
*/ 2203 18, 6, /* Gather store static, per_elt. */ 2204 32, /* size of l1 cache. */ 2205 512, /* size of l2 cache. */ 2206 64, /* size of prefetch block */ 2207 6, /* number of parallel prefetches */ 2208 /* Benchmarks show large regressions on K8 sixtrack benchmark when this 2209 value is increased to perhaps more appropriate value of 5. */ 2210 3, /* Branch cost */ 2211 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2212 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2213 COSTS_N_INSNS (17), /* cost of FDIV instruction. */ 2214 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2215 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2216 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ 2217 2218 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2219 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2220 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2221 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2222 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2223 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2224 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2225 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 2226 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 2227 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 2228 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ 2229 generic_memcpy, 2230 generic_memset, 2231 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 2232 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 2233 "16:11:8", /* Loop alignment. */ 2234 "16:11:8", /* Jump alignment. */ 2235 "0:0:8", /* Label alignment. */ 2236 "16", /* Func alignment. */ 2237 }; 2238 2239 /* core_cost should produce code tuned for Core family of CPUs. 
*/ 2240 /* Every non-libcall entry in the two core tables sets the third field to true, whereas the generic tables set it to false. NOTE(review): presumably this is a "skip alignment" flag - confirm against the stringop_algs declaration. */ static stringop_algs core_memcpy[2] = { 2241 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 2242 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, 2243 {-1, libcall, false}}}}; 2244 static stringop_algs core_memset[2] = { 2245 {libcall, {{6, loop_1_byte, true}, 2246 {24, loop, true}, 2247 {8192, rep_prefix_4_byte, true}, 2248 {-1, libcall, false}}}, 2249 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, 2250 {-1, libcall, false}}}}; 2251 2252 static const 2253 struct processor_costs core_cost = { 2254 COSTS_N_INSNS (1), /* cost of an add instruction */ 2255 /* On all chips taken into consideration lea is 2 cycles and more. With 2256 this cost however our current implementation of synth_mult results in 2257 use of unnecessary temporary registers causing regression on several 2258 SPECfp benchmarks. */ 2259 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2260 COSTS_N_INSNS (1), /* variable shift costs */ 2261 COSTS_N_INSNS (1), /* constant shift costs */ 2262 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2263 COSTS_N_INSNS (4), /* HI */ 2264 COSTS_N_INSNS (3), /* SI */ 2265 /* Here we tune for Sandybridge or newer. */ 2266 COSTS_N_INSNS (3), /* DI */ 2267 COSTS_N_INSNS (3)}, /* other */ 2268 0, /* cost of multiply per each bit set */ 2269 /* Expanding div/mod currently doesn't consider parallelism. So the cost 2270 model is not realistic. We compensate by increasing the latencies a bit. */ 2271 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 2272 COSTS_N_INSNS (11), /* HI */ 2273 COSTS_N_INSNS (14), /* SI */ 2274 COSTS_N_INSNS (81), /* DI */ 2275 COSTS_N_INSNS (81)}, /* other */ 2276 COSTS_N_INSNS (1), /* cost of movsx */ 2277 COSTS_N_INSNS (1), /* cost of movzx */ 2278 8, /* "large" insn */ 2279 17, /* MOVE_RATIO */ 2280 2281 /* All move costs are relative to integer->integer move times 2 and thus 2282 they are latency*2. 
*/ 2283 6, /* cost for loading QImode using movzbl */ 2284 {4, 4, 4}, /* cost of loading integer registers 2285 in QImode, HImode and SImode. 2286 Relative to reg-reg move (2). */ 2287 {6, 6, 6}, /* cost of storing integer registers */ 2288 2, /* cost of reg,reg fld/fst */ 2289 {6, 6, 8}, /* cost of loading fp registers 2290 in SFmode, DFmode and XFmode */ 2291 {6, 6, 10}, /* cost of storing fp registers 2292 in SFmode, DFmode and XFmode */ 2293 2, /* cost of moving MMX register */ 2294 {6, 6}, /* cost of loading MMX registers 2295 in SImode and DImode */ 2296 {6, 6}, /* cost of storing MMX registers 2297 in SImode and DImode */ 2298 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2299 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 2300 in 32,64,128,256 and 512-bit */ 2301 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 2302 {6, 6, 6, 6, 12}, /* cost of storing SSE registers 2303 in 32,64,128,256 and 512-bit */ 2304 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 2305 2, 2, /* SSE->integer and integer->SSE moves */ 2306 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, 2307 rec. throughput 6. 2308 So 5 uops statically and one uops per load. */ 2309 10, 6, /* Gather load static, per_elt. */ 2310 10, 6, /* Gather store static, per_elt. */ 2311 64, /* size of l1 cache. */ 2312 512, /* size of l2 cache. */ 2313 64, /* size of prefetch block */ 2314 6, /* number of parallel prefetches */ 2315 /* FIXME perhaps more appropriate value is 5. */ 2316 3, /* Branch cost */ 2317 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2318 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2319 /* 10-24 */ 2320 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 2321 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2322 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2323 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ 2324 2325 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. 
*/ 2326 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2327 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2328 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2329 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2330 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2331 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 2332 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ 2333 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ 2334 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ 2335 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2336 core_memcpy, 2337 core_memset, 2338 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2339 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2340 "16:11:8", /* Loop alignment. */ 2341 "16:11:8", /* Jump alignment. */ 2342 "0:0:8", /* Label alignment. */ 2343 "16", /* Func alignment. */ 2344 }; 2345 2346