config/i386/x86-tune-costs.h

/* Costs of operations of individual x86 CPUs.
   Copyright (C) 1988-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Processor costs (relative to an add) */

/* Size-tuning cost of an instruction that is N bytes long.
   We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes,
   so under that assumption COSTS_N_BYTES (2) equals COSTS_N_INSNS (1).  */
#define COSTS_N_BYTES(N) ((N) * 2)

/* Placeholder string-operation entry that names libcall in every position.
   NOTE(review): the meaning of the -1 bound comes from stringop_algs,
   declared outside this file; presumably it means "no size limit".  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}

*38fd1498Szrjstatic stringop_algs ix86_size_memcpy[2] = {
*38fd1498Szrj  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
*38fd1498Szrj  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
*38fd1498Szrjstatic stringop_algs ix86_size_memset[2] = {
*38fd1498Szrj  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
*38fd1498Szrj  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
*38fd1498Szrj
*38fd1498Szrjconst
*38fd1498Szrjstruct processor_costs ix86_size_cost = {/* costs for tuning for size */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_BYTES (3),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_BYTES (3),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_BYTES (3),			/*				 HI */
*38fd1498Szrj   COSTS_N_BYTES (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_BYTES (3),			/*				 DI */
*38fd1498Szrj   COSTS_N_BYTES (5)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_BYTES (3),			/*			    HI */
*38fd1498Szrj   COSTS_N_BYTES (3),			/*			    SI */
*38fd1498Szrj   COSTS_N_BYTES (3),			/*			    DI */
*38fd1498Szrj   COSTS_N_BYTES (5)},			/*			    other */
*38fd1498Szrj  COSTS_N_BYTES (3),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_BYTES (3),			/* cost of movzx */
*38fd1498Szrj  0,					/* "large" insn */
*38fd1498Szrj  2,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2. */
*38fd1498Szrj  2,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {2, 2, 2},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 2, 2},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {2, 2, 2},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {2, 2, 2},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  3,					/* cost of moving MMX register */
*38fd1498Szrj  {3, 3},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {3, 3},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {3, 3, 3, 3, 3},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
*38fd1498Szrj					   in 128bit, 256bit and 512bit */
*38fd1498Szrj  {3, 3, 3, 3, 3},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {3, 3, 3, 3, 3},				/* cost of unaligned SSE store
*38fd1498Szrj					   in 128bit, 256bit and 512bit */
*38fd1498Szrj  3, 3,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  5, 0,					/* Gather load static, per_elt.  */
*38fd1498Szrj  5, 0,					/* Gather store static, per_elt.  */
*38fd1498Szrj  0,					/* size of l1 cache  */
*38fd1498Szrj  0,					/* size of l2 cache  */
*38fd1498Szrj  0,					/* size of prefetch block */
*38fd1498Szrj  0,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  ix86_size_memcpy,
*38fd1498Szrj  ix86_size_memset,
*38fd1498Szrj  COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* Processor costs (relative to an add) */
*38fd1498Szrjstatic stringop_algs i386_memcpy[2] = {
*38fd1498Szrj  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrjstatic stringop_algs i386_memset[2] = {
*38fd1498Szrj  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrj
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs i386_cost = {	/* 386 specific costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (6),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (6),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (6),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (6)},			/*			      other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (23),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (23),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (23),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (23)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of movzx */
*38fd1498Szrj  15,					/* "large" insn */
*38fd1498Szrj  3,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  4,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {2, 4, 2},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 4, 2},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {8, 8, 8},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {8, 8, 8},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {4, 8},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {4, 8},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
*38fd1498Szrj  3, 3,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  0,					/* size of l1 cache  */
*38fd1498Szrj  0,					/* size of l2 cache  */
*38fd1498Szrj  0,					/* size of prefetch block */
*38fd1498Szrj  0,					/* number of parallel prefetches */
*38fd1498Szrj  1,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  i386_memcpy,
*38fd1498Szrj  i386_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs i486_memcpy[2] = {
*38fd1498Szrj  {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrjstatic stringop_algs i486_memset[2] = {
*38fd1498Szrj  {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrj
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs i486_cost = {	/* 486 specific costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (12),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (12),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (12),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (12)},			/*			      other */
*38fd1498Szrj  1,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (40),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (40),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (40),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (40)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of movzx */
*38fd1498Szrj  15,					/* "large" insn */
*38fd1498Szrj  3,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  4,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {2, 4, 2},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 4, 2},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {8, 8, 8},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {8, 8, 8},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {4, 8},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {4, 8},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
*38fd1498Szrj  3, 3,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  4,					/* size of l1 cache.  486 has 8kB cache
*38fd1498Szrj					   shared for code and data, so 4kB is
*38fd1498Szrj					   not really precise.  */
*38fd1498Szrj  4,					/* size of l2 cache  */
*38fd1498Szrj  0,					/* size of prefetch block */
*38fd1498Szrj  0,					/* number of parallel prefetches */
*38fd1498Szrj  1,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  i486_memcpy,
*38fd1498Szrj  i486_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs pentium_memcpy[2] = {
*38fd1498Szrj  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrjstatic stringop_algs pentium_memset[2] = {
*38fd1498Szrj  {libcall, {{-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrj
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs pentium_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (11)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (25),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (25),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (25),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (25)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  6,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  6,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {2, 4, 2},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 4, 2},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {2, 2, 6},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {4, 4, 6},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  8,					/* cost of moving MMX register */
*38fd1498Szrj  {8, 8},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {8, 8},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
*38fd1498Szrj  3, 3,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  8,					/* size of l1 cache.  */
*38fd1498Szrj  8,					/* size of l2 cache  */
*38fd1498Szrj  0,					/* size of prefetch block */
*38fd1498Szrj  0,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  pentium_memcpy,
*38fd1498Szrj  pentium_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs lakemont_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (11)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (25),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (25),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (25),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (25)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  6,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {2, 4, 2},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 4, 2},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {2, 2, 6},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {4, 4, 6},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  8,					/* cost of moving MMX register */
*38fd1498Szrj  {8, 8},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {8, 8},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
*38fd1498Szrj  3, 3,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  8,					/* size of l1 cache.  */
*38fd1498Szrj  8,					/* size of l2 cache  */
*38fd1498Szrj  0,					/* size of prefetch block */
*38fd1498Szrj  0,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (10),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (10),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  pentium_memcpy,
*38fd1498Szrj  pentium_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
*38fd1498Szrj   (we ensure the alignment).  For small blocks inline loop is still a
*38fd1498Szrj   noticeable win, for bigger blocks either rep movsl or rep movsb is
*38fd1498Szrj   way to go.  Rep movsb has apparently more expensive startup time in CPU,
*38fd1498Szrj   but after 4K the difference is down in the noise.  */
*38fd1498Szrjstatic stringop_algs pentiumpro_memcpy[2] = {
*38fd1498Szrj  {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
*38fd1498Szrj                       {8192, rep_prefix_4_byte, false},
*38fd1498Szrj                       {-1, rep_prefix_1_byte, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrjstatic stringop_algs pentiumpro_memset[2] = {
*38fd1498Szrj  {rep_prefix_4_byte, {{1024, unrolled_loop, false},
*38fd1498Szrj                       {8192, rep_prefix_4_byte, false},
*38fd1498Szrj                       {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs pentiumpro_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (4)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (17),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (17),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (17),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (17)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  6,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  2,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {4, 4, 4},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 2, 2},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {2, 2, 6},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {4, 4, 6},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {2, 2},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {2, 2},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
*38fd1498Szrj  3, 3,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  8,					/* size of l1 cache.  */
*38fd1498Szrj  256,					/* size of l2 cache  */
*38fd1498Szrj  32,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  pentiumpro_memcpy,
*38fd1498Szrj  pentiumpro_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs geode_memcpy[2] = {
*38fd1498Szrj  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrjstatic stringop_algs geode_memset[2] = {
*38fd1498Szrj  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
/* Cost table for Geode tuning.  The entries are positional initializers of
   struct processor_costs, so their order must not change; the trailing
   comment on each entry names the field it fills.  COSTS_N_INSNS values are
   expressed relative to an add instruction.  */
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs geode_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (7),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (7),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (7)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (23),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (39),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (39),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (39)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  4,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  2,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {2, 2, 2},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 2, 2},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {2, 2, 2},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {4, 6, 6},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {2, 2},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {2, 2},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
*38fd1498Szrj  6, 6,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  2, 2,					/* Gather load static, per_elt.  */
*38fd1498Szrj  2, 2,					/* Gather store static, per_elt.  */
*38fd1498Szrj  64,					/* size of l1 cache.  */
*38fd1498Szrj  128,					/* size of l2 cache.  */
*38fd1498Szrj  32,					/* size of prefetch block */
*38fd1498Szrj  1,					/* number of parallel prefetches */
*38fd1498Szrj  1,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (17),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (17),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  geode_memcpy,				/* memcpy expansion strategies.  */
*38fd1498Szrj  geode_memset,				/* memset expansion strategies.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
/* Block-copy (memcpy) expansion strategies referenced by k6_cost.
   The first entry lists {256, rep_prefix_4_byte} followed by {-1, libcall};
   the second entry is the libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably entries [0]/[1] are the 32-bit/64-bit variants --
   confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs k6_memcpy[2] = {
*38fd1498Szrj  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
/* Block-fill (memset) expansion strategies referenced by k6_cost.
   Same shape as the K6 memcpy table: {256, rep_prefix_4_byte} then
   {-1, libcall} in the first entry, DUMMY_STRINGOP_ALGS in the second.
   NOTE(review): presumably entries [0]/[1] are the 32-bit/64-bit variants --
   confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs k6_memset[2] = {
*38fd1498Szrj  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
/* Cost table for K6 tuning.  The entries are positional initializers of
   struct processor_costs, so their order must not change; the trailing
   comment on each entry names the field it fills.  COSTS_N_INSNS values are
   expressed relative to an add instruction.  */
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs k6_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (3)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (18),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (18),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (18),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (18)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  4,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  3,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {4, 5, 4},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 3, 2},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {6, 6, 6},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {4, 4, 4},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {2, 2},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {2, 2},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
*38fd1498Szrj  6, 6,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  2, 2,					/* Gather load static, per_elt.  */
*38fd1498Szrj  2, 2,					/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  32,					/* size of l2 cache.  Some models
*38fd1498Szrj					   have integrated l2 cache, but
*38fd1498Szrj					   optimizing for k6 is not important
*38fd1498Szrj					   enough to worry about that.  */
*38fd1498Szrj  32,					/* size of prefetch block */
*38fd1498Szrj  1,					/* number of parallel prefetches */
*38fd1498Szrj  1,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  k6_memcpy,				/* memcpy expansion strategies.  */
*38fd1498Szrj  k6_memset,				/* memset expansion strategies.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* For some reason, Athlon deals better with REP prefix (relative to loops)
*38fd1498Szrj   compared to K8. Alignment becomes important after 8 bytes for memcpy and
*38fd1498Szrj   128 bytes for memset.  */
/* Block-copy (memcpy) expansion strategies referenced by athlon_cost.
   The first entry lists {2048, rep_prefix_4_byte} followed by {-1, libcall};
   the second entry is the libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably entries [0]/[1] are the 32-bit/64-bit variants --
   confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs athlon_memcpy[2] = {
*38fd1498Szrj  {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
/* Block-fill (memset) expansion strategies referenced by athlon_cost.
   Same shape as the Athlon memcpy table: {2048, rep_prefix_4_byte} then
   {-1, libcall} in the first entry, DUMMY_STRINGOP_ALGS in the second.
   NOTE(review): presumably entries [0]/[1] are the 32-bit/64-bit variants --
   confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs athlon_memset[2] = {
*38fd1498Szrj  {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
/* Cost table for Athlon tuning.  The entries are positional initializers of
   struct processor_costs, so their order must not change; the trailing
   comment on each entry names the field it fills.  COSTS_N_INSNS values are
   expressed relative to an add instruction.  */
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs athlon_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (5),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (5),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (5),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (5)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (26),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (42),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (74),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (74)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  4,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {3, 4, 3},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {3, 4, 3},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {4, 4, 12},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 8},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {4, 4},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {4, 4},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 4, 6, 12, 24},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 4, 6, 12, 24},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
*38fd1498Szrj  5, 5,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  64,					/* size of l1 cache.  */
*38fd1498Szrj  256,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  5,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  /* 11-16 (presumably the latency range in cycles -- TODO confirm).  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  athlon_memcpy,			/* memcpy expansion strategies.  */
*38fd1498Szrj  athlon_memset,			/* memset expansion strategies.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* K8 has optimized REP instruction for medium sized blocks, but for very
*38fd1498Szrj   small blocks it is better to use a loop.  For large blocks, libcall can
*38fd1498Szrj   do non-temporal accesses and beat inline considerably.  */
/* Block-copy (memcpy) expansion strategies referenced by k8_cost.
   Entry [0] steps through loop (6), unrolled_loop (14) and then
   rep_prefix_4_byte (-1); entry [1] steps through loop (16),
   rep_prefix_8_byte (8192) and then libcall (-1).
   NOTE(review): presumably the numbers are maximum block sizes with -1
   meaning "no upper bound", and entries [0]/[1] are the 32-bit/64-bit
   variants -- confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs k8_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
/* Block-fill (memset) expansion strategies referenced by k8_cost.
   Entry [0] steps through loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and then libcall (-1); entry [1] steps through unrolled_loop (48),
   rep_prefix_8_byte (8192) and then libcall (-1).
   NOTE(review): presumably the numbers are maximum block sizes with -1
   meaning "no upper bound", and entries [0]/[1] are the 32-bit/64-bit
   variants -- confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs k8_memset[2] = {
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for K8 tuning.  The entries are positional initializers of
   struct processor_costs, so their order must not change; the trailing
   comment on each entry names the field it fills.  COSTS_N_INSNS values are
   expressed relative to an add instruction.  */
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs k8_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (5)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (26),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (42),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (74),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (74)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  4,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {3, 4, 3},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {3, 4, 3},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {4, 4, 12},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 8},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {3, 3},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {4, 4},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 3, 6, 12, 24},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 3, 6, 12, 24},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
*38fd1498Szrj  5, 5,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  64,					/* size of l1 cache.  */
*38fd1498Szrj  512,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  /* New AMD processors never drop prefetches; if they cannot be performed
*38fd1498Szrj     immediately, they are queued.  We set the number of simultaneous
*38fd1498Szrj     prefetches to a large constant to reflect this (leaving the number of
*38fd1498Szrj     prefetches completely unlimited is probably not a good idea, as their
*38fd1498Szrj     execution also takes some time).  */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  3,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  /* 11-16 (presumably the latency range in cycles -- TODO confirm).  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  k8_memcpy,				/* memcpy expansion strategies.  */
*38fd1498Szrj  k8_memset,				/* memset expansion strategies.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
*38fd1498Szrj   very small blocks it is better to use a loop.  For large blocks, libcall
*38fd1498Szrj   can do non-temporal accesses and beat inline considerably.  */
/* Block-copy (memcpy) expansion strategies referenced by amdfam10_cost.
   Entry [0] steps through loop (6), unrolled_loop (14) and then
   rep_prefix_4_byte (-1); entry [1] steps through loop (16),
   rep_prefix_8_byte (8192) and then libcall (-1).
   NOTE(review): presumably the numbers are maximum block sizes with -1
   meaning "no upper bound", and entries [0]/[1] are the 32-bit/64-bit
   variants -- confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs amdfam10_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
/* Block-fill (memset) expansion strategies referenced by amdfam10_cost.
   Entry [0] steps through loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and then libcall (-1); entry [1] steps through unrolled_loop (48),
   rep_prefix_8_byte (8192) and then libcall (-1).
   NOTE(review): presumably the numbers are maximum block sizes with -1
   meaning "no upper bound", and entries [0]/[1] are the 32-bit/64-bit
   variants -- confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs amdfam10_memset[2] = {
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
/* Cost table for AMDFAM10 tuning.  The entries are positional initializers
   of struct processor_costs, so their order must not change; the trailing
   comment on each entry names the field it fills.  COSTS_N_INSNS values are
   expressed relative to an add instruction.

   Declared static const like geode_cost, k6_cost, athlon_cost and k8_cost:
   the table is defined in a header, so it should be read-only and private
   to the including translation unit rather than a mutable global with
   external linkage.  */
*38fd1498Szrjstatic const struct processor_costs amdfam10_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (5)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (35),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (51),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (83),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (83)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  4,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {3, 4, 3},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {3, 4, 3},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {4, 4, 12},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 8},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {3, 3},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {4, 4},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {4, 4, 3, 6, 12},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
*38fd1498Szrj  {4, 4, 5, 10, 20},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
*38fd1498Szrj  3, 3,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj					/* On K8:
*38fd1498Szrj					    MOVD reg64, xmmreg Double FSTORE 4
*38fd1498Szrj					    MOVD reg32, xmmreg Double FSTORE 4
*38fd1498Szrj					   On AMDFAM10:
*38fd1498Szrj					    MOVD reg64, xmmreg Double FADD 3
*38fd1498Szrj							       1/1  1/1
*38fd1498Szrj					    MOVD reg32, xmmreg Double FADD 3
*38fd1498Szrj							       1/1  1/1 */
*38fd1498Szrj  4, 4,					/* Gather load static, per_elt.  */
*38fd1498Szrj  4, 4,					/* Gather store static, per_elt.  */
*38fd1498Szrj  64,					/* size of l1 cache.  */
*38fd1498Szrj  512,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  /* New AMD processors never drop prefetches; if they cannot be performed
*38fd1498Szrj     immediately, they are queued.  We set the number of simultaneous
*38fd1498Szrj     prefetches to a large constant to reflect this (leaving the number of
*38fd1498Szrj     prefetches completely unlimited is probably not a good idea, as their
*38fd1498Szrj     execution also takes some time).  */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  /* 11-16 (presumably the latency range in cycles -- TODO confirm).  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  amdfam10_memcpy,			/* memcpy expansion strategies.  */
*38fd1498Szrj  amdfam10_memset,			/* memset expansion strategies.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/*  BDVER1 has optimized REP instruction for medium sized blocks, but for
*38fd1498Szrj    very small blocks it is better to use a loop.  For large blocks, libcall
*38fd1498Szrj    can do non-temporal accesses and beat inline considerably.  */
/* Block-copy (memcpy) expansion strategies referenced by bdver1_cost.
   Entry [0] steps through loop (6), unrolled_loop (14) and then
   rep_prefix_4_byte (-1); entry [1] steps through loop (16),
   rep_prefix_8_byte (8192) and then libcall (-1).
   NOTE(review): presumably the numbers are maximum block sizes with -1
   meaning "no upper bound", and entries [0]/[1] are the 32-bit/64-bit
   variants -- confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs bdver1_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
/* Block-fill (memset) expansion strategies referenced by bdver1_cost.
   Entry [0] steps through loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and then libcall (-1); entry [1] steps through unrolled_loop (48),
   rep_prefix_8_byte (8192) and then libcall (-1).
   NOTE(review): presumably the numbers are maximum block sizes with -1
   meaning "no upper bound", and entries [0]/[1] are the 32-bit/64-bit
   variants -- confirm against the stringop_algs declaration.  */
*38fd1498Szrjstatic stringop_algs bdver1_memset[2] = {
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrj
/* Cost table for BDVER1 tuning.  The entries are positional initializers of
   struct processor_costs, so their order must not change; the trailing
   comment on each entry names the field it fills.  COSTS_N_INSNS values are
   expressed relative to an add instruction.

   Declared static const like geode_cost, k6_cost, athlon_cost and k8_cost,
   so the table stays private to the including translation unit regardless
   of how the header is compiled.  */
*38fd1498Szrjstatic const struct processor_costs bdver1_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (6),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (6)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (35),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (51),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (83),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (83)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  8,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {8, 8, 8},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {8, 8, 8},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {12, 12, 28},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {10, 10, 18},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  4,					/* cost of moving MMX register */
*38fd1498Szrj  {12, 12},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {10, 10},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
*38fd1498Szrj  16, 20,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  12, 12,				/* Gather load static, per_elt.  */
*38fd1498Szrj  10, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  16,					/* size of l1 cache.  */
*38fd1498Szrj  2048,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  /* New AMD processors never drop prefetches; if they cannot be performed
*38fd1498Szrj     immediately, they are queued.  We set the number of simultaneous
*38fd1498Szrj     prefetches to a large constant to reflect this (leaving the number of
*38fd1498Szrj     prefetches completely unlimited is probably not a good idea, as their
*38fd1498Szrj     execution also takes some time).  */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  /* 9-24 (presumably the latency range in cycles -- TODO confirm).  */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  /* 9-27 (presumably the latency range in cycles -- TODO confirm).  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  bdver1_memcpy,			/* memcpy expansion strategies.  */
*38fd1498Szrj  bdver1_memset,			/* memset expansion strategies.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/*  BDVER2 has an optimized REP instruction for medium sized blocks, but for
*38fd1498Szrj    very small blocks it is better to use a loop.  For large blocks, a libcall
*38fd1498Szrj    can do non-temporal accesses and beat inline code considerably.
*38fd1498Szrj
*38fd1498Szrj    Each row is {algorithm, {{limit, algorithm, flag}, ...}}.  NOTE(review):
*38fd1498Szrj    presumably the leading algorithm covers blocks of unknown size, each
*38fd1498Szrj    triple covers blocks of up to LIMIT bytes (-1 meaning no limit), and
*38fd1498Szrj    rows 0 and 1 are the 32-bit and 64-bit variants; confirm against the
*38fd1498Szrj    declaration of stringop_algs.  */
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs bdver2_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs bdver2_memset[2] = {	/* memset counterpart of
*38fd1498Szrj						   bdver2_memcpy; row 0 ends
*38fd1498Szrj						   in a libcall past 2048.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrj
*38fd1498Szrjconst struct processor_costs bdver2_cost = {	/* Costs used when tuning for
*38fd1498Szrj						   BDVER2.  Entries are
*38fd1498Szrj						   positional, so their order
*38fd1498Szrj						   must not change.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (6),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (6)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (35),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (51),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (83),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (83)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2.  These are plain numbers, not COSTS_N_INSNS.  */
*38fd1498Szrj  8,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {8, 8, 8},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {8, 8, 8},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {12, 12, 28},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {10, 10, 18},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  4,					/* cost of moving MMX register */
*38fd1498Szrj  {12, 12},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {10, 10},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
*38fd1498Szrj  16, 20,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  12, 12,				/* Gather load static, per_elt.  */
*38fd1498Szrj  10, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  16,					/* size of l1 cache.  */
*38fd1498Szrj  2048,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  /* New AMD processors never drop prefetches; if they cannot be performed
*38fd1498Szrj     immediately, they are queued.  We set the number of simultaneous
*38fd1498Szrj     prefetches to a large constant to reflect this (it is probably not a
*38fd1498Szrj     good idea to leave the number of prefetches unlimited, as their
*38fd1498Szrj     execution also takes some time).  */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  /* 9-24; presumably the latency range, of which the upper bound is used.  */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  /* 9-27; presumably the latency range, of which the upper bound is used.  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  bdver2_memcpy,
*38fd1498Szrj  bdver2_memset,
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj
*38fd1498Szrj/*  BDVER3 has an optimized REP instruction for medium sized blocks, but for
*38fd1498Szrj    very small blocks it is better to use a loop.  For large blocks, a libcall
*38fd1498Szrj    can do non-temporal accesses and beat inline code considerably.
*38fd1498Szrj
*38fd1498Szrj    Each row is {algorithm, {{limit, algorithm, flag}, ...}}.  NOTE(review):
*38fd1498Szrj    presumably the leading algorithm covers blocks of unknown size, each
*38fd1498Szrj    triple covers blocks of up to LIMIT bytes (-1 meaning no limit), and
*38fd1498Szrj    rows 0 and 1 are the 32-bit and 64-bit variants; confirm against the
*38fd1498Szrj    declaration of stringop_algs.  */
*38fd1498Szrjstatic stringop_algs bdver3_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs bdver3_memset[2] = {	/* memset counterpart of
*38fd1498Szrj						   bdver3_memcpy; row 0 ends
*38fd1498Szrj						   in a libcall past 2048.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjstruct processor_costs bdver3_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (6),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (6)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (35),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (51),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (83),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (83)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  8,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {8, 8, 8},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {8, 8, 8},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {12, 12, 28},				/* cost of loading fp registers
*38fd1498Szrj		   			   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {10, 10, 18},				/* cost of storing fp registers
*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode */
*38fd1498Szrj  4,					/* cost of moving MMX register */
*38fd1498Szrj  {12, 12},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {10, 10},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
*38fd1498Szrj  16, 20,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  12, 12,				/* Gather load static, per_elt.  */
*38fd1498Szrj  10, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  16,					/* size of l1 cache.  */
*38fd1498Szrj  2048,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  /* New AMD processors never drop prefetches; if they cannot be performed
*38fd1498Szrj     immediately, they are queued.  We set number of simultaneous prefetches
*38fd1498Szrj     to a large constant to reflect this (it probably is not a good idea not
*38fd1498Szrj     to limit number of prefetches at all, as their execution also takes some
*38fd1498Szrj     time).  */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  /* 9-24  */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  /* 9-27  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  bdver3_memcpy,
*38fd1498Szrj  bdver3_memset,
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/*  BDVER4 has an optimized REP instruction for medium sized blocks, but for
*38fd1498Szrj    very small blocks it is better to use a loop.  For large blocks, a libcall
*38fd1498Szrj    can do non-temporal accesses and beat inline code considerably.
*38fd1498Szrj
*38fd1498Szrj    Each row is {algorithm, {{limit, algorithm, flag}, ...}}.  NOTE(review):
*38fd1498Szrj    presumably the leading algorithm covers blocks of unknown size, each
*38fd1498Szrj    triple covers blocks of up to LIMIT bytes (-1 meaning no limit), and
*38fd1498Szrj    rows 0 and 1 are the 32-bit and 64-bit variants; confirm against the
*38fd1498Szrj    declaration of stringop_algs.  */
*38fd1498Szrjstatic stringop_algs bdver4_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs bdver4_memset[2] = {	/* memset counterpart of
*38fd1498Szrj						   bdver4_memcpy; row 0 ends
*38fd1498Szrj						   in a libcall past 2048.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjstruct processor_costs bdver4_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (6),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (6)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (35),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (51),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (83),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (83)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  8,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {8, 8, 8},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {8, 8, 8},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {12, 12, 28},				/* cost of loading fp registers
*38fd1498Szrj		   			   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {10, 10, 18},				/* cost of storing fp registers
*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode */
*38fd1498Szrj  4,					/* cost of moving MMX register */
*38fd1498Szrj  {12, 12},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {10, 10},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
*38fd1498Szrj  16, 20,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  12, 12,				/* Gather load static, per_elt.  */
*38fd1498Szrj  10, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  16,					/* size of l1 cache.  */
*38fd1498Szrj  2048,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  /* New AMD processors never drop prefetches; if they cannot be performed
*38fd1498Szrj     immediately, they are queued.  We set number of simultaneous prefetches
*38fd1498Szrj     to a large constant to reflect this (it probably is not a good idea not
*38fd1498Szrj     to limit number of prefetches at all, as their execution also takes some
*38fd1498Szrj     time).  */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  /* 9-24  */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  /* 9-27  */
*38fd1498Szrj  COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  bdver4_memcpy,
*38fd1498Szrj  bdver4_memset,
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj
*38fd1498Szrj/*  ZNVER1 has an optimized REP instruction for medium sized blocks, but for
*38fd1498Szrj    very small blocks it is better to use a loop.  For large blocks, a libcall
*38fd1498Szrj    can do non-temporal accesses and beat inline code considerably.
*38fd1498Szrj
*38fd1498Szrj    Each row is {algorithm, {{limit, algorithm, flag}, ...}}.  NOTE(review):
*38fd1498Szrj    presumably the leading algorithm covers blocks of unknown size, each
*38fd1498Szrj    triple covers blocks of up to LIMIT bytes (-1 meaning no limit), and
*38fd1498Szrj    rows 0 and 1 are the 32-bit and 64-bit variants; confirm against the
*38fd1498Szrj    declaration of stringop_algs.  */
*38fd1498Szrjstatic stringop_algs znver1_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj	     {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj	     {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs znver1_memset[2] = {	/* memset counterpart of
*38fd1498Szrj						   znver1_memcpy; row 0 ends
*38fd1498Szrj						   in a libcall past 2048.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj	     {-1, libcall, false}}}};
*38fd1498Szrjstruct processor_costs znver1_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs.  */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 HI.  */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI.  */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 DI.  */
*38fd1498Szrj   COSTS_N_INSNS (3)},			/*			      other.  */
*38fd1498Szrj  0,					/* cost of multiply per each bit
*38fd1498Szrj					    set.  */
*38fd1498Szrj   /* Depending on parameters, idiv can get faster on ryzen.  This is upper
*38fd1498Szrj      bound.  */
*38fd1498Szrj  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
*38fd1498Szrj   COSTS_N_INSNS (22),			/*			    HI.  */
*38fd1498Szrj   COSTS_N_INSNS (30),			/*			    SI.  */
*38fd1498Szrj   COSTS_N_INSNS (45),			/*			    DI.  */
*38fd1498Szrj   COSTS_N_INSNS (45)},			/*			    other.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx.  */
*38fd1498Szrj  8,					/* "large" insn.  */
*38fd1498Szrj  9,					/* MOVE_RATIO.  */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj
*38fd1498Szrj  /* reg-reg moves are done by renaming and thus they are even cheaper than
*38fd1498Szrj     1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
*38fd1498Szrj     to doubles of latencies, we do not model this correctly.  It does not
*38fd1498Szrj     seem to make practical difference to bump prices up even more.  */
*38fd1498Szrj  6,					/* cost for loading QImode using
*38fd1498Szrj					   movzbl.  */
*38fd1498Szrj  {6, 6, 6},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {8, 8, 8},				/* cost of storing integer
*38fd1498Szrj					   registers.  */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst.  */
*38fd1498Szrj  {6, 6, 16},				/* cost of loading fp registers
*38fd1498Szrj		   			   in SFmode, DFmode and XFmode.  */
*38fd1498Szrj  {8, 8, 16},				/* cost of storing fp registers
*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode.  */
*38fd1498Szrj  2,					/* cost of moving MMX register.  */
*38fd1498Szrj  {6, 6},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode.  */
*38fd1498Szrj  {8, 8},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode.  */
*38fd1498Szrj  2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
*38fd1498Szrj  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit.  */
*38fd1498Szrj  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
*38fd1498Szrj  {8, 8, 8, 8, 16},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit.  */
*38fd1498Szrj  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
*38fd1498Szrj  6, 6,					/* SSE->integer and integer->SSE moves.  */
*38fd1498Szrj  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
*38fd1498Szrj     throughput 12.  Approx 9 uops do not depend on vector size and every load
*38fd1498Szrj     is 7 uops.  */
*38fd1498Szrj  18, 8,				/* Gather load static, per_elt.  */
*38fd1498Szrj  18, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  512,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block.  */
*38fd1498Szrj  /* New AMD processors never drop prefetches; if they cannot be performed
*38fd1498Szrj     immediately, they are queued.  We set number of simultaneous prefetches
*38fd1498Szrj     to a large constant to reflect this (it probably is not a good idea not
*38fd1498Szrj     to limit number of prefetches at all, as their execution also takes some
*38fd1498Szrj     time).  */
*38fd1498Szrj  100,					/* number of parallel prefetches.  */
*38fd1498Szrj  3,					/* Branch cost.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
*38fd1498Szrj  /* Latency of fdiv is 8-15.  */
*38fd1498Szrj  COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
*38fd1498Szrj  /* Latency of fsqrt is 4-10.  */
*38fd1498Szrj  COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  /* 9-13  */
*38fd1498Szrj  COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
*38fd1498Szrj     and it can execute 2 integer additions and 2 multiplications thus
*38fd1498Szrj     reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
*38fd1498Szrj     that 4 works better than 6 probably due to register pressure.
*38fd1498Szrj
*38fd1498Szrj     Integer vector operations are taken by FP unit and execute 3 vector
*38fd1498Szrj     plus/minus operations per cycle but only one multiply.  This is adjusted
*38fd1498Szrj     in ix86_reassociation_width.  */
*38fd1498Szrj  4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  znver1_memcpy,
*38fd1498Szrj  znver1_memset,
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* skylake_cost should produce code tuned for the Skylake family of CPUs.
*38fd1498Szrj
*38fd1498Szrj   The memcpy table below is referenced from skylake_cost.  Each row is
*38fd1498Szrj   {algorithm, {{limit, algorithm, flag}, ...}}.  NOTE(review): presumably the
*38fd1498Szrj   leading algorithm covers blocks of unknown size, each triple covers blocks
*38fd1498Szrj   of up to LIMIT bytes (-1 meaning no limit), and rows 0 and 1 are the
*38fd1498Szrj   32-bit and 64-bit variants; confirm against the declaration of
*38fd1498Szrj   stringop_algs.  */
*38fd1498Szrjstatic stringop_algs skylake_memcpy[2] =   {
*38fd1498Szrj  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs skylake_memset[2] = {	/* memset counterpart of
*38fd1498Szrj						   skylake_memcpy; both rows
*38fd1498Szrj						   end in a libcall.  */
*38fd1498Szrj  {libcall, {{6, loop_1_byte, true},
*38fd1498Szrj             {24, loop, true},
*38fd1498Szrj             {8192, rep_prefix_4_byte, true},
*38fd1498Szrj             {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrj
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs skylake_cost = {	/* Costs used when tuning for
*38fd1498Szrj						   Skylake.  Entries are
*38fd1498Szrj						   positional, so their order
*38fd1498Szrj						   must not change.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1)+1,		/* cost of a lea instruction; one unit
*38fd1498Szrj					   above the cost of an add.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (3)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  /* Expanding div/mod currently doesn't consider parallelism.  So the cost
*38fd1498Szrj     model is not realistic.  We compensate by increasing the latencies a
*38fd1498Szrj     bit.  */
*38fd1498Szrj  {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (14),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (76),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (76)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (0),			/* cost of movzx (modelled as free) */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  6,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {4, 4, 4},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {6, 6, 3},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {6, 6, 8},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 10},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {6, 6},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {6, 6},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
*38fd1498Szrj  {8, 8, 8, 12, 24},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
*38fd1498Szrj  2, 2,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  20, 8,				/* Gather load static, per_elt.  */
*38fd1498Szrj  22, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  64,					/* size of l1 cache.  */
*38fd1498Szrj  512,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  3,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  skylake_memcpy,
*38fd1498Szrj  skylake_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj/* BTVER1 has an optimized REP instruction for medium sized blocks, but for
*38fd1498Szrj   very small blocks it is better to use a loop.  For large blocks, a libcall
*38fd1498Szrj   can do non-temporal accesses and beat inline code considerably.
*38fd1498Szrj
*38fd1498Szrj   Each row is {algorithm, {{limit, algorithm, flag}, ...}}.  NOTE(review):
*38fd1498Szrj   presumably the leading algorithm covers blocks of unknown size, each
*38fd1498Szrj   triple covers blocks of up to LIMIT bytes (-1 meaning no limit), and
*38fd1498Szrj   rows 0 and 1 are the 32-bit and 64-bit variants; confirm against the
*38fd1498Szrj   declaration of stringop_algs.  */
*38fd1498Szrjstatic stringop_algs btver1_memcpy[2] = {
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs btver1_memset[2] = {	/* memset counterpart of
*38fd1498Szrj						   btver1_memcpy; row 0 ends
*38fd1498Szrj						   in a libcall past 2048.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjconst struct processor_costs btver1_cost = {	/* BTVER1 cost table.
*38fd1498Szrj     The initializers are positional, so their order has to track the
*38fd1498Szrj     field order of struct processor_costs; the trailing comment on each
*38fd1498Szrj     line describes the entry.  String-operation strategies come from
*38fd1498Szrj     btver1_memcpy and btver1_memset.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (5)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (35),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (51),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (83),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (83)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  8,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {6, 8, 6},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {6, 8, 6},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {12, 12, 28},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {12, 12, 38},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  4,					/* cost of moving MMX register */
*38fd1498Szrj  {10, 10},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {12, 12},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
*38fd1498Szrj  14, 14,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  10, 10,				/* Gather load static, per_elt.  */
*38fd1498Szrj  10, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  512,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  btver1_memcpy,
*38fd1498Szrj  btver1_memset,
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs btver2_memcpy[2] = {	/* memcpy strategy table
*38fd1498Szrj     referenced by btver2_cost; the entries are the same as those of
*38fd1498Szrj     btver1_memcpy.  The leading numbers look like block-size thresholds
*38fd1498Szrj     with -1 as the open-ended last entry -- TODO confirm against
*38fd1498Szrj     stringop_algs.  */
*38fd1498Szrj  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
*38fd1498Szrj             {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs btver2_memset[2] = {	/* memset strategy table
*38fd1498Szrj     referenced by btver2_cost; the entries are the same as those of
*38fd1498Szrj     btver1_memset.  The leading numbers look like block-size thresholds
*38fd1498Szrj     with -1 as the open-ended last entry -- TODO confirm against
*38fd1498Szrj     stringop_algs.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrjconst struct processor_costs btver2_cost = {	/* BTVER2 cost table.
*38fd1498Szrj     The initializers are positional, so their order has to track the
*38fd1498Szrj     field order of struct processor_costs; the trailing comment on each
*38fd1498Szrj     line describes the entry.  String-operation strategies come from
*38fd1498Szrj     btver2_memcpy and btver2_memset.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (5)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (35),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (51),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (83),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (83)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  9,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  8,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {8, 8, 6},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {8, 8, 6},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {12, 12, 28},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {12, 12, 38},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  4,					/* cost of moving MMX register */
*38fd1498Szrj  {10, 10},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {12, 12},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
*38fd1498Szrj  14, 14,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  10, 10,				/* Gather load static, per_elt.  */
*38fd1498Szrj  10, 10,				/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  2048,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  100,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  btver2_memcpy,
*38fd1498Szrj  btver2_memset,
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs pentium4_memcpy[2] = {	/* memcpy strategy table
*38fd1498Szrj     referenced by pentium4_cost.  Only the first element carries real
*38fd1498Szrj     thresholds; the second is DUMMY_STRINGOP_ALGS, which expands to a
*38fd1498Szrj     single all-libcall entry.  The two elements are presumably the
*38fd1498Szrj     32-bit and 64-bit variants -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrjstatic stringop_algs pentium4_memset[2] = {	/* memset strategy table
*38fd1498Szrj     referenced by pentium4_cost.  Only the first element carries real
*38fd1498Szrj     thresholds; the second is DUMMY_STRINGOP_ALGS, which expands to a
*38fd1498Szrj     single all-libcall entry.  The two elements are presumably the
*38fd1498Szrj     32-bit and 64-bit variants -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{6, loop_1_byte, false}, {48, loop, false},
*38fd1498Szrj             {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  DUMMY_STRINGOP_ALGS};
*38fd1498Szrj
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs pentium4_cost = {	/* pentium4 cost table.
*38fd1498Szrj     The initializers are positional, so their order has to track the
*38fd1498Szrj     field order of struct processor_costs; the trailing comment on each
*38fd1498Szrj     line describes the entry.  String-operation strategies come from
*38fd1498Szrj     pentium4_memcpy and pentium4_memset.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (15),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (15),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (15),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (15)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (56),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (56),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (56),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (56)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  16,					/* "large" insn */
*38fd1498Szrj  6,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  5,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {4, 5, 4},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {2, 3, 2},				/* cost of storing integer registers */
*38fd1498Szrj  12,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {14, 14, 14},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {14, 14, 14},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  12,					/* cost of moving MMX register */
*38fd1498Szrj  {16, 16},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {16, 16},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {16, 16, 16, 32, 64},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
*38fd1498Szrj  {16, 16, 16, 32, 64},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
*38fd1498Szrj  20, 12,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  16, 16,				/* Gather load static, per_elt.  */
*38fd1498Szrj  16, 16,				/* Gather store static, per_elt.  */
*38fd1498Szrj  8,					/* size of l1 cache.  */
*38fd1498Szrj  256,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  2,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  pentium4_memcpy,
*38fd1498Szrj  pentium4_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs nocona_memcpy[2] = {	/* memcpy strategy table
*38fd1498Szrj     referenced by nocona_cost.  The first element matches the first
*38fd1498Szrj     element of pentium4_memcpy; the second has its own thresholds.  The
*38fd1498Szrj     leading numbers look like block sizes with -1 as the open-ended last
*38fd1498Szrj     entry -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
*38fd1498Szrj             {100000, unrolled_loop, false}, {-1, libcall, false}}}};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs nocona_memset[2] = {	/* memset strategy table
*38fd1498Szrj     referenced by nocona_cost.  The first element matches the first
*38fd1498Szrj     element of pentium4_memset; the second has its own thresholds.  The
*38fd1498Szrj     leading numbers look like block sizes with -1 as the open-ended last
*38fd1498Szrj     entry -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{6, loop_1_byte, false}, {48, loop, false},
*38fd1498Szrj             {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{24, loop, false}, {64, unrolled_loop, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
*38fd1498Szrj
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs nocona_cost = {	/* nocona cost table.
*38fd1498Szrj     The initializers are positional, so their order has to track the
*38fd1498Szrj     field order of struct processor_costs; the trailing comment on each
*38fd1498Szrj     line describes the entry.  String-operation strategies come from
*38fd1498Szrj     nocona_memcpy and nocona_memset.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (10),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (10),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (10),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (10)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (66),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (66),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (66),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (66)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  16,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  4,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {4, 4, 4},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {4, 4, 4},				/* cost of storing integer registers */
*38fd1498Szrj  12,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {14, 14, 14},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {14, 14, 14},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  14,					/* cost of moving MMX register */
*38fd1498Szrj  {12, 12},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {12, 12},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {12, 12, 12, 24, 48},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
*38fd1498Szrj  {12, 12, 12, 24, 48},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
*38fd1498Szrj  20, 12,				/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  12, 12,				/* Gather load static, per_elt.  */
*38fd1498Szrj  12, 12,				/* Gather store static, per_elt.  */
*38fd1498Szrj  8,					/* size of l1 cache.  */
*38fd1498Szrj  1024,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  8,					/* number of parallel prefetches */
*38fd1498Szrj  1,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  nocona_memcpy,
*38fd1498Szrj  nocona_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs atom_memcpy[2] = {	/* memcpy strategy table referenced
*38fd1498Szrj     by atom_cost.  The leading numbers look like block-size thresholds
*38fd1498Szrj     with -1 as the open-ended last entry, and the two elements are
*38fd1498Szrj     presumably the 32-bit and 64-bit variants -- TODO confirm against
*38fd1498Szrj     stringop_algs.  */
*38fd1498Szrj  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs atom_memset[2] = {	/* memset strategy table referenced
*38fd1498Szrj     by atom_cost.  The leading numbers look like block-size thresholds
*38fd1498Szrj     with -1 as the open-ended last entry, and the two elements are
*38fd1498Szrj     presumably the 32-bit and 64-bit variants -- TODO confirm against
*38fd1498Szrj     stringop_algs.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs atom_cost = {	/* atom cost table.
*38fd1498Szrj     The initializers are positional, so their order has to track the
*38fd1498Szrj     field order of struct processor_costs; the trailing comment on each
*38fd1498Szrj     line describes the entry.  String-operation strategies come from
*38fd1498Szrj     atom_memcpy and atom_memset.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction: the
*38fd1498Szrj					   add cost plus 1, so a lea ranks
*38fd1498Szrj					   just above an add */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (2)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (26),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (42),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (74),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (74)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  6,					/* cost for loading QImode using movzbl */
*38fd1498Szrj  {6, 6, 6},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {6, 6, 6},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {6, 6, 18},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {14, 14, 24},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {8, 8},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {10, 10},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
*38fd1498Szrj  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
*38fd1498Szrj  8, 6,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  8, 8,					/* Gather load static, per_elt.  */
*38fd1498Szrj  8, 8,					/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  256,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  3,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  atom_memcpy,
*38fd1498Szrj  atom_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs slm_memcpy[2] = {	/* memcpy strategy table referenced
*38fd1498Szrj     by slm_cost; the entries are the same as those of atom_memcpy.  The
*38fd1498Szrj     leading numbers look like block-size thresholds with -1 as the
*38fd1498Szrj     open-ended last entry -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs slm_memset[2] = {	/* memset strategy table referenced
*38fd1498Szrj     by slm_cost; the entries are the same as those of atom_memset.  The
*38fd1498Szrj     leading numbers look like block-size thresholds with -1 as the
*38fd1498Szrj     open-ended last entry -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs slm_cost = {	/* slm cost table.
*38fd1498Szrj     The initializers are positional, so their order has to track the
*38fd1498Szrj     field order of struct processor_costs; the trailing comment on each
*38fd1498Szrj     line describes the entry.  String-operation strategies come from
*38fd1498Szrj     slm_memcpy and slm_memset.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction: the
*38fd1498Szrj					   add cost plus 1, so a lea ranks
*38fd1498Szrj					   just above an add */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (2)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (26),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (42),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (74),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (74)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  8,					/* cost for loading QImode using movzbl */
*38fd1498Szrj  {8, 8, 8},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {6, 6, 6},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {8, 8, 18},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 18},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {8, 8},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {6, 6},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {8, 8, 8, 16, 32},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
*38fd1498Szrj  {8, 8, 8, 16, 32},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
*38fd1498Szrj  8, 6,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  8, 8,					/* Gather load static, per_elt.  */
*38fd1498Szrj  8, 8,					/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  256,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  3,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  slm_memcpy,
*38fd1498Szrj  slm_memset,
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrjstatic stringop_algs intel_memcpy[2] = {	/* memcpy strategy table for
*38fd1498Szrj     the intel tuning; the entries are the same as those of atom_memcpy.
*38fd1498Szrj     The leading numbers look like block-size thresholds with -1 as the
*38fd1498Szrj     open-ended last entry -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
*38fd1498Szrj  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
*38fd1498Szrjstatic stringop_algs intel_memset[2] = {	/* memset strategy table for
*38fd1498Szrj     the intel tuning; the entries are the same as those of atom_memset.
*38fd1498Szrj     The leading numbers look like block-size thresholds with -1 as the
*38fd1498Szrj     open-ended last entry -- TODO confirm against stringop_algs.  */
*38fd1498Szrj  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
*38fd1498Szrj             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
*38fd1498Szrj             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named intel_cost.  This is a positional initializer, so the
   order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the field being set.  */
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs intel_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
  /* One unit above an add: dearer than add, but less than two insns.  */
*38fd1498Szrj  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (2)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (26),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (42),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (74),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (74)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  6,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {4, 4, 4},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {6, 6, 6},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {6, 6, 8},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 10},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {6, 6},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {6, 6},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
*38fd1498Szrj  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
*38fd1498Szrj  4, 4,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  6, 6,					/* Gather load static, per_elt.  */
*38fd1498Szrj  6, 6,					/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  256,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  3,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  intel_memcpy,				/* memcpy strategy table.  */
*38fd1498Szrj  intel_memset,				/* memset strategy table.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* Generic should produce code tuned for Core-i7 (and newer chips)
*38fd1498Szrj   and btver1 (and newer chips).  */
*38fd1498Szrj
/* memcpy expansion strategy table referenced by generic_cost below.
   Both elements use an inline loop for the smallest bound (32), a rep
   prefix up to 8192 and a libcall for the final -1 entry; they differ
   only in the rep prefix width (4-byte vs. 8-byte).
   NOTE(review): presumably [0] is for 32-bit and [1] for 64-bit code and
   the bounds are byte counts -- confirm against stringop_algs.  */
*38fd1498Szrjstatic stringop_algs generic_memcpy[2] = {
*38fd1498Szrj  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
*38fd1498Szrj             {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
/* memset expansion strategy table referenced by generic_cost below.
   Both elements use an inline loop for the smallest bound (32), a rep
   prefix up to 8192 and a libcall for the final -1 entry; they differ
   only in the rep prefix width (4-byte vs. 8-byte).
   NOTE(review): presumably [0] is for 32-bit and [1] for 64-bit code and
   the bounds are byte counts -- confirm against stringop_algs.  */
*38fd1498Szrjstatic stringop_algs generic_memset[2] = {
*38fd1498Szrj  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
*38fd1498Szrj             {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
*38fd1498Szrj             {-1, libcall, false}}}};
/* Cost table named generic_cost.  This is a positional initializer, so
   the order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the field being set.  */
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs generic_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  /* Setting cost to 2 makes our current implementation of synth_mult result in
*38fd1498Szrj     use of unnecessary temporary registers causing regression on several
*38fd1498Szrj     SPECfp benchmarks.  */
*38fd1498Szrj  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (4)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (22),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (30),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (74),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (74)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  6,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {6, 6, 6},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {6, 6, 6},				/* cost of storing integer registers */
*38fd1498Szrj  4,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {6, 6, 12},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 12},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {6, 6},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {6, 6},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
*38fd1498Szrj  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
*38fd1498Szrj  6, 6,					/* SSE->integer and integer->SSE moves */
*38fd1498Szrj  18, 6,				/* Gather load static, per_elt.  */
*38fd1498Szrj  18, 6,				/* Gather store static, per_elt.  */
*38fd1498Szrj  32,					/* size of l1 cache.  */
*38fd1498Szrj  512,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  /* Benchmarks show large regressions on the K8 sixtrack benchmark when
*38fd1498Szrj     this value is increased to the perhaps more appropriate value of 5.  */
*38fd1498Szrj  3,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  generic_memcpy,			/* memcpy strategy table.  */
*38fd1498Szrj  generic_memset,			/* memset strategy table.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj
*38fd1498Szrj/* core_cost should produce code tuned for the Core family of CPUs.  */
/* memcpy expansion strategy table referenced by core_cost below.
   Unlike the other tables in this file, the non-libcall entries here set
   the third field to true.
   NOTE(review): presumably [0] is for 32-bit and [1] for 64-bit code, the
   first field of each inner entry is a byte-count bound (-1 = unbounded)
   and the third field is a "no alignment" flag -- confirm against the
   declaration of stringop_algs.  */
*38fd1498Szrjstatic stringop_algs core_memcpy[2] = {
*38fd1498Szrj  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
*38fd1498Szrj             {-1, libcall, false}}}};
/* memset expansion strategy table referenced by core_cost below.
   The first element adds a loop_1_byte step for the smallest bound (6)
   before the loop and rep-prefix entries; the second goes straight from
   loop to an 8-byte rep prefix.
   NOTE(review): presumably [0] is for 32-bit and [1] for 64-bit code, the
   first field of each inner entry is a byte-count bound (-1 = unbounded)
   and the third field is a "no alignment" flag -- confirm against the
   declaration of stringop_algs.  */
*38fd1498Szrjstatic stringop_algs core_memset[2] = {
*38fd1498Szrj  {libcall, {{6, loop_1_byte, true},
*38fd1498Szrj             {24, loop, true},
*38fd1498Szrj             {8192, rep_prefix_4_byte, true},
*38fd1498Szrj             {-1, libcall, false}}},
*38fd1498Szrj  {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
*38fd1498Szrj             {-1, libcall, false}}}};
*38fd1498Szrj
/* Cost table named core_cost.  This is a positional initializer, so the
   order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the field being set.  */
*38fd1498Szrjstatic const
*38fd1498Szrjstruct processor_costs core_cost = {
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of an add instruction */
*38fd1498Szrj  /* On all chips taken into consideration lea is 2 cycles and more.  With
*38fd1498Szrj     this cost however our current implementation of synth_mult results in
*38fd1498Szrj     use of unnecessary temporary registers causing regression on several
*38fd1498Szrj     SPECfp benchmarks.  */
*38fd1498Szrj  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* variable shift costs */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* constant shift costs */
*38fd1498Szrj  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
*38fd1498Szrj   COSTS_N_INSNS (4),			/*				 HI */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 SI */
*38fd1498Szrj   /* Here we tune for Sandybridge or newer.  */
*38fd1498Szrj   COSTS_N_INSNS (3),			/*				 DI */
*38fd1498Szrj   COSTS_N_INSNS (3)},			/*			      other */
*38fd1498Szrj  0,					/* cost of multiply per each bit set */
*38fd1498Szrj  /* Expanding div/mod currently doesn't consider parallelism. So the cost
*38fd1498Szrj     model is not realistic. We compensate by increasing the latencies a bit.  */
*38fd1498Szrj  {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
*38fd1498Szrj   COSTS_N_INSNS (11),			/*			    HI */
*38fd1498Szrj   COSTS_N_INSNS (14),			/*			    SI */
*38fd1498Szrj   COSTS_N_INSNS (81),			/*			    DI */
*38fd1498Szrj   COSTS_N_INSNS (81)},			/*			    other */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movsx */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of movzx */
*38fd1498Szrj  8,					/* "large" insn */
*38fd1498Szrj  17,					/* MOVE_RATIO */
*38fd1498Szrj
*38fd1498Szrj  /* All move costs are relative to integer->integer move times 2 and thus
*38fd1498Szrj     they are latency*2. */
*38fd1498Szrj  6,				     /* cost for loading QImode using movzbl */
*38fd1498Szrj  {4, 4, 4},				/* cost of loading integer registers
*38fd1498Szrj					   in QImode, HImode and SImode.
*38fd1498Szrj					   Relative to reg-reg move (2).  */
*38fd1498Szrj  {6, 6, 6},				/* cost of storing integer registers */
*38fd1498Szrj  2,					/* cost of reg,reg fld/fst */
*38fd1498Szrj  {6, 6, 8},				/* cost of loading fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  {6, 6, 10},				/* cost of storing fp registers
*38fd1498Szrj					   in SFmode, DFmode and XFmode */
*38fd1498Szrj  2,					/* cost of moving MMX register */
*38fd1498Szrj  {6, 6},				/* cost of loading MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  {6, 6},				/* cost of storing MMX registers
*38fd1498Szrj					   in SImode and DImode */
*38fd1498Szrj  2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
*38fd1498Szrj  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
*38fd1498Szrj  {6, 6, 6, 6, 12},			/* cost of storing SSE registers
*38fd1498Szrj					   in 32,64,128,256 and 512-bit */
*38fd1498Szrj  {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
*38fd1498Szrj  2, 2,					/* SSE->integer and integer->SSE moves */
  /* NOTE(review): the comment below names VGATHERDPD twice with different
     figures; the second presumably refers to another gather form or
     vector width -- confirm against the instruction tables.  */
*38fd1498Szrj  /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPD is 9 uops,
*38fd1498Szrj     rec. throughput 6.
*38fd1498Szrj     So 5 uops statically and one uop per load.  */
*38fd1498Szrj  10, 6,				/* Gather load static, per_elt.  */
*38fd1498Szrj  10, 6,				/* Gather store static, per_elt.  */
*38fd1498Szrj  64,					/* size of l1 cache.  */
*38fd1498Szrj  512,					/* size of l2 cache.  */
*38fd1498Szrj  64,					/* size of prefetch block */
*38fd1498Szrj  6,					/* number of parallel prefetches */
*38fd1498Szrj  /* FIXME perhaps more appropriate value is 5.  */
*38fd1498Szrj  3,					/* Branch cost */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
*38fd1498Szrj  /* 10-24 */
*38fd1498Szrj  COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (23),			/* cost of FSQRT instruction.  */
*38fd1498Szrj
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
*38fd1498Szrj  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
*38fd1498Szrj  COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
*38fd1498Szrj  1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
*38fd1498Szrj  core_memcpy,				/* memcpy strategy table.  */
*38fd1498Szrj  core_memset,				/* memset strategy table.  */
*38fd1498Szrj  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
*38fd1498Szrj  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
*38fd1498Szrj};
*38fd1498Szrj