xref: /dflybsd-src/contrib/gcc-8.0/gcc/config/i386/x86-tune-costs.h (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj /* Costs of operations of individual x86 CPUs.
2*38fd1498Szrj    Copyright (C) 1988-2018 Free Software Foundation, Inc.
3*38fd1498Szrj 
4*38fd1498Szrj This file is part of GCC.
5*38fd1498Szrj 
6*38fd1498Szrj GCC is free software; you can redistribute it and/or modify
7*38fd1498Szrj it under the terms of the GNU General Public License as published by
8*38fd1498Szrj the Free Software Foundation; either version 3, or (at your option)
9*38fd1498Szrj any later version.
10*38fd1498Szrj 
11*38fd1498Szrj GCC is distributed in the hope that it will be useful,
12*38fd1498Szrj but WITHOUT ANY WARRANTY; without even the implied warranty of
13*38fd1498Szrj MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14*38fd1498Szrj GNU General Public License for more details.
15*38fd1498Szrj 
16*38fd1498Szrj Under Section 7 of GPL version 3, you are granted additional
17*38fd1498Szrj permissions described in the GCC Runtime Library Exception, version
18*38fd1498Szrj 3.1, as published by the Free Software Foundation.
19*38fd1498Szrj 
20*38fd1498Szrj You should have received a copy of the GNU General Public License and
21*38fd1498Szrj a copy of the GCC Runtime Library Exception along with this program;
22*38fd1498Szrj see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23*38fd1498Szrj <http://www.gnu.org/licenses/>.  */
24*38fd1498Szrj /* Processor costs (relative to an add) */
25*38fd1498Szrj /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
26*38fd1498Szrj #define COSTS_N_BYTES(N) ((N) * 2)	/* so a 2-byte add, COSTS_N_BYTES (2), equals COSTS_N_INSNS (1) under that assumption */
27*38fd1498Szrj 
28*38fd1498Szrj #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}	/* placeholder table entry that names only libcall */
29*38fd1498Szrj 
30*38fd1498Szrj static stringop_algs ix86_size_memcpy[2] = {	/* memcpy table referenced by ix86_size_cost */
31*38fd1498Szrj   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* both entries name only rep_prefix_1_byte */
32*38fd1498Szrj   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
33*38fd1498Szrj static stringop_algs ix86_size_memset[2] = {	/* memset table referenced by ix86_size_cost */
34*38fd1498Szrj   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* both entries name only rep_prefix_1_byte */
35*38fd1498Szrj   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36*38fd1498Szrj 
37*38fd1498Szrj const
38*38fd1498Szrj struct processor_costs ix86_size_cost = {/* costs for tuning for size; insn costs are given via COSTS_N_BYTES */
39*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of an add instruction */
40*38fd1498Szrj   COSTS_N_BYTES (3),			/* cost of a lea instruction */
41*38fd1498Szrj   COSTS_N_BYTES (2),			/* variable shift costs */
42*38fd1498Szrj   COSTS_N_BYTES (3),			/* constant shift costs */
43*38fd1498Szrj   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
44*38fd1498Szrj    COSTS_N_BYTES (3),			/*				 HI */
45*38fd1498Szrj    COSTS_N_BYTES (3),			/*				 SI */
46*38fd1498Szrj    COSTS_N_BYTES (3),			/*				 DI */
47*38fd1498Szrj    COSTS_N_BYTES (5)},			/*			      other */
48*38fd1498Szrj   0,					/* cost of multiply per each bit set */
49*38fd1498Szrj   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
50*38fd1498Szrj    COSTS_N_BYTES (3),			/*			    HI */
51*38fd1498Szrj    COSTS_N_BYTES (3),			/*			    SI */
52*38fd1498Szrj    COSTS_N_BYTES (3),			/*			    DI */
53*38fd1498Szrj    COSTS_N_BYTES (5)},			/*			    other */
54*38fd1498Szrj   COSTS_N_BYTES (3),			/* cost of movsx */
55*38fd1498Szrj   COSTS_N_BYTES (3),			/* cost of movzx */
56*38fd1498Szrj   0,					/* "large" insn */
57*38fd1498Szrj   2,					/* MOVE_RATIO */
58*38fd1498Szrj 
59*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2. */
60*38fd1498Szrj   2,				     /* cost for loading QImode using movzbl */
61*38fd1498Szrj   {2, 2, 2},				/* cost of loading integer registers
62*38fd1498Szrj 					   in QImode, HImode and SImode.
63*38fd1498Szrj 					   Relative to reg-reg move (2).  */
64*38fd1498Szrj   {2, 2, 2},				/* cost of storing integer registers */
65*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
66*38fd1498Szrj   {2, 2, 2},				/* cost of loading fp registers
67*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
68*38fd1498Szrj   {2, 2, 2},				/* cost of storing fp registers
69*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
70*38fd1498Szrj   3,					/* cost of moving MMX register */
71*38fd1498Szrj   {3, 3},				/* cost of loading MMX registers
72*38fd1498Szrj 					   in SImode and DImode */
73*38fd1498Szrj   {3, 3},				/* cost of storing MMX registers
74*38fd1498Szrj 					   in SImode and DImode */
75*38fd1498Szrj   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
76*38fd1498Szrj   {3, 3, 3, 3, 3},			/* cost of loading SSE registers
77*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
78*38fd1498Szrj   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load; five entries,
79*38fd1498Szrj 					   presumably 32,64,128,256 and 512-bit as above -- TODO confirm */
80*38fd1498Szrj   {3, 3, 3, 3, 3},			/* cost of storing SSE registers
81*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
82*38fd1498Szrj   {3, 3, 3, 3, 3},				/* cost of unaligned SSE store; five entries,
83*38fd1498Szrj 					   presumably 32,64,128,256 and 512-bit as above -- TODO confirm */
84*38fd1498Szrj   3, 3,					/* SSE->integer and integer->SSE moves */
85*38fd1498Szrj   5, 0,					/* Gather load static, per_elt.  */
86*38fd1498Szrj   5, 0,					/* Gather store static, per_elt.  */
87*38fd1498Szrj   0,					/* size of l1 cache  */
88*38fd1498Szrj   0,					/* size of l2 cache  */
89*38fd1498Szrj   0,					/* size of prefetch block */
90*38fd1498Szrj   0,					/* number of parallel prefetches */
91*38fd1498Szrj   2,					/* Branch cost */
92*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
93*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
94*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
95*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
96*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
97*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
98*38fd1498Szrj 
99*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
100*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
101*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
102*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
103*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
104*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
105*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
106*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
107*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
108*38fd1498Szrj   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
109*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
110*38fd1498Szrj   ix86_size_memcpy,			/* memcpy stringop table */
111*38fd1498Szrj   ix86_size_memset,			/* memset stringop table */
112*38fd1498Szrj   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
113*38fd1498Szrj   COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
114*38fd1498Szrj };
115*38fd1498Szrj 
116*38fd1498Szrj /* Processor costs (relative to an add) */
117*38fd1498Szrj static stringop_algs i386_memcpy[2] = {	/* memcpy table referenced by i386_cost */
118*38fd1498Szrj   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* entry 0: rep_prefix_1_byte only */
119*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
120*38fd1498Szrj static stringop_algs i386_memset[2] = {	/* memset table referenced by i386_cost */
121*38fd1498Szrj   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* entry 0: rep_prefix_1_byte only */
122*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
123*38fd1498Szrj 
124*38fd1498Szrj static const
125*38fd1498Szrj struct processor_costs i386_cost = {	/* 386 specific costs */
126*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
127*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
128*38fd1498Szrj   COSTS_N_INSNS (3),			/* variable shift costs */
129*38fd1498Szrj   COSTS_N_INSNS (2),			/* constant shift costs */
130*38fd1498Szrj   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
131*38fd1498Szrj    COSTS_N_INSNS (6),			/*				 HI */
132*38fd1498Szrj    COSTS_N_INSNS (6),			/*				 SI */
133*38fd1498Szrj    COSTS_N_INSNS (6),			/*				 DI */
134*38fd1498Szrj    COSTS_N_INSNS (6)},			/*			      other */
135*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
136*38fd1498Szrj   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
137*38fd1498Szrj    COSTS_N_INSNS (23),			/*			    HI */
138*38fd1498Szrj    COSTS_N_INSNS (23),			/*			    SI */
139*38fd1498Szrj    COSTS_N_INSNS (23),			/*			    DI */
140*38fd1498Szrj    COSTS_N_INSNS (23)},			/*			    other */
141*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of movsx */
142*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of movzx */
143*38fd1498Szrj   15,					/* "large" insn */
144*38fd1498Szrj   3,					/* MOVE_RATIO */
145*38fd1498Szrj 
146*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
147*38fd1498Szrj      they are latency*2. */
148*38fd1498Szrj   4,				     /* cost for loading QImode using movzbl */
149*38fd1498Szrj   {2, 4, 2},				/* cost of loading integer registers
150*38fd1498Szrj 					   in QImode, HImode and SImode.
151*38fd1498Szrj 					   Relative to reg-reg move (2).  */
152*38fd1498Szrj   {2, 4, 2},				/* cost of storing integer registers */
153*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
154*38fd1498Szrj   {8, 8, 8},				/* cost of loading fp registers
155*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
156*38fd1498Szrj   {8, 8, 8},				/* cost of storing fp registers
157*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
158*38fd1498Szrj   2,					/* cost of moving MMX register */
159*38fd1498Szrj   {4, 8},				/* cost of loading MMX registers
160*38fd1498Szrj 					   in SImode and DImode */
161*38fd1498Szrj   {4, 8},				/* cost of storing MMX registers
162*38fd1498Szrj 					   in SImode and DImode */
163*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
164*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
165*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
166*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned loads (same values as the aligned loads).  */
167*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
168*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
169*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned stores (same values as the aligned stores).  */
170*38fd1498Szrj   3, 3,					/* SSE->integer and integer->SSE moves */
171*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
172*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
173*38fd1498Szrj   0,					/* size of l1 cache  */
174*38fd1498Szrj   0,					/* size of l2 cache  */
175*38fd1498Szrj   0,					/* size of prefetch block */
176*38fd1498Szrj   0,					/* number of parallel prefetches */
177*38fd1498Szrj   1,					/* Branch cost */
178*38fd1498Szrj   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
179*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
180*38fd1498Szrj   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
181*38fd1498Szrj   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
182*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
183*38fd1498Szrj   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
184*38fd1498Szrj   /* The SSE add/mul/div/sqrt costs below repeat the FADD/FMUL/FDIV/FSQRT values above.  */
185*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
186*38fd1498Szrj   COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
187*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
188*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
189*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of FMA SS instruction.  */
190*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of FMA SD instruction.  */
191*38fd1498Szrj   COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
192*38fd1498Szrj   COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
193*38fd1498Szrj   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
194*38fd1498Szrj   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
195*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
196*38fd1498Szrj   i386_memcpy,				/* memcpy stringop table */
197*38fd1498Szrj   i386_memset,				/* memset stringop table */
198*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
199*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
200*38fd1498Szrj };
201*38fd1498Szrj 
202*38fd1498Szrj static stringop_algs i486_memcpy[2] = {	/* memcpy table referenced by i486_cost */
203*38fd1498Szrj   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},	/* entry 0: rep_prefix_4_byte only */
204*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
205*38fd1498Szrj static stringop_algs i486_memset[2] = {	/* memset table referenced by i486_cost */
206*38fd1498Szrj   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},	/* entry 0: rep_prefix_4_byte only */
207*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
208*38fd1498Szrj 
209*38fd1498Szrj static const
210*38fd1498Szrj struct processor_costs i486_cost = {	/* 486 specific costs */
211*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
212*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
213*38fd1498Szrj   COSTS_N_INSNS (3),			/* variable shift costs */
214*38fd1498Szrj   COSTS_N_INSNS (2),			/* constant shift costs */
215*38fd1498Szrj   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
216*38fd1498Szrj    COSTS_N_INSNS (12),			/*				 HI */
217*38fd1498Szrj    COSTS_N_INSNS (12),			/*				 SI */
218*38fd1498Szrj    COSTS_N_INSNS (12),			/*				 DI */
219*38fd1498Szrj    COSTS_N_INSNS (12)},			/*			      other */
220*38fd1498Szrj   1,					/* cost of multiply per each bit set (a plain 1, not wrapped in COSTS_N_INSNS) */
221*38fd1498Szrj   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
222*38fd1498Szrj    COSTS_N_INSNS (40),			/*			    HI */
223*38fd1498Szrj    COSTS_N_INSNS (40),			/*			    SI */
224*38fd1498Szrj    COSTS_N_INSNS (40),			/*			    DI */
225*38fd1498Szrj    COSTS_N_INSNS (40)},			/*			    other */
226*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of movsx */
227*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of movzx */
228*38fd1498Szrj   15,					/* "large" insn */
229*38fd1498Szrj   3,					/* MOVE_RATIO */
230*38fd1498Szrj 
231*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
232*38fd1498Szrj      they are latency*2. */
233*38fd1498Szrj   4,				     /* cost for loading QImode using movzbl */
234*38fd1498Szrj   {2, 4, 2},				/* cost of loading integer registers
235*38fd1498Szrj 					   in QImode, HImode and SImode.
236*38fd1498Szrj 					   Relative to reg-reg move (2).  */
237*38fd1498Szrj   {2, 4, 2},				/* cost of storing integer registers */
238*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
239*38fd1498Szrj   {8, 8, 8},				/* cost of loading fp registers
240*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
241*38fd1498Szrj   {8, 8, 8},				/* cost of storing fp registers
242*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
243*38fd1498Szrj   2,					/* cost of moving MMX register */
244*38fd1498Szrj   {4, 8},				/* cost of loading MMX registers
245*38fd1498Szrj 					   in SImode and DImode */
246*38fd1498Szrj   {4, 8},				/* cost of storing MMX registers
247*38fd1498Szrj 					   in SImode and DImode */
248*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
249*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
250*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
251*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned loads (same values as the aligned loads).  */
252*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
253*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
254*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned stores (same values as the aligned stores).  */
255*38fd1498Szrj   3, 3,					/* SSE->integer and integer->SSE moves */
256*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
257*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
258*38fd1498Szrj   4,					/* size of l1 cache.  486 has 8kB cache
259*38fd1498Szrj 					   shared for code and data, so 4kB is
260*38fd1498Szrj 					   not really precise.  */
261*38fd1498Szrj   4,					/* size of l2 cache (set to the same value as l1 above) */
262*38fd1498Szrj   0,					/* size of prefetch block */
263*38fd1498Szrj   0,					/* number of parallel prefetches */
264*38fd1498Szrj   1,					/* Branch cost */
265*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
266*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
267*38fd1498Szrj   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
268*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
269*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
270*38fd1498Szrj   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
271*38fd1498Szrj   /* The SSE add/mul/sqrt costs below repeat the FADD/FMUL/FSQRT values above.  */
272*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
273*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
274*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
275*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
276*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of FMA SS instruction.  */
277*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of FMA SD instruction.  */
278*38fd1498Szrj   COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
279*38fd1498Szrj   COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  NOTE(review): 74, while DIVSS and FDIV are 73 -- confirm.  */
280*38fd1498Szrj   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
281*38fd1498Szrj   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
282*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
283*38fd1498Szrj   i486_memcpy,				/* memcpy stringop table */
284*38fd1498Szrj   i486_memset,				/* memset stringop table */
285*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
286*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
287*38fd1498Szrj };
288*38fd1498Szrj 
289*38fd1498Szrj static stringop_algs pentium_memcpy[2] = {	/* memcpy table referenced by pentium_cost and lakemont_cost */
290*38fd1498Szrj   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* entry 0: rep_prefix_4_byte for the 256 bound, then libcall (presumably a size limit -- confirm) */
291*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
292*38fd1498Szrj static stringop_algs pentium_memset[2] = {	/* memset table referenced by pentium_cost and lakemont_cost */
293*38fd1498Szrj   {libcall, {{-1, rep_prefix_4_byte, false}}},	/* entry 0: single rep_prefix_4_byte row */
294*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
295*38fd1498Szrj 
296*38fd1498Szrj static const
297*38fd1498Szrj struct processor_costs pentium_cost = {	/* Pentium specific costs */
298*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
299*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
300*38fd1498Szrj   COSTS_N_INSNS (4),			/* variable shift costs */
301*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
302*38fd1498Szrj   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
303*38fd1498Szrj    COSTS_N_INSNS (11),			/*				 HI */
304*38fd1498Szrj    COSTS_N_INSNS (11),			/*				 SI */
305*38fd1498Szrj    COSTS_N_INSNS (11),			/*				 DI */
306*38fd1498Szrj    COSTS_N_INSNS (11)},			/*			      other */
307*38fd1498Szrj   0,					/* cost of multiply per each bit set */
308*38fd1498Szrj   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
309*38fd1498Szrj    COSTS_N_INSNS (25),			/*			    HI */
310*38fd1498Szrj    COSTS_N_INSNS (25),			/*			    SI */
311*38fd1498Szrj    COSTS_N_INSNS (25),			/*			    DI */
312*38fd1498Szrj    COSTS_N_INSNS (25)},			/*			    other */
313*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of movsx */
314*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of movzx */
315*38fd1498Szrj   8,					/* "large" insn */
316*38fd1498Szrj   6,					/* MOVE_RATIO */
317*38fd1498Szrj 
318*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
319*38fd1498Szrj      they are latency*2. */
320*38fd1498Szrj   6,				     /* cost for loading QImode using movzbl */
321*38fd1498Szrj   {2, 4, 2},				/* cost of loading integer registers
322*38fd1498Szrj 					   in QImode, HImode and SImode.
323*38fd1498Szrj 					   Relative to reg-reg move (2).  */
324*38fd1498Szrj   {2, 4, 2},				/* cost of storing integer registers */
325*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
326*38fd1498Szrj   {2, 2, 6},				/* cost of loading fp registers
327*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
328*38fd1498Szrj   {4, 4, 6},				/* cost of storing fp registers
329*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
330*38fd1498Szrj   8,					/* cost of moving MMX register */
331*38fd1498Szrj   {8, 8},				/* cost of loading MMX registers
332*38fd1498Szrj 					   in SImode and DImode */
333*38fd1498Szrj   {8, 8},				/* cost of storing MMX registers
334*38fd1498Szrj 					   in SImode and DImode */
335*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
336*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
337*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
338*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned loads (same values as the aligned loads).  */
339*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
340*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
341*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned stores (same values as the aligned stores).  */
342*38fd1498Szrj   3, 3,					/* SSE->integer and integer->SSE moves */
343*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
344*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
345*38fd1498Szrj   8,					/* size of l1 cache.  */
346*38fd1498Szrj   8,					/* size of l2 cache (set to the same value as l1 above) */
347*38fd1498Szrj   0,					/* size of prefetch block */
348*38fd1498Szrj   0,					/* number of parallel prefetches */
349*38fd1498Szrj   2,					/* Branch cost */
350*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
351*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
352*38fd1498Szrj   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
353*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
354*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
355*38fd1498Szrj   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
356*38fd1498Szrj   /* The SSE add/mul/div/sqrt costs below repeat the FADD/FMUL/FDIV/FSQRT values above.  */
357*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
358*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
359*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
360*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
361*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
362*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
363*38fd1498Szrj   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
364*38fd1498Szrj   COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
365*38fd1498Szrj   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
366*38fd1498Szrj   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
367*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
368*38fd1498Szrj   pentium_memcpy,			/* memcpy stringop table */
369*38fd1498Szrj   pentium_memset,			/* memset stringop table */
370*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
371*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
372*38fd1498Szrj };
373*38fd1498Szrj 
374*38fd1498Szrj static const
375*38fd1498Szrj struct processor_costs lakemont_cost = {	/* Lakemont specific costs; reuses the pentium stringop tables */
376*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
377*38fd1498Szrj   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction: one unit above an add */
378*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
379*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
380*38fd1498Szrj   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
381*38fd1498Szrj    COSTS_N_INSNS (11),			/*				 HI */
382*38fd1498Szrj    COSTS_N_INSNS (11),			/*				 SI */
383*38fd1498Szrj    COSTS_N_INSNS (11),			/*				 DI */
384*38fd1498Szrj    COSTS_N_INSNS (11)},			/*			      other */
385*38fd1498Szrj   0,					/* cost of multiply per each bit set */
386*38fd1498Szrj   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
387*38fd1498Szrj    COSTS_N_INSNS (25),			/*			    HI */
388*38fd1498Szrj    COSTS_N_INSNS (25),			/*			    SI */
389*38fd1498Szrj    COSTS_N_INSNS (25),			/*			    DI */
390*38fd1498Szrj    COSTS_N_INSNS (25)},			/*			    other */
391*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of movsx */
392*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of movzx */
393*38fd1498Szrj   8,					/* "large" insn */
394*38fd1498Szrj   17,					/* MOVE_RATIO */
395*38fd1498Szrj 
396*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
397*38fd1498Szrj      they are latency*2. */
398*38fd1498Szrj   6,				     /* cost for loading QImode using movzbl */
399*38fd1498Szrj   {2, 4, 2},				/* cost of loading integer registers
400*38fd1498Szrj 					   in QImode, HImode and SImode.
401*38fd1498Szrj 					   Relative to reg-reg move (2).  */
402*38fd1498Szrj   {2, 4, 2},				/* cost of storing integer registers */
403*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
404*38fd1498Szrj   {2, 2, 6},				/* cost of loading fp registers
405*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
406*38fd1498Szrj   {4, 4, 6},				/* cost of storing fp registers
407*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
408*38fd1498Szrj   8,					/* cost of moving MMX register */
409*38fd1498Szrj   {8, 8},				/* cost of loading MMX registers
410*38fd1498Szrj 					   in SImode and DImode */
411*38fd1498Szrj   {8, 8},				/* cost of storing MMX registers
412*38fd1498Szrj 					   in SImode and DImode */
413*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
414*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
415*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
416*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned loads (same values as the aligned loads).  */
417*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
418*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
419*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned stores (same values as the aligned stores).  */
420*38fd1498Szrj   3, 3,					/* SSE->integer and integer->SSE moves */
421*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
422*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
423*38fd1498Szrj   8,					/* size of l1 cache.  */
424*38fd1498Szrj   8,					/* size of l2 cache (set to the same value as l1 above) */
425*38fd1498Szrj   0,					/* size of prefetch block */
426*38fd1498Szrj   0,					/* number of parallel prefetches */
427*38fd1498Szrj   2,					/* Branch cost */
428*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
429*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
430*38fd1498Szrj   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
431*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
432*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
433*38fd1498Szrj   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
434*38fd1498Szrj   /* Unlike the x87 values above, the SSE costs below distinguish SS from SD for DIV and SQRT.  */
435*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
436*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
437*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
438*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
439*38fd1498Szrj   COSTS_N_INSNS (10),			/* cost of FMA SS instruction.  */
440*38fd1498Szrj   COSTS_N_INSNS (10),			/* cost of FMA SD instruction.  */
441*38fd1498Szrj   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
442*38fd1498Szrj   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
443*38fd1498Szrj   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
444*38fd1498Szrj   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
445*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
446*38fd1498Szrj   pentium_memcpy,			/* memcpy stringop table, shared with pentium_cost */
447*38fd1498Szrj   pentium_memset,			/* memset stringop table, shared with pentium_cost */
448*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
449*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
450*38fd1498Szrj };
451*38fd1498Szrj 
452*38fd1498Szrj /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
453*38fd1498Szrj    (we ensure the alignment).  For small blocks an inline loop is still a
454*38fd1498Szrj    noticeable win; for bigger blocks either rep movsl or rep movsb is
455*38fd1498Szrj    the way to go.  Rep movsb apparently has a more expensive startup time in
456*38fd1498Szrj    the CPU, but after 4K the difference is down in the noise.  */
457*38fd1498Szrj static stringop_algs pentiumpro_memcpy[2] = {	/* memcpy table referenced by pentiumpro_cost */
458*38fd1498Szrj   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},	/* entry 0: loop, then unrolled_loop, ... */
459*38fd1498Szrj                        {8192, rep_prefix_4_byte, false},	/* ... then rep_prefix_4_byte, ... */
460*38fd1498Szrj                        {-1, rep_prefix_1_byte, false}}},	/* ... and rep_prefix_1_byte for the final -1 row */
461*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
462*38fd1498Szrj static stringop_algs pentiumpro_memset[2] = {	/* memset table referenced by pentiumpro_cost */
463*38fd1498Szrj   {rep_prefix_4_byte, {{1024, unrolled_loop, false},	/* entry 0: unrolled_loop, ... */
464*38fd1498Szrj                        {8192, rep_prefix_4_byte, false},	/* ... then rep_prefix_4_byte, ... */
465*38fd1498Szrj                        {-1, libcall, false}}},	/* ... and libcall for the final -1 row */
466*38fd1498Szrj   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder */
467*38fd1498Szrj static const
468*38fd1498Szrj struct processor_costs pentiumpro_cost = {	/* PentiumPro specific costs */
469*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
470*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
471*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
472*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
473*38fd1498Szrj   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
474*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
475*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 SI */
476*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
477*38fd1498Szrj    COSTS_N_INSNS (4)},			/*			      other */
478*38fd1498Szrj   0,					/* cost of multiply per each bit set */
479*38fd1498Szrj   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
480*38fd1498Szrj    COSTS_N_INSNS (17),			/*			    HI */
481*38fd1498Szrj    COSTS_N_INSNS (17),			/*			    SI */
482*38fd1498Szrj    COSTS_N_INSNS (17),			/*			    DI */
483*38fd1498Szrj    COSTS_N_INSNS (17)},			/*			    other */
484*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
485*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
486*38fd1498Szrj   8,					/* "large" insn */
487*38fd1498Szrj   6,					/* MOVE_RATIO */
488*38fd1498Szrj 
489*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
490*38fd1498Szrj      they are latency*2. */
491*38fd1498Szrj   2,				     /* cost for loading QImode using movzbl */
492*38fd1498Szrj   {4, 4, 4},				/* cost of loading integer registers
493*38fd1498Szrj 					   in QImode, HImode and SImode.
494*38fd1498Szrj 					   Relative to reg-reg move (2).  */
495*38fd1498Szrj   {2, 2, 2},				/* cost of storing integer registers */
496*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
497*38fd1498Szrj   {2, 2, 6},				/* cost of loading fp registers
498*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
499*38fd1498Szrj   {4, 4, 6},				/* cost of storing fp registers
500*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
501*38fd1498Szrj   2,					/* cost of moving MMX register */
502*38fd1498Szrj   {2, 2},				/* cost of loading MMX registers
503*38fd1498Szrj 					   in SImode and DImode */
504*38fd1498Szrj   {2, 2},				/* cost of storing MMX registers
505*38fd1498Szrj 					   in SImode and DImode */
506*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
507*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
508*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
509*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned loads (same values as the aligned loads).  */
510*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
511*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
512*38fd1498Szrj   {4, 8, 16, 32, 64},			/* cost of unaligned stores (same values as the aligned stores).  */
513*38fd1498Szrj   3, 3,					/* SSE->integer and integer->SSE moves */
514*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
515*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
516*38fd1498Szrj   8,					/* size of l1 cache.  */
517*38fd1498Szrj   256,					/* size of l2 cache  */
518*38fd1498Szrj   32,					/* size of prefetch block */
519*38fd1498Szrj   6,					/* number of parallel prefetches */
520*38fd1498Szrj   2,					/* Branch cost */
521*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
522*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
523*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
524*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
525*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
526*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
527*38fd1498Szrj   /* Apart from ADDSS/SD, the SSE costs below differ from the x87 values above.  */
528*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
529*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
530*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
531*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
532*38fd1498Szrj   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
533*38fd1498Szrj   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
534*38fd1498Szrj   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
535*38fd1498Szrj   COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
536*38fd1498Szrj   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
537*38fd1498Szrj   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
538*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
539*38fd1498Szrj   pentiumpro_memcpy,			/* memcpy stringop table */
540*38fd1498Szrj   pentiumpro_memset,			/* memset stringop table */
541*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
542*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
543*38fd1498Szrj };
544*38fd1498Szrj 
/* memcpy expansion table for Geode.  The first element lists libcall as
   its leading algorithm, then pairs 256 with rep_prefix_4_byte and -1 with
   libcall; the second element is DUMMY_STRINGOP_ALGS, which names only
   libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
545*38fd1498Szrj static stringop_algs geode_memcpy[2] = {
546*38fd1498Szrj   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
547*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
/* memset expansion table for Geode; the initializer is identical to the
   one used for geode_memcpy: 256 paired with rep_prefix_4_byte, -1 paired
   with libcall, and DUMMY_STRINGOP_ALGS as the second element.
   NOTE(review): presumably the numbers are maximum block sizes in bytes
   and the two elements are the 32-bit and 64-bit variants -- confirm
   against the declaration of stringop_algs.  */
548*38fd1498Szrj static stringop_algs geode_memset[2] = {
549*38fd1498Szrj   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
550*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
/* Operation and move cost table for Geode.  The initializers are
   positional, so their order must match the member order of
   struct processor_costs (declared outside this file section); the
   comment beside each value names the member it fills.  The table ends
   with the geode_memcpy/geode_memset strategy tables and the two
   conditional-branch costs.  */
551*38fd1498Szrj static const
552*38fd1498Szrj struct processor_costs geode_cost = {
553*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
554*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
555*38fd1498Szrj   COSTS_N_INSNS (2),			/* variable shift costs */
556*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
557*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
558*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
559*38fd1498Szrj    COSTS_N_INSNS (7),			/*				 SI */
560*38fd1498Szrj    COSTS_N_INSNS (7),			/*				 DI */
561*38fd1498Szrj    COSTS_N_INSNS (7)},			/*			      other */
562*38fd1498Szrj   0,					/* cost of multiply per each bit set */
563*38fd1498Szrj   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
564*38fd1498Szrj    COSTS_N_INSNS (23),			/*			    HI */
565*38fd1498Szrj    COSTS_N_INSNS (39),			/*			    SI */
566*38fd1498Szrj    COSTS_N_INSNS (39),			/*			    DI */
567*38fd1498Szrj    COSTS_N_INSNS (39)},			/*			    other */
568*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
569*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
570*38fd1498Szrj   8,					/* "large" insn */
571*38fd1498Szrj   4,					/* MOVE_RATIO */
572*38fd1498Szrj 
573*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
574*38fd1498Szrj      they are latency*2. */
575*38fd1498Szrj   2,				     /* cost for loading QImode using movzbl */
576*38fd1498Szrj   {2, 2, 2},				/* cost of loading integer registers
577*38fd1498Szrj 					   in QImode, HImode and SImode.
578*38fd1498Szrj 					   Relative to reg-reg move (2).  */
579*38fd1498Szrj   {2, 2, 2},				/* cost of storing integer registers */
580*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
581*38fd1498Szrj   {2, 2, 2},				/* cost of loading fp registers
582*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
583*38fd1498Szrj   {4, 6, 6},				/* cost of storing fp registers
584*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
585*38fd1498Szrj 
586*38fd1498Szrj   2,					/* cost of moving MMX register */
587*38fd1498Szrj   {2, 2},				/* cost of loading MMX registers
588*38fd1498Szrj 					   in SImode and DImode */
589*38fd1498Szrj   {2, 2},				/* cost of storing MMX registers
590*38fd1498Szrj 					   in SImode and DImode */
591*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
592*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
593*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
594*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
595*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
596*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
597*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
598*38fd1498Szrj   6, 6,					/* SSE->integer and integer->SSE moves */
599*38fd1498Szrj   2, 2,					/* Gather load static, per_elt.  */
600*38fd1498Szrj   2, 2,					/* Gather store static, per_elt.  */
601*38fd1498Szrj   64,					/* size of l1 cache.  */
602*38fd1498Szrj   128,					/* size of l2 cache.  */
603*38fd1498Szrj   32,					/* size of prefetch block */
604*38fd1498Szrj   1,					/* number of parallel prefetches */
605*38fd1498Szrj   1,					/* Branch cost */
606*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
607*38fd1498Szrj   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
608*38fd1498Szrj   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
609*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
610*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
611*38fd1498Szrj   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
612*38fd1498Szrj 
613*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
614*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
615*38fd1498Szrj   COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
616*38fd1498Szrj   COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
617*38fd1498Szrj   COSTS_N_INSNS (17),			/* cost of FMA SS instruction.  */
618*38fd1498Szrj   COSTS_N_INSNS (17),			/* cost of FMA SD instruction.  */
619*38fd1498Szrj   COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
620*38fd1498Szrj   COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
621*38fd1498Szrj   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
622*38fd1498Szrj   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
623*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
624*38fd1498Szrj   geode_memcpy,
625*38fd1498Szrj   geode_memset,
626*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
627*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
628*38fd1498Szrj };
629*38fd1498Szrj 
/* memcpy expansion table for K6.  The first element lists libcall as its
   leading algorithm, then pairs 256 with rep_prefix_4_byte and -1 with
   libcall; the second element is DUMMY_STRINGOP_ALGS, which names only
   libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
630*38fd1498Szrj static stringop_algs k6_memcpy[2] = {
631*38fd1498Szrj   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
/* memset expansion table for K6; the initializer is identical to the one
   used for k6_memcpy: 256 paired with rep_prefix_4_byte, -1 paired with
   libcall, and DUMMY_STRINGOP_ALGS as the second element.
   NOTE(review): presumably the numbers are maximum block sizes in bytes
   and the two elements are the 32-bit and 64-bit variants -- confirm
   against the declaration of stringop_algs.  */
633*38fd1498Szrj static stringop_algs k6_memset[2] = {
634*38fd1498Szrj   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
/* Operation and move cost table for K6.  The initializers are positional,
   so their order must match the member order of struct processor_costs
   (declared outside this file section); the comment beside each value
   names the member it fills.  The table ends with the k6_memcpy/k6_memset
   strategy tables and the two conditional-branch costs.  */
636*38fd1498Szrj static const
637*38fd1498Szrj struct processor_costs k6_cost = {
638*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
639*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of a lea instruction */
640*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
641*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
642*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
643*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 HI */
644*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
645*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 DI */
646*38fd1498Szrj    COSTS_N_INSNS (3)},			/*			      other */
647*38fd1498Szrj   0,					/* cost of multiply per each bit set */
648*38fd1498Szrj   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
649*38fd1498Szrj    COSTS_N_INSNS (18),			/*			    HI */
650*38fd1498Szrj    COSTS_N_INSNS (18),			/*			    SI */
651*38fd1498Szrj    COSTS_N_INSNS (18),			/*			    DI */
652*38fd1498Szrj    COSTS_N_INSNS (18)},			/*			    other */
653*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of movsx */
654*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of movzx */
655*38fd1498Szrj   8,					/* "large" insn */
656*38fd1498Szrj   4,					/* MOVE_RATIO */
657*38fd1498Szrj 
658*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
659*38fd1498Szrj      they are latency*2. */
660*38fd1498Szrj   3,				     /* cost for loading QImode using movzbl */
661*38fd1498Szrj   {4, 5, 4},				/* cost of loading integer registers
662*38fd1498Szrj 					   in QImode, HImode and SImode.
663*38fd1498Szrj 					   Relative to reg-reg move (2).  */
664*38fd1498Szrj   {2, 3, 2},				/* cost of storing integer registers */
665*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
666*38fd1498Szrj   {6, 6, 6},				/* cost of loading fp registers
667*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
668*38fd1498Szrj   {4, 4, 4},				/* cost of storing fp registers
669*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
670*38fd1498Szrj   2,					/* cost of moving MMX register */
671*38fd1498Szrj   {2, 2},				/* cost of loading MMX registers
672*38fd1498Szrj 					   in SImode and DImode */
673*38fd1498Szrj   {2, 2},				/* cost of storing MMX registers
674*38fd1498Szrj 					   in SImode and DImode */
675*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
676*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
677*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
678*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
679*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
680*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
681*38fd1498Szrj   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
682*38fd1498Szrj   6, 6,					/* SSE->integer and integer->SSE moves */
683*38fd1498Szrj   2, 2,					/* Gather load static, per_elt.  */
684*38fd1498Szrj   2, 2,					/* Gather store static, per_elt.  */
685*38fd1498Szrj   32,					/* size of l1 cache.  */
686*38fd1498Szrj   32,					/* size of l2 cache.  Some models
687*38fd1498Szrj 					   have integrated l2 cache, but
688*38fd1498Szrj 					   optimizing for k6 is not important
689*38fd1498Szrj 					   enough to worry about that.  */
690*38fd1498Szrj   32,					/* size of prefetch block */
691*38fd1498Szrj   1,					/* number of parallel prefetches */
692*38fd1498Szrj   1,					/* Branch cost */
693*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
694*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
695*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
696*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
697*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
698*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
699*38fd1498Szrj 
700*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
701*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
702*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
703*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
704*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
705*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
706*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
707*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
708*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
709*38fd1498Szrj   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
710*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
711*38fd1498Szrj   k6_memcpy,
712*38fd1498Szrj   k6_memset,
713*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
714*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
715*38fd1498Szrj };
716*38fd1498Szrj 
717*38fd1498Szrj /* For some reason, Athlon copes better with the REP prefix (relative to
718*38fd1498Szrj    loops) than K8 does.  Alignment becomes important after 8 bytes for
719*38fd1498Szrj    memcpy and 128 bytes for memset.  */
/* memcpy expansion table for Athlon.  The first element lists libcall as
   its leading algorithm, then pairs 2048 with rep_prefix_4_byte and -1
   with libcall; the second element is DUMMY_STRINGOP_ALGS, which names
   only libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
720*38fd1498Szrj static stringop_algs athlon_memcpy[2] = {
721*38fd1498Szrj   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
722*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
/* memset expansion table for Athlon; the initializer is identical to the
   one used for athlon_memcpy: 2048 paired with rep_prefix_4_byte, -1
   paired with libcall, and DUMMY_STRINGOP_ALGS as the second element.
   NOTE(review): presumably the numbers are maximum block sizes in bytes
   and the two elements are the 32-bit and 64-bit variants -- confirm
   against the declaration of stringop_algs.  */
723*38fd1498Szrj static stringop_algs athlon_memset[2] = {
724*38fd1498Szrj   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
725*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
/* Operation and move cost table for Athlon.  The initializers are
   positional, so their order must match the member order of
   struct processor_costs (declared outside this file section); the
   comment beside each value names the member it fills.  The table ends
   with the athlon_memcpy/athlon_memset strategy tables and the two
   conditional-branch costs.  */
726*38fd1498Szrj static const
727*38fd1498Szrj struct processor_costs athlon_cost = {
728*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
729*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of a lea instruction */
730*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
731*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
732*38fd1498Szrj   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
733*38fd1498Szrj    COSTS_N_INSNS (5),			/*				 HI */
734*38fd1498Szrj    COSTS_N_INSNS (5),			/*				 SI */
735*38fd1498Szrj    COSTS_N_INSNS (5),			/*				 DI */
736*38fd1498Szrj    COSTS_N_INSNS (5)},			/*			      other */
737*38fd1498Szrj   0,					/* cost of multiply per each bit set */
738*38fd1498Szrj   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
739*38fd1498Szrj    COSTS_N_INSNS (26),			/*			    HI */
740*38fd1498Szrj    COSTS_N_INSNS (42),			/*			    SI */
741*38fd1498Szrj    COSTS_N_INSNS (74),			/*			    DI */
742*38fd1498Szrj    COSTS_N_INSNS (74)},			/*			    other */
743*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
744*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
745*38fd1498Szrj   8,					/* "large" insn */
746*38fd1498Szrj   9,					/* MOVE_RATIO */
747*38fd1498Szrj 
748*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
749*38fd1498Szrj      they are latency*2. */
750*38fd1498Szrj   4,				     /* cost for loading QImode using movzbl */
751*38fd1498Szrj   {3, 4, 3},				/* cost of loading integer registers
752*38fd1498Szrj 					   in QImode, HImode and SImode.
753*38fd1498Szrj 					   Relative to reg-reg move (2).  */
754*38fd1498Szrj   {3, 4, 3},				/* cost of storing integer registers */
755*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
756*38fd1498Szrj   {4, 4, 12},				/* cost of loading fp registers
757*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
758*38fd1498Szrj   {6, 6, 8},				/* cost of storing fp registers
759*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
760*38fd1498Szrj   2,					/* cost of moving MMX register */
761*38fd1498Szrj   {4, 4},				/* cost of loading MMX registers
762*38fd1498Szrj 					   in SImode and DImode */
763*38fd1498Szrj   {4, 4},				/* cost of storing MMX registers
764*38fd1498Szrj 					   in SImode and DImode */
765*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
766*38fd1498Szrj   {4, 4, 6, 12, 24},			/* cost of loading SSE registers
767*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
768*38fd1498Szrj   {4, 4, 6, 12, 24},			/* cost of unaligned loads.  */
769*38fd1498Szrj   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
770*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
771*38fd1498Szrj   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
772*38fd1498Szrj   5, 5,					/* SSE->integer and integer->SSE moves */
773*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
774*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
775*38fd1498Szrj   64,					/* size of l1 cache.  */
776*38fd1498Szrj   256,					/* size of l2 cache.  */
777*38fd1498Szrj   64,					/* size of prefetch block */
778*38fd1498Szrj   6,					/* number of parallel prefetches */
779*38fd1498Szrj   5,					/* Branch cost */
780*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
781*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
782*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
783*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
784*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
785*38fd1498Szrj   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
786*38fd1498Szrj 
787*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
788*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
789*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
790*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
791*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
792*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
793*38fd1498Szrj   /* 11-16 -- NOTE(review): presumably the latency range of DIVSS; confirm.  */
794*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
795*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
796*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
797*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
798*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
799*38fd1498Szrj   athlon_memcpy,
800*38fd1498Szrj   athlon_memset,
801*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
802*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
803*38fd1498Szrj };
804*38fd1498Szrj 
805*38fd1498Szrj /* K8 has an optimized REP instruction for medium-sized blocks, but for very
806*38fd1498Szrj    small blocks it is better to use a loop.  For large blocks, a libcall can
807*38fd1498Szrj    use non-temporal accesses and beat inline code considerably.  */
/* memcpy expansion table for K8.  Both elements list libcall as their
   leading algorithm.  The first pairs 6 with loop, 14 with unrolled_loop
   and -1 with rep_prefix_4_byte; the second pairs 16 with loop, 8192 with
   rep_prefix_8_byte and -1 with libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
808*38fd1498Szrj static stringop_algs k8_memcpy[2] = {
809*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
810*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
811*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
812*38fd1498Szrj              {-1, libcall, false}}}};
/* memset expansion table for K8.  Both elements list libcall as their
   leading algorithm.  The first pairs 8 with loop, 24 with unrolled_loop,
   2048 with rep_prefix_4_byte and -1 with libcall; the second pairs 48
   with unrolled_loop, 8192 with rep_prefix_8_byte and -1 with libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
813*38fd1498Szrj static stringop_algs k8_memset[2] = {
814*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
815*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
816*38fd1498Szrj   {libcall, {{48, unrolled_loop, false},
817*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation and move cost table for K8.  The initializers are positional,
   so their order must match the member order of struct processor_costs
   (declared outside this file section); the comment beside each value
   names the member it fills.  The table ends with the k8_memcpy/k8_memset
   strategy tables and the two conditional-branch costs.  */
818*38fd1498Szrj static const
819*38fd1498Szrj struct processor_costs k8_cost = {
820*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
821*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of a lea instruction */
822*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
823*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
824*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
825*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
826*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
827*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
828*38fd1498Szrj    COSTS_N_INSNS (5)},			/*			      other */
829*38fd1498Szrj   0,					/* cost of multiply per each bit set */
830*38fd1498Szrj   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
831*38fd1498Szrj    COSTS_N_INSNS (26),			/*			    HI */
832*38fd1498Szrj    COSTS_N_INSNS (42),			/*			    SI */
833*38fd1498Szrj    COSTS_N_INSNS (74),			/*			    DI */
834*38fd1498Szrj    COSTS_N_INSNS (74)},			/*			    other */
835*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
836*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
837*38fd1498Szrj   8,					/* "large" insn */
838*38fd1498Szrj   9,					/* MOVE_RATIO */
839*38fd1498Szrj 
840*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
841*38fd1498Szrj      they are latency*2. */
842*38fd1498Szrj   4,				     /* cost for loading QImode using movzbl */
843*38fd1498Szrj   {3, 4, 3},				/* cost of loading integer registers
844*38fd1498Szrj 					   in QImode, HImode and SImode.
845*38fd1498Szrj 					   Relative to reg-reg move (2).  */
846*38fd1498Szrj   {3, 4, 3},				/* cost of storing integer registers */
847*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
848*38fd1498Szrj   {4, 4, 12},				/* cost of loading fp registers
849*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
850*38fd1498Szrj   {6, 6, 8},				/* cost of storing fp registers
851*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
852*38fd1498Szrj   2,					/* cost of moving MMX register */
853*38fd1498Szrj   {3, 3},				/* cost of loading MMX registers
854*38fd1498Szrj 					   in SImode and DImode */
855*38fd1498Szrj   {4, 4},				/* cost of storing MMX registers
856*38fd1498Szrj 					   in SImode and DImode */
857*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
858*38fd1498Szrj   {4, 3, 6, 12, 24},			/* cost of loading SSE registers
859*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
860*38fd1498Szrj   {4, 3, 6, 12, 24},			/* cost of unaligned loads.  */
861*38fd1498Szrj   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
862*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
863*38fd1498Szrj   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
864*38fd1498Szrj   5, 5,					/* SSE->integer and integer->SSE moves */
865*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
866*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
867*38fd1498Szrj   64,					/* size of l1 cache.  */
868*38fd1498Szrj   512,					/* size of l2 cache.  */
869*38fd1498Szrj   64,					/* size of prefetch block */
870*38fd1498Szrj   /* New AMD processors never drop prefetches; if they cannot be performed
871*38fd1498Szrj      immediately, they are queued.  We set number of simultaneous prefetches
872*38fd1498Szrj      to a large constant to reflect this (it probably is not a good idea not
873*38fd1498Szrj      to limit number of prefetches at all, as their execution also takes some
874*38fd1498Szrj      time).  */
875*38fd1498Szrj   100,					/* number of parallel prefetches */
876*38fd1498Szrj   3,					/* Branch cost */
877*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
878*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
879*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
880*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
881*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
882*38fd1498Szrj   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
883*38fd1498Szrj 
884*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
885*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
886*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
887*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
888*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
889*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
890*38fd1498Szrj   /* 11-16 -- NOTE(review): presumably the latency range of DIVSS; confirm.  */
891*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
892*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
893*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
894*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
895*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
896*38fd1498Szrj   k8_memcpy,
897*38fd1498Szrj   k8_memset,
898*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
899*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
900*38fd1498Szrj };
901*38fd1498Szrj 
902*38fd1498Szrj /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
903*38fd1498Szrj    very small blocks it is better to use a loop.  For large blocks, a libcall
904*38fd1498Szrj    can use non-temporal accesses and beat inline code considerably.  */
/* memcpy expansion table for AMDFAM10.  Both elements list libcall as
   their leading algorithm.  The first pairs 6 with loop, 14 with
   unrolled_loop and -1 with rep_prefix_4_byte; the second pairs 16 with
   loop, 8192 with rep_prefix_8_byte and -1 with libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
905*38fd1498Szrj static stringop_algs amdfam10_memcpy[2] = {
906*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
907*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
908*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909*38fd1498Szrj              {-1, libcall, false}}}};
/* memset expansion table for AMDFAM10.  Both elements list libcall as
   their leading algorithm.  The first pairs 8 with loop, 24 with
   unrolled_loop, 2048 with rep_prefix_4_byte and -1 with libcall; the
   second pairs 48 with unrolled_loop, 8192 with rep_prefix_8_byte and -1
   with libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
910*38fd1498Szrj static stringop_algs amdfam10_memset[2] = {
911*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
912*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
913*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
914*38fd1498Szrj              {-1, libcall, false}}}};
/* Operation and move cost table for AMDFAM10.  The initializers are
   positional, so their order must match the member order of
   struct processor_costs (declared outside this file section); the
   comment beside each value names the member it fills.  The table ends
   with the amdfam10_memcpy/amdfam10_memset strategy tables and the two
   conditional-branch costs.
   NOTE(review): unlike geode_cost, k6_cost, athlon_cost and k8_cost, this
   table is declared without "static const", so it is a writable object
   with external linkage.  Confirm that nothing relies on that before
   aligning it with the other tables.  */
915*38fd1498Szrj struct processor_costs amdfam10_cost = {
916*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
917*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of a lea instruction */
918*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
919*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
920*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
921*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
922*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
923*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
924*38fd1498Szrj    COSTS_N_INSNS (5)},			/*			      other */
925*38fd1498Szrj   0,					/* cost of multiply per each bit set */
926*38fd1498Szrj   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
927*38fd1498Szrj    COSTS_N_INSNS (35),			/*			    HI */
928*38fd1498Szrj    COSTS_N_INSNS (51),			/*			    SI */
929*38fd1498Szrj    COSTS_N_INSNS (83),			/*			    DI */
930*38fd1498Szrj    COSTS_N_INSNS (83)},			/*			    other */
931*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
932*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
933*38fd1498Szrj   8,					/* "large" insn */
934*38fd1498Szrj   9,					/* MOVE_RATIO */
935*38fd1498Szrj 
936*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
937*38fd1498Szrj      they are latency*2. */
938*38fd1498Szrj   4,				     /* cost for loading QImode using movzbl */
939*38fd1498Szrj   {3, 4, 3},				/* cost of loading integer registers
940*38fd1498Szrj 					   in QImode, HImode and SImode.
941*38fd1498Szrj 					   Relative to reg-reg move (2).  */
942*38fd1498Szrj   {3, 4, 3},				/* cost of storing integer registers */
943*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
944*38fd1498Szrj   {4, 4, 12},				/* cost of loading fp registers
945*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode */
946*38fd1498Szrj   {6, 6, 8},				/* cost of storing fp registers
947*38fd1498Szrj  		   			   in SFmode, DFmode and XFmode */
948*38fd1498Szrj   2,					/* cost of moving MMX register */
949*38fd1498Szrj   {3, 3},				/* cost of loading MMX registers
950*38fd1498Szrj 					   in SImode and DImode */
951*38fd1498Szrj   {4, 4},				/* cost of storing MMX registers
952*38fd1498Szrj 					   in SImode and DImode */
953*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
954*38fd1498Szrj   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
955*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
956*38fd1498Szrj   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
  /* NOTE(review): the 256-bit unaligned-load cost (7) differs from the
     aligned-load cost (6) above, while the other four entries match --
     confirm this is intentional.  */
957*38fd1498Szrj   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
958*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
959*38fd1498Szrj   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
960*38fd1498Szrj   3, 3,					/* SSE->integer and integer->SSE moves */
961*38fd1498Szrj   					/* On K8:
962*38fd1498Szrj   					    MOVD reg64, xmmreg Double FSTORE 4
963*38fd1498Szrj 					    MOVD reg32, xmmreg Double FSTORE 4
964*38fd1498Szrj 					   On AMDFAM10:
965*38fd1498Szrj 					    MOVD reg64, xmmreg Double FADD 3
966*38fd1498Szrj 							       1/1  1/1
967*38fd1498Szrj 					    MOVD reg32, xmmreg Double FADD 3
968*38fd1498Szrj 							       1/1  1/1 */
969*38fd1498Szrj   4, 4,					/* Gather load static, per_elt.  */
970*38fd1498Szrj   4, 4,					/* Gather store static, per_elt.  */
971*38fd1498Szrj   64,					/* size of l1 cache.  */
972*38fd1498Szrj   512,					/* size of l2 cache.  */
973*38fd1498Szrj   64,					/* size of prefetch block */
974*38fd1498Szrj   /* New AMD processors never drop prefetches; if they cannot be performed
975*38fd1498Szrj      immediately, they are queued.  We set number of simultaneous prefetches
976*38fd1498Szrj      to a large constant to reflect this (it probably is not a good idea not
977*38fd1498Szrj      to limit number of prefetches at all, as their execution also takes some
978*38fd1498Szrj      time).  */
979*38fd1498Szrj   100,					/* number of parallel prefetches */
980*38fd1498Szrj   2,					/* Branch cost */
981*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
982*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
983*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
984*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
985*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
986*38fd1498Szrj   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
987*38fd1498Szrj 
988*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
989*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
990*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
991*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
992*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
993*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
994*38fd1498Szrj   /* 11-16 -- NOTE(review): presumably the latency range of DIVSS; confirm.  */
995*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
996*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
997*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
998*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
999*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1000*38fd1498Szrj   amdfam10_memcpy,
1001*38fd1498Szrj   amdfam10_memset,
1002*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1003*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1004*38fd1498Szrj };
1005*38fd1498Szrj 
1006*38fd1498Szrj /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1007*38fd1498Szrj    very small blocks it is better to use a loop.  For large blocks, a libcall
1008*38fd1498Szrj    can use non-temporal accesses and beat inline code considerably.  */
/* memcpy expansion table for BDVER1.  Both elements list libcall as their
   leading algorithm.  The first pairs 6 with loop, 14 with unrolled_loop
   and -1 with rep_prefix_4_byte; the second pairs 16 with loop, 8192 with
   rep_prefix_8_byte and -1 with libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
1009*38fd1498Szrj static stringop_algs bdver1_memcpy[2] = {
1010*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1011*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
1012*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1013*38fd1498Szrj              {-1, libcall, false}}}};
/* memset expansion table for BDVER1.  Both elements list libcall as their
   leading algorithm.  The first pairs 8 with loop, 24 with unrolled_loop,
   2048 with rep_prefix_4_byte and -1 with libcall; the second pairs 48
   with unrolled_loop, 8192 with rep_prefix_8_byte and -1 with libcall.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code, the numbers are maximum block sizes in bytes and -1 means "any
   larger size" -- confirm against the declaration of stringop_algs.  */
1014*38fd1498Szrj static stringop_algs bdver1_memset[2] = {
1015*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1016*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1017*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1018*38fd1498Szrj              {-1, libcall, false}}}};
1019*38fd1498Szrj 
/* Cost table for BDVER1 tuning.  The initializer is positional: every entry
   must stay in the order of the fields of struct processor_costs (declared
   elsewhere), and the trailing comment on each entry names the field it
   fills.  Instruction costs are expressed relative to an add.  */
1020*38fd1498Szrj const struct processor_costs bdver1_cost = {
1021*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1022*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1023*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1024*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1025*38fd1498Szrj   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1026*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1027*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 SI */
1028*38fd1498Szrj    COSTS_N_INSNS (6),			/*				 DI */
1029*38fd1498Szrj    COSTS_N_INSNS (6)},			/*			      other */
1030*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1031*38fd1498Szrj   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1032*38fd1498Szrj    COSTS_N_INSNS (35),			/*			    HI */
1033*38fd1498Szrj    COSTS_N_INSNS (51),			/*			    SI */
1034*38fd1498Szrj    COSTS_N_INSNS (83),			/*			    DI */
1035*38fd1498Szrj    COSTS_N_INSNS (83)},			/*			    other */
1036*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1037*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1038*38fd1498Szrj   8,					/* "large" insn */
1039*38fd1498Szrj   9,					/* MOVE_RATIO */
1040*38fd1498Szrj 
1041*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1042*38fd1498Szrj      they are latency*2. */
1043*38fd1498Szrj   8,				     /* cost for loading QImode using movzbl */
1044*38fd1498Szrj   {8, 8, 8},				/* cost of loading integer registers
1045*38fd1498Szrj 					   in QImode, HImode and SImode.
1046*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1047*38fd1498Szrj   {8, 8, 8},				/* cost of storing integer registers */
1048*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
1049*38fd1498Szrj   {12, 12, 28},				/* cost of loading fp registers
1050*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode */
1051*38fd1498Szrj   {10, 10, 18},				/* cost of storing fp registers
1052*38fd1498Szrj  		   			   in SFmode, DFmode and XFmode */
1053*38fd1498Szrj   4,					/* cost of moving MMX register */
1054*38fd1498Szrj   {12, 12},				/* cost of loading MMX registers
1055*38fd1498Szrj 					   in SImode and DImode */
1056*38fd1498Szrj   {10, 10},				/* cost of storing MMX registers
1057*38fd1498Szrj 					   in SImode and DImode */
1058*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1059*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1060*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1061*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1062*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1063*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1064*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1065*38fd1498Szrj   16, 20,				/* SSE->integer and integer->SSE moves */
1066*38fd1498Szrj   12, 12,				/* Gather load static, per_elt.  */
1067*38fd1498Szrj   10, 10,				/* Gather store static, per_elt.  */
1068*38fd1498Szrj   16,					/* size of l1 cache.  */
1069*38fd1498Szrj   2048,					/* size of l2 cache.  */
1070*38fd1498Szrj   64,					/* size of prefetch block */
1071*38fd1498Szrj   /* New AMD processors never drop prefetches; if they cannot be performed
1072*38fd1498Szrj      immediately, they are queued.  We set number of simultaneous prefetches
1073*38fd1498Szrj      to a large constant to reflect this (it probably is not a good idea not
1074*38fd1498Szrj      to limit number of prefetches at all, as their execution also takes some
1075*38fd1498Szrj      time).  */
1076*38fd1498Szrj   100,					/* number of parallel prefetches */
1077*38fd1498Szrj   2,					/* Branch cost */
1078*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1079*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1080*38fd1498Szrj   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1081*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1082*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1083*38fd1498Szrj   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1084*38fd1498Szrj 
1085*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1086*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1087*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1088*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1089*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1090*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1091*38fd1498Szrj   /* 9-24: presumably the latency range in cycles; the upper bound is used.  */
1092*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1093*38fd1498Szrj   /* 9-27: presumably the latency range in cycles; the upper bound is used.  */
1094*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1095*38fd1498Szrj   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1096*38fd1498Szrj   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1097*38fd1498Szrj   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1098*38fd1498Szrj   bdver1_memcpy,
1099*38fd1498Szrj   bdver1_memset,
1100*38fd1498Szrj   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1101*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1102*38fd1498Szrj };
1103*38fd1498Szrj 
1104*38fd1498Szrj /*  BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1105*38fd1498Szrj     very small blocks it is better to use a loop.  For large blocks, a libcall
1106*38fd1498Szrj     can do non-temporal accesses and beat inline code considerably.
    Each of the two entries below names a leading algorithm followed by
    {bound, algorithm, flag} triples; the bound is apparently an upper block
    size, with -1 closing the list.
    NOTE(review): presumably entry [0] serves 32-bit code and entry [1]
    64-bit code (only the latter uses rep_prefix_8_byte) -- confirm against
    the declaration of stringop_algs.  */
1107*38fd1498Szrj 
1108*38fd1498Szrj static stringop_algs bdver2_memcpy[2] = {
1109*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1110*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
1111*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1112*38fd1498Szrj              {-1, libcall, false}}}};
/* memset strategy table for BDVER2.  Each of the two entries names a leading
   algorithm followed by {bound, algorithm, flag} triples; the bound is
   apparently an upper block size, with -1 closing the list.  Both entries
   finish with a libcall range.
   NOTE(review): presumably entry [0] serves 32-bit code and entry [1] 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1113*38fd1498Szrj static stringop_algs bdver2_memset[2] = {
1114*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1115*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1116*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1117*38fd1498Szrj              {-1, libcall, false}}}};
1118*38fd1498Szrj 
/* Cost table for BDVER2 tuning.  The initializer is positional: every entry
   must stay in the order of the fields of struct processor_costs (declared
   elsewhere), and the trailing comment on each entry names the field it
   fills.  Instruction costs are expressed relative to an add.  */
1119*38fd1498Szrj const struct processor_costs bdver2_cost = {
1120*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1121*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1122*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1123*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1124*38fd1498Szrj   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1125*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1126*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 SI */
1127*38fd1498Szrj    COSTS_N_INSNS (6),			/*				 DI */
1128*38fd1498Szrj    COSTS_N_INSNS (6)},			/*			      other */
1129*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1130*38fd1498Szrj   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1131*38fd1498Szrj    COSTS_N_INSNS (35),			/*			    HI */
1132*38fd1498Szrj    COSTS_N_INSNS (51),			/*			    SI */
1133*38fd1498Szrj    COSTS_N_INSNS (83),			/*			    DI */
1134*38fd1498Szrj    COSTS_N_INSNS (83)},			/*			    other */
1135*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1136*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1137*38fd1498Szrj   8,					/* "large" insn */
1138*38fd1498Szrj   9,					/* MOVE_RATIO */
1139*38fd1498Szrj 
1140*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1141*38fd1498Szrj      they are latency*2. */
1142*38fd1498Szrj   8,				     /* cost for loading QImode using movzbl */
1143*38fd1498Szrj   {8, 8, 8},				/* cost of loading integer registers
1144*38fd1498Szrj 					   in QImode, HImode and SImode.
1145*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1146*38fd1498Szrj   {8, 8, 8},				/* cost of storing integer registers */
1147*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
1148*38fd1498Szrj   {12, 12, 28},				/* cost of loading fp registers
1149*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode */
1150*38fd1498Szrj   {10, 10, 18},				/* cost of storing fp registers
1151*38fd1498Szrj  		   			   in SFmode, DFmode and XFmode */
1152*38fd1498Szrj   4,					/* cost of moving MMX register */
1153*38fd1498Szrj   {12, 12},				/* cost of loading MMX registers
1154*38fd1498Szrj 					   in SImode and DImode */
1155*38fd1498Szrj   {10, 10},				/* cost of storing MMX registers
1156*38fd1498Szrj 					   in SImode and DImode */
1157*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1158*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1159*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1160*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1161*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1162*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1163*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1164*38fd1498Szrj   16, 20,				/* SSE->integer and integer->SSE moves */
1165*38fd1498Szrj   12, 12,				/* Gather load static, per_elt.  */
1166*38fd1498Szrj   10, 10,				/* Gather store static, per_elt.  */
1167*38fd1498Szrj   16,					/* size of l1 cache.  */
1168*38fd1498Szrj   2048,					/* size of l2 cache.  */
1169*38fd1498Szrj   64,					/* size of prefetch block */
1170*38fd1498Szrj   /* New AMD processors never drop prefetches; if they cannot be performed
1171*38fd1498Szrj      immediately, they are queued.  We set number of simultaneous prefetches
1172*38fd1498Szrj      to a large constant to reflect this (it probably is not a good idea not
1173*38fd1498Szrj      to limit number of prefetches at all, as their execution also takes some
1174*38fd1498Szrj      time).  */
1175*38fd1498Szrj   100,					/* number of parallel prefetches */
1176*38fd1498Szrj   2,					/* Branch cost */
1177*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1178*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1179*38fd1498Szrj   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1180*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1181*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1182*38fd1498Szrj   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1183*38fd1498Szrj 
1184*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1185*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1186*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1187*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1188*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1189*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1190*38fd1498Szrj   /* 9-24: presumably the latency range in cycles; the upper bound is used.  */
1191*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1192*38fd1498Szrj   /* 9-27: presumably the latency range in cycles; the upper bound is used.  */
1193*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1194*38fd1498Szrj   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1195*38fd1498Szrj   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1196*38fd1498Szrj   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1197*38fd1498Szrj   bdver2_memcpy,
1198*38fd1498Szrj   bdver2_memset,
1199*38fd1498Szrj   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1200*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1201*38fd1498Szrj };
1202*38fd1498Szrj 
1203*38fd1498Szrj 
1204*38fd1498Szrj   /*  BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1205*38fd1498Szrj       very small blocks it is better to use a loop.  For large blocks, a libcall
1206*38fd1498Szrj       can do non-temporal accesses and beat inline code considerably.
      Each of the two entries below names a leading algorithm followed by
      {bound, algorithm, flag} triples; the bound is apparently an upper
      block size, with -1 closing the list.
      NOTE(review): presumably entry [0] serves 32-bit code and entry [1]
      64-bit code (only the latter uses rep_prefix_8_byte) -- confirm
      against the declaration of stringop_algs.  */
1207*38fd1498Szrj static stringop_algs bdver3_memcpy[2] = {
1208*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1209*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
1210*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1211*38fd1498Szrj              {-1, libcall, false}}}};
/* memset strategy table for BDVER3.  Each of the two entries names a leading
   algorithm followed by {bound, algorithm, flag} triples; the bound is
   apparently an upper block size, with -1 closing the list.  Both entries
   finish with a libcall range.
   NOTE(review): presumably entry [0] serves 32-bit code and entry [1] 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1212*38fd1498Szrj static stringop_algs bdver3_memset[2] = {
1213*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1214*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1215*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1216*38fd1498Szrj              {-1, libcall, false}}}};
/* Cost table for BDVER3 tuning.  The initializer is positional: every entry
   must stay in the order of the fields of struct processor_costs (declared
   elsewhere), and the trailing comment on each entry names the field it
   fills.  Instruction costs are expressed relative to an add.
   NOTE(review): unlike bdver1_cost and bdver2_cost this table is not
   declared const -- confirm whether that is intentional.  */
1217*38fd1498Szrj struct processor_costs bdver3_cost = {
1218*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1219*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1220*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1221*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1222*38fd1498Szrj   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1223*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1224*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 SI */
1225*38fd1498Szrj    COSTS_N_INSNS (6),			/*				 DI */
1226*38fd1498Szrj    COSTS_N_INSNS (6)},			/*			      other */
1227*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1228*38fd1498Szrj   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1229*38fd1498Szrj    COSTS_N_INSNS (35),			/*			    HI */
1230*38fd1498Szrj    COSTS_N_INSNS (51),			/*			    SI */
1231*38fd1498Szrj    COSTS_N_INSNS (83),			/*			    DI */
1232*38fd1498Szrj    COSTS_N_INSNS (83)},			/*			    other */
1233*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1234*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1235*38fd1498Szrj   8,					/* "large" insn */
1236*38fd1498Szrj   9,					/* MOVE_RATIO */
1237*38fd1498Szrj 
1238*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1239*38fd1498Szrj      they are latency*2. */
1240*38fd1498Szrj   8,				     /* cost for loading QImode using movzbl */
1241*38fd1498Szrj   {8, 8, 8},				/* cost of loading integer registers
1242*38fd1498Szrj 					   in QImode, HImode and SImode.
1243*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1244*38fd1498Szrj   {8, 8, 8},				/* cost of storing integer registers */
1245*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
1246*38fd1498Szrj   {12, 12, 28},				/* cost of loading fp registers
1247*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode */
1248*38fd1498Szrj   {10, 10, 18},				/* cost of storing fp registers
1249*38fd1498Szrj  		   			   in SFmode, DFmode and XFmode */
1250*38fd1498Szrj   4,					/* cost of moving MMX register */
1251*38fd1498Szrj   {12, 12},				/* cost of loading MMX registers
1252*38fd1498Szrj 					   in SImode and DImode */
1253*38fd1498Szrj   {10, 10},				/* cost of storing MMX registers
1254*38fd1498Szrj 					   in SImode and DImode */
1255*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1256*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1257*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1258*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1259*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1260*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1261*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1262*38fd1498Szrj   16, 20,				/* SSE->integer and integer->SSE moves */
1263*38fd1498Szrj   12, 12,				/* Gather load static, per_elt.  */
1264*38fd1498Szrj   10, 10,				/* Gather store static, per_elt.  */
1265*38fd1498Szrj   16,					/* size of l1 cache.  */
1266*38fd1498Szrj   2048,					/* size of l2 cache.  */
1267*38fd1498Szrj   64,					/* size of prefetch block */
1268*38fd1498Szrj   /* New AMD processors never drop prefetches; if they cannot be performed
1269*38fd1498Szrj      immediately, they are queued.  We set number of simultaneous prefetches
1270*38fd1498Szrj      to a large constant to reflect this (it probably is not a good idea not
1271*38fd1498Szrj      to limit number of prefetches at all, as their execution also takes some
1272*38fd1498Szrj      time).  */
1273*38fd1498Szrj   100,					/* number of parallel prefetches */
1274*38fd1498Szrj   2,					/* Branch cost */
1275*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1276*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1277*38fd1498Szrj   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1278*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1279*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1280*38fd1498Szrj   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1281*38fd1498Szrj 
1282*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1283*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1284*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1285*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1286*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1287*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1288*38fd1498Szrj   /* 9-24: presumably the latency range in cycles; the upper bound is used.  */
1289*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1290*38fd1498Szrj   /* 9-27: presumably the latency range in cycles; the upper bound is used.  */
1291*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1292*38fd1498Szrj   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1293*38fd1498Szrj   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1294*38fd1498Szrj   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1295*38fd1498Szrj   bdver3_memcpy,
1296*38fd1498Szrj   bdver3_memset,
1297*38fd1498Szrj   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1298*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1299*38fd1498Szrj };
1300*38fd1498Szrj 
1301*38fd1498Szrj /*  BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1302*38fd1498Szrj     very small blocks it is better to use a loop.  For large blocks, a libcall
1303*38fd1498Szrj     can do non-temporal accesses and beat inline code considerably.
    Each of the two entries below names a leading algorithm followed by
    {bound, algorithm, flag} triples; the bound is apparently an upper block
    size, with -1 closing the list.
    NOTE(review): presumably entry [0] serves 32-bit code and entry [1]
    64-bit code (only the latter uses rep_prefix_8_byte) -- confirm against
    the declaration of stringop_algs.  */
1304*38fd1498Szrj static stringop_algs bdver4_memcpy[2] = {
1305*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1306*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
1307*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1308*38fd1498Szrj              {-1, libcall, false}}}};
/* memset strategy table for BDVER4.  Each of the two entries names a leading
   algorithm followed by {bound, algorithm, flag} triples; the bound is
   apparently an upper block size, with -1 closing the list.  Both entries
   finish with a libcall range.
   NOTE(review): presumably entry [0] serves 32-bit code and entry [1] 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1309*38fd1498Szrj static stringop_algs bdver4_memset[2] = {
1310*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1311*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1312*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1313*38fd1498Szrj              {-1, libcall, false}}}};
/* Cost table for BDVER4 tuning.  The initializer is positional: every entry
   must stay in the order of the fields of struct processor_costs (declared
   elsewhere), and the trailing comment on each entry names the field it
   fills.  Instruction costs are expressed relative to an add.
   NOTE(review): unlike bdver1_cost and bdver2_cost this table is not
   declared const -- confirm whether that is intentional.  */
1314*38fd1498Szrj struct processor_costs bdver4_cost = {
1315*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1316*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1317*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1318*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1319*38fd1498Szrj   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1320*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1321*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 SI */
1322*38fd1498Szrj    COSTS_N_INSNS (6),			/*				 DI */
1323*38fd1498Szrj    COSTS_N_INSNS (6)},			/*			      other */
1324*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1325*38fd1498Szrj   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1326*38fd1498Szrj    COSTS_N_INSNS (35),			/*			    HI */
1327*38fd1498Szrj    COSTS_N_INSNS (51),			/*			    SI */
1328*38fd1498Szrj    COSTS_N_INSNS (83),			/*			    DI */
1329*38fd1498Szrj    COSTS_N_INSNS (83)},			/*			    other */
1330*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1331*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1332*38fd1498Szrj   8,					/* "large" insn */
1333*38fd1498Szrj   9,					/* MOVE_RATIO */
1334*38fd1498Szrj 
1335*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1336*38fd1498Szrj      they are latency*2. */
1337*38fd1498Szrj   8,				     /* cost for loading QImode using movzbl */
1338*38fd1498Szrj   {8, 8, 8},				/* cost of loading integer registers
1339*38fd1498Szrj 					   in QImode, HImode and SImode.
1340*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1341*38fd1498Szrj   {8, 8, 8},				/* cost of storing integer registers */
1342*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
1343*38fd1498Szrj   {12, 12, 28},				/* cost of loading fp registers
1344*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode */
1345*38fd1498Szrj   {10, 10, 18},				/* cost of storing fp registers
1346*38fd1498Szrj  		   			   in SFmode, DFmode and XFmode */
1347*38fd1498Szrj   4,					/* cost of moving MMX register */
1348*38fd1498Szrj   {12, 12},				/* cost of loading MMX registers
1349*38fd1498Szrj 					   in SImode and DImode */
1350*38fd1498Szrj   {10, 10},				/* cost of storing MMX registers
1351*38fd1498Szrj 					   in SImode and DImode */
1352*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1353*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1354*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1355*38fd1498Szrj   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1356*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1357*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1358*38fd1498Szrj   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1359*38fd1498Szrj   16, 20,				/* SSE->integer and integer->SSE moves */
1360*38fd1498Szrj   12, 12,				/* Gather load static, per_elt.  */
1361*38fd1498Szrj   10, 10,				/* Gather store static, per_elt.  */
1362*38fd1498Szrj   16,					/* size of l1 cache.  */
1363*38fd1498Szrj   2048,					/* size of l2 cache.  */
1364*38fd1498Szrj   64,					/* size of prefetch block */
1365*38fd1498Szrj   /* New AMD processors never drop prefetches; if they cannot be performed
1366*38fd1498Szrj      immediately, they are queued.  We set number of simultaneous prefetches
1367*38fd1498Szrj      to a large constant to reflect this (it probably is not a good idea not
1368*38fd1498Szrj      to limit number of prefetches at all, as their execution also takes some
1369*38fd1498Szrj      time).  */
1370*38fd1498Szrj   100,					/* number of parallel prefetches */
1371*38fd1498Szrj   2,					/* Branch cost */
1372*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1373*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1374*38fd1498Szrj   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1375*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1376*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1377*38fd1498Szrj   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1378*38fd1498Szrj 
1379*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1380*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1381*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1382*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1383*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1384*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1385*38fd1498Szrj   /* 9-24: presumably the latency range in cycles; the upper bound is used.  */
1386*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1387*38fd1498Szrj   /* 9-27: presumably the latency range in cycles; the upper bound is used.  */
1388*38fd1498Szrj   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1389*38fd1498Szrj   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1390*38fd1498Szrj   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1391*38fd1498Szrj   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1392*38fd1498Szrj   bdver4_memcpy,
1393*38fd1498Szrj   bdver4_memset,
1394*38fd1498Szrj   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1395*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1396*38fd1498Szrj };
1397*38fd1498Szrj 
1398*38fd1498Szrj 
1399*38fd1498Szrj /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1400*38fd1498Szrj     very small blocks it is better to use a loop.  For large blocks, a libcall
1401*38fd1498Szrj     can do non-temporal accesses and beat inline code considerably.
    Each of the two entries below names a leading algorithm followed by
    {bound, algorithm, flag} triples; the bound is apparently an upper block
    size, with -1 closing the list.
    NOTE(review): presumably entry [0] serves 32-bit code and entry [1]
    64-bit code (only the latter uses rep_prefix_8_byte) -- confirm against
    the declaration of stringop_algs.  */
1402*38fd1498Szrj static stringop_algs znver1_memcpy[2] = {
1403*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1404*38fd1498Szrj 	     {-1, rep_prefix_4_byte, false}}},
1405*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1406*38fd1498Szrj 	     {-1, libcall, false}}}};
/* memset strategy table for ZNVER1.  Each of the two entries names a leading
   algorithm followed by {bound, algorithm, flag} triples; the bound is
   apparently an upper block size, with -1 closing the list.  Both entries
   finish with a libcall range.
   NOTE(review): presumably entry [0] serves 32-bit code and entry [1] 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1407*38fd1498Szrj static stringop_algs znver1_memset[2] = {
1408*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1409*38fd1498Szrj 	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1410*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1411*38fd1498Szrj 	     {-1, libcall, false}}}};
/* Cost table for ZNVER1 tuning.  The initializer is positional: every entry
   must stay in the order of the fields of struct processor_costs (declared
   elsewhere), and the trailing comment on each entry names the field it
   fills.  Instruction costs are expressed relative to an add.
   NOTE(review): unlike bdver1_cost and bdver2_cost this table is not
   declared const -- confirm whether that is intentional.  */
1412*38fd1498Szrj struct processor_costs znver1_cost = {
1413*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1414*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1415*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs.  */
1416*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs.  */
1417*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1418*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 HI.  */
1419*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI.  */
1420*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 DI.  */
1421*38fd1498Szrj    COSTS_N_INSNS (3)},			/*			      other.  */
1422*38fd1498Szrj   0,					/* cost of multiply per each bit
1423*38fd1498Szrj 					    set.  */
1424*38fd1498Szrj    /* Depending on parameters, idiv can get faster on Ryzen.  This is the
1425*38fd1498Szrj       upper bound.  */
1426*38fd1498Szrj   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1427*38fd1498Szrj    COSTS_N_INSNS (22),			/*			    HI.  */
1428*38fd1498Szrj    COSTS_N_INSNS (30),			/*			    SI.  */
1429*38fd1498Szrj    COSTS_N_INSNS (45),			/*			    DI.  */
1430*38fd1498Szrj    COSTS_N_INSNS (45)},			/*			    other.  */
1431*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx.  */
1432*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx.  */
1433*38fd1498Szrj   8,					/* "large" insn.  */
1434*38fd1498Szrj   9,					/* MOVE_RATIO.  */
1435*38fd1498Szrj 
1436*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1437*38fd1498Szrj      they are latency*2. */
1438*38fd1498Szrj 
1439*38fd1498Szrj   /* reg-reg moves are done by renaming and thus they are even cheaper than
1440*38fd1498Szrj      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1441*38fd1498Szrj      to doubles of latencies, we do not model this correctly.  It does not
1442*38fd1498Szrj      seem to make practical difference to bump prices up even more.  */
1443*38fd1498Szrj   6,					/* cost for loading QImode using
1444*38fd1498Szrj 					   movzbl.  */
1445*38fd1498Szrj   {6, 6, 6},				/* cost of loading integer registers
1446*38fd1498Szrj 					   in QImode, HImode and SImode.
1447*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1448*38fd1498Szrj   {8, 8, 8},				/* cost of storing integer
1449*38fd1498Szrj 					   registers.  */
1450*38fd1498Szrj   2,					/* cost of reg,reg fld/fst.  */
1451*38fd1498Szrj   {6, 6, 16},				/* cost of loading fp registers
1452*38fd1498Szrj 		   			   in SFmode, DFmode and XFmode.  */
1453*38fd1498Szrj   {8, 8, 16},				/* cost of storing fp registers
1454*38fd1498Szrj  		   			   in SFmode, DFmode and XFmode.  */
1455*38fd1498Szrj   2,					/* cost of moving MMX register.  */
1456*38fd1498Szrj   {6, 6},				/* cost of loading MMX registers
1457*38fd1498Szrj 					   in SImode and DImode.  */
1458*38fd1498Szrj   {8, 8},				/* cost of storing MMX registers
1459*38fd1498Szrj 					   in SImode and DImode.  */
1460*38fd1498Szrj   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1461*38fd1498Szrj   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1462*38fd1498Szrj 					   in 32,64,128,256 and 512-bit.  */
1463*38fd1498Szrj   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
1464*38fd1498Szrj   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1465*38fd1498Szrj 					   in 32,64,128,256 and 512-bit.  */
1466*38fd1498Szrj   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1467*38fd1498Szrj   6, 6,					/* SSE->integer and integer->SSE moves.  */
1468*38fd1498Szrj   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1469*38fd1498Szrj      throughput 12.  Approx 9 uops do not depend on vector size and every load
1470*38fd1498Szrj      is 7 uops.
     NOTE(review): VGATHERDPD is named twice with different figures; the
     second mention presumably refers to another gather variant or vector
     width -- confirm.  */
1471*38fd1498Szrj   18, 8,				/* Gather load static, per_elt.  */
1472*38fd1498Szrj   18, 10,				/* Gather store static, per_elt.  */
1473*38fd1498Szrj   32,					/* size of l1 cache.  */
1474*38fd1498Szrj   512,					/* size of l2 cache.  */
1475*38fd1498Szrj   64,					/* size of prefetch block.  */
1476*38fd1498Szrj   /* New AMD processors never drop prefetches; if they cannot be performed
1477*38fd1498Szrj      immediately, they are queued.  We set number of simultaneous prefetches
1478*38fd1498Szrj      to a large constant to reflect this (it probably is not a good idea not
1479*38fd1498Szrj      to limit number of prefetches at all, as their execution also takes some
1480*38fd1498Szrj      time).  */
1481*38fd1498Szrj   100,					/* number of parallel prefetches.  */
1482*38fd1498Szrj   3,					/* Branch cost.  */
1483*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1484*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1485*38fd1498Szrj   /* Latency of fdiv is 8-15.  */
1486*38fd1498Szrj   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1487*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1488*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1489*38fd1498Szrj   /* Latency of fsqrt is 4-10.  */
1490*38fd1498Szrj   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1491*38fd1498Szrj 
1492*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1493*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1494*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1495*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1496*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1497*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1498*38fd1498Szrj   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1499*38fd1498Szrj   /* 9-13: presumably the latency range in cycles; the upper bound is used.  */
1500*38fd1498Szrj   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1501*38fd1498Szrj   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1502*38fd1498Szrj   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1503*38fd1498Szrj   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1504*38fd1498Szrj      and it can execute 2 integer additions and 2 multiplications thus
1505*38fd1498Szrj      reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks suggest
1506*38fd1498Szrj      that 4 works better than 6 probably due to register pressure.
1507*38fd1498Szrj 
1508*38fd1498Szrj      Integer vector operations are taken by FP unit and execute 3 vector
1509*38fd1498Szrj      plus/minus operations per cycle but only one multiply.  This is adjusted
1510*38fd1498Szrj      in ix86_reassociation_width.  */
1511*38fd1498Szrj   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1512*38fd1498Szrj   znver1_memcpy,
1513*38fd1498Szrj   znver1_memset,
1514*38fd1498Szrj   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1515*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1516*38fd1498Szrj };
1517*38fd1498Szrj 
1518*38fd1498Szrj /* skylake_cost should produce code tuned for the Skylake family of CPUs.  */
/* memcpy strategy table for Skylake.  Each of the two entries names a leading
   algorithm followed by {bound, algorithm, flag} triples, with -1 closing
   the list; both entries finish with a libcall range.
   NOTE(review): the bound is presumably an upper block size, and entry [0]
   presumably serves 32-bit code and entry [1] 64-bit code -- confirm
   against the declaration of stringop_algs.  */
1519*38fd1498Szrj static stringop_algs skylake_memcpy[2] =   {
1520*38fd1498Szrj   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1521*38fd1498Szrj   {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false},
1522*38fd1498Szrj              {-1, libcall, false}}}};
1523*38fd1498Szrj 
/* Inline memset strategy table referenced by skylake_cost.  Same layout as
   the memcpy tables in this file: two entries, each a leading algorithm
   plus ascending {size limit, algorithm, flag} steps ending with a -1
   limit.  Byte loops are used for the smallest limits, rep-prefixed stores
   for the middle range and libcall for the open-ended last step.
   NOTE(review): the meaning of the two entries (presumably 32-bit then
   64-bit) and of the bool flag is not visible here -- confirm against the
   declaration of stringop_algs.  */
1524*38fd1498Szrj static stringop_algs skylake_memset[2] = {
1525*38fd1498Szrj   {libcall, {{6, loop_1_byte, true},
1526*38fd1498Szrj              {24, loop, true},
1527*38fd1498Szrj              {8192, rep_prefix_4_byte, true},
1528*38fd1498Szrj              {-1, libcall, false}}},
1529*38fd1498Szrj   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false},
1530*38fd1498Szrj              {-1, libcall, false}}}};
1531*38fd1498Szrj 
/* Cost table for the Skylake tuning.  The initializer is positional, so the
   order of the values below has to match the field order of
   struct processor_costs, which is declared outside this part of the file.
   Values wrapped in COSTS_N_INSNS are expressed relative to an add; bare
   integers use the unit named in the comment next to them.  */
1532*38fd1498Szrj static const
1533*38fd1498Szrj struct processor_costs skylake_cost = {
1534*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
  /* With COSTS_N_INSNS (N) assumed to be (N)*4 (see the top of this file),
     the +1 makes lea a quarter of an insn more expensive than add.  */
1535*38fd1498Szrj   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
1536*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1537*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1538*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1539*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1540*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
1541*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 DI */
1542*38fd1498Szrj    COSTS_N_INSNS (3)},			/*			      other */
1543*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1544*38fd1498Szrj   /* Expanding div/mod currently doesn't consider parallelism. So the cost
1545*38fd1498Szrj      model is not realistic. We compensate by increasing the latencies a bit.  */
1546*38fd1498Szrj   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
1547*38fd1498Szrj    COSTS_N_INSNS (11),			/*			    HI */
1548*38fd1498Szrj    COSTS_N_INSNS (14),			/*			    SI */
1549*38fd1498Szrj    COSTS_N_INSNS (76),			/*			    DI */
1550*38fd1498Szrj    COSTS_N_INSNS (76)},			/*			    other */
1551*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1552*38fd1498Szrj   COSTS_N_INSNS (0),			/* cost of movzx */
1553*38fd1498Szrj   8,					/* "large" insn */
1554*38fd1498Szrj   17,					/* MOVE_RATIO */
1555*38fd1498Szrj 
  /* Register/memory move costs follow.
     NOTE(review): sibling tables in this file state that move costs are
     latency*2 relative to an integer reg-reg move; presumably the same
     scale applies here -- confirm.  */
1556*38fd1498Szrj   6,				     /* cost for loading QImode using movzbl */
1557*38fd1498Szrj   {4, 4, 4},				/* cost of loading integer registers
1558*38fd1498Szrj 					   in QImode, HImode and SImode.
1559*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1560*38fd1498Szrj   {6, 6, 3},				/* cost of storing integer registers */
1561*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
1562*38fd1498Szrj   {6, 6, 8},				/* cost of loading fp registers
1563*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1564*38fd1498Szrj   {6, 6, 10},				/* cost of storing fp registers
1565*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1566*38fd1498Szrj   2,					/* cost of moving MMX register */
1567*38fd1498Szrj   {6, 6},				/* cost of loading MMX registers
1568*38fd1498Szrj 					   in SImode and DImode */
1569*38fd1498Szrj   {6, 6},				/* cost of storing MMX registers
1570*38fd1498Szrj 					   in SImode and DImode */
1571*38fd1498Szrj   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1572*38fd1498Szrj   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1573*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1574*38fd1498Szrj   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
1575*38fd1498Szrj   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
1576*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1577*38fd1498Szrj   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1578*38fd1498Szrj   2, 2,					/* SSE->integer and integer->SSE moves */
1579*38fd1498Szrj   20, 8,				/* Gather load static, per_elt.  */
1580*38fd1498Szrj   22, 10,				/* Gather store static, per_elt.  */
1581*38fd1498Szrj   64,					/* size of l1 cache.  */
1582*38fd1498Szrj   512,					/* size of l2 cache.  */
1583*38fd1498Szrj   64,					/* size of prefetch block */
1584*38fd1498Szrj   6,					/* number of parallel prefetches */
1585*38fd1498Szrj   3,					/* Branch cost */
  /* x87 floating-point instruction costs.  */
1586*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
1587*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1588*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1589*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1590*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1591*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
1592*38fd1498Szrj 
  /* Scalar SSE instruction costs.  */
1593*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1594*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1595*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1596*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1597*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
1598*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
1599*38fd1498Szrj   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
1600*38fd1498Szrj   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
1601*38fd1498Szrj   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
1602*38fd1498Szrj   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
1603*38fd1498Szrj   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
  /* String operation strategy tables defined just above this initializer.  */
1604*38fd1498Szrj   skylake_memcpy,
1605*38fd1498Szrj   skylake_memset,
1606*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1607*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1608*38fd1498Szrj };
1609*38fd1498Szrj   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1610*38fd1498Szrj      very small blocks it is better to use loop. For large blocks, libcall can
1611*38fd1498Szrj      do non-temporal accesses and beat inline considerably.  */
/* Inline memcpy strategy table referenced by btver1_cost.  Two entries,
   each a leading algorithm plus ascending {size limit, algorithm, flag}
   steps closed by a -1 limit: plain and unrolled loops for the smallest
   sizes, rep-prefixed moves after that, and (second entry only) libcall
   above 8192.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1612*38fd1498Szrj static stringop_algs btver1_memcpy[2] = {
1613*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1614*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
1615*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1616*38fd1498Szrj              {-1, libcall, false}}}};
/* Inline memset strategy table referenced by btver1_cost.  Two entries,
   each a leading algorithm plus ascending {size limit, algorithm, flag}
   steps closed by a -1 limit that selects libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1617*38fd1498Szrj static stringop_algs btver1_memset[2] = {
1618*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1619*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1620*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1621*38fd1498Szrj              {-1, libcall, false}}}};
/* Cost table for the BTVER1 tuning.  The initializer is positional, so the
   order of the values below has to match the field order of
   struct processor_costs, which is declared outside this part of the file.
   Values wrapped in COSTS_N_INSNS are expressed relative to an add; bare
   integers use the unit named in the comment next to them.
   NOTE(review): unlike skylake_cost, pentium4_cost, nocona_cost and
   atom_cost this table is declared without `static' -- confirm whether
   that is intentional.  */
1622*38fd1498Szrj const struct processor_costs btver1_cost = {
1623*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1624*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1625*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1626*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1627*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1628*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1629*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
1630*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
1631*38fd1498Szrj    COSTS_N_INSNS (5)},			/*			      other */
1632*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1633*38fd1498Szrj   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1634*38fd1498Szrj    COSTS_N_INSNS (35),			/*			    HI */
1635*38fd1498Szrj    COSTS_N_INSNS (51),			/*			    SI */
1636*38fd1498Szrj    COSTS_N_INSNS (83),			/*			    DI */
1637*38fd1498Szrj    COSTS_N_INSNS (83)},			/*			    other */
1638*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1639*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1640*38fd1498Szrj   8,					/* "large" insn */
1641*38fd1498Szrj   9,					/* MOVE_RATIO */
1642*38fd1498Szrj 
1643*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1644*38fd1498Szrj      they are latency*2. */
1645*38fd1498Szrj   8,				     /* cost for loading QImode using movzbl */
1646*38fd1498Szrj   {6, 8, 6},				/* cost of loading integer registers
1647*38fd1498Szrj 					   in QImode, HImode and SImode.
1648*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1649*38fd1498Szrj   {6, 8, 6},				/* cost of storing integer registers */
1650*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
1651*38fd1498Szrj   {12, 12, 28},				/* cost of loading fp registers
1652*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1653*38fd1498Szrj   {12, 12, 38},				/* cost of storing fp registers
1654*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1655*38fd1498Szrj   4,					/* cost of moving MMX register */
1656*38fd1498Szrj   {10, 10},				/* cost of loading MMX registers
1657*38fd1498Szrj 					   in SImode and DImode */
1658*38fd1498Szrj   {12, 12},				/* cost of storing MMX registers
1659*38fd1498Szrj 					   in SImode and DImode */
1660*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1661*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of loading SSE registers
1662*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1663*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
1664*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of storing SSE registers
1665*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1666*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
1667*38fd1498Szrj   14, 14,				/* SSE->integer and integer->SSE moves */
1668*38fd1498Szrj   10, 10,				/* Gather load static, per_elt.  */
1669*38fd1498Szrj   10, 10,				/* Gather store static, per_elt.  */
1670*38fd1498Szrj   32,					/* size of l1 cache.  */
1671*38fd1498Szrj   512,					/* size of l2 cache.  */
1672*38fd1498Szrj   64,					/* size of prefetch block */
1673*38fd1498Szrj   100,					/* number of parallel prefetches */
1674*38fd1498Szrj   2,					/* Branch cost */
  /* x87 floating-point instruction costs.  */
1675*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1676*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1677*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1678*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1679*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1680*38fd1498Szrj   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1681*38fd1498Szrj 
  /* Scalar SSE instruction costs.  */
1682*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1683*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1684*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
1685*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1686*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1687*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1688*38fd1498Szrj   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1689*38fd1498Szrj   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
1690*38fd1498Szrj   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
1691*38fd1498Szrj   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
1692*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
  /* String operation strategy tables defined just above this initializer.  */
1693*38fd1498Szrj   btver1_memcpy,
1694*38fd1498Szrj   btver1_memset,
1695*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1696*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1697*38fd1498Szrj };
1698*38fd1498Szrj 
/* Inline memcpy strategy table referenced by btver2_cost.  Its contents are
   identical to btver1_memcpy: two entries, each a leading algorithm plus
   ascending {size limit, algorithm, flag} steps closed by a -1 limit.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1699*38fd1498Szrj static stringop_algs btver2_memcpy[2] = {
1700*38fd1498Szrj   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1701*38fd1498Szrj              {-1, rep_prefix_4_byte, false}}},
1702*38fd1498Szrj   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1703*38fd1498Szrj              {-1, libcall, false}}}};
/* Inline memset strategy table referenced by btver2_cost.  Its contents are
   identical to btver1_memset: two entries, each a leading algorithm plus
   ascending {size limit, algorithm, flag} steps closed by a -1 limit that
   selects libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1704*38fd1498Szrj static stringop_algs btver2_memset[2] = {
1705*38fd1498Szrj   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1706*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1707*38fd1498Szrj   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1708*38fd1498Szrj              {-1, libcall, false}}}};
/* Cost table for the BTVER2 tuning.  The initializer is positional, so the
   order of the values below has to match the field order of
   struct processor_costs, which is declared outside this part of the file.
   Values wrapped in COSTS_N_INSNS are expressed relative to an add; bare
   integers use the unit named in the comment next to them.

   Compared with btver1_cost the only values that differ are the QImode
   integer load/store costs (8 instead of 6), the l2 cache size (2048
   instead of 512) and the DIVSD, SQRTSS and SQRTSD costs.
   NOTE(review): unlike skylake_cost, pentium4_cost, nocona_cost and
   atom_cost this table is declared without `static' -- confirm whether
   that is intentional.  */
1709*38fd1498Szrj const struct processor_costs btver2_cost = {
1710*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1711*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1712*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1713*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1714*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1715*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1716*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
1717*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
1718*38fd1498Szrj    COSTS_N_INSNS (5)},			/*			      other */
1719*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1720*38fd1498Szrj   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1721*38fd1498Szrj    COSTS_N_INSNS (35),			/*			    HI */
1722*38fd1498Szrj    COSTS_N_INSNS (51),			/*			    SI */
1723*38fd1498Szrj    COSTS_N_INSNS (83),			/*			    DI */
1724*38fd1498Szrj    COSTS_N_INSNS (83)},			/*			    other */
1725*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1726*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1727*38fd1498Szrj   8,					/* "large" insn */
1728*38fd1498Szrj   9,					/* MOVE_RATIO */
1729*38fd1498Szrj 
1730*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1731*38fd1498Szrj      they are latency*2. */
1732*38fd1498Szrj   8,				     /* cost for loading QImode using movzbl */
1733*38fd1498Szrj   {8, 8, 6},				/* cost of loading integer registers
1734*38fd1498Szrj 					   in QImode, HImode and SImode.
1735*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1736*38fd1498Szrj   {8, 8, 6},				/* cost of storing integer registers */
1737*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
1738*38fd1498Szrj   {12, 12, 28},				/* cost of loading fp registers
1739*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1740*38fd1498Szrj   {12, 12, 38},				/* cost of storing fp registers
1741*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1742*38fd1498Szrj   4,					/* cost of moving MMX register */
1743*38fd1498Szrj   {10, 10},				/* cost of loading MMX registers
1744*38fd1498Szrj 					   in SImode and DImode */
1745*38fd1498Szrj   {12, 12},				/* cost of storing MMX registers
1746*38fd1498Szrj 					   in SImode and DImode */
1747*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1748*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of loading SSE registers
1749*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1750*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
1751*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of storing SSE registers
1752*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1753*38fd1498Szrj   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
1754*38fd1498Szrj   14, 14,				/* SSE->integer and integer->SSE moves */
1755*38fd1498Szrj   10, 10,				/* Gather load static, per_elt.  */
1756*38fd1498Szrj   10, 10,				/* Gather store static, per_elt.  */
1757*38fd1498Szrj   32,					/* size of l1 cache.  */
1758*38fd1498Szrj   2048,					/* size of l2 cache.  */
1759*38fd1498Szrj   64,					/* size of prefetch block */
1760*38fd1498Szrj   100,					/* number of parallel prefetches */
1761*38fd1498Szrj   2,					/* Branch cost */
  /* x87 floating-point instruction costs.  */
1762*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1763*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1764*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1765*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1766*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1767*38fd1498Szrj   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1768*38fd1498Szrj 
  /* Scalar SSE instruction costs.  */
1769*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1770*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1771*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
1772*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1773*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1774*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1775*38fd1498Szrj   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1776*38fd1498Szrj   COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
1777*38fd1498Szrj   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
1778*38fd1498Szrj   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
1779*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
  /* String operation strategy tables defined just above this initializer.  */
1780*38fd1498Szrj   btver2_memcpy,
1781*38fd1498Szrj   btver2_memset,
1782*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1783*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1784*38fd1498Szrj };
1785*38fd1498Szrj 
/* Inline memcpy strategy table referenced by pentium4_cost.  The first
   entry uses a byte loop up to the limit 12 and rep_prefix_4_byte beyond
   it.  The second entry is DUMMY_STRINGOP_ALGS, which the macro at the top
   of this file expands to a single -1 step selecting libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, the dummy marking a mode this tuning is not used for -- confirm
   against the declaration of stringop_algs.  */
1786*38fd1498Szrj static stringop_algs pentium4_memcpy[2] = {
1787*38fd1498Szrj   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1788*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
/* Inline memset strategy table referenced by pentium4_cost.  The first
   entry steps from a byte loop through a plain loop and rep_prefix_4_byte
   to libcall as the size limit grows.  The second entry is
   DUMMY_STRINGOP_ALGS, which the macro at the top of this file expands to
   a single -1 step selecting libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1789*38fd1498Szrj static stringop_algs pentium4_memset[2] = {
1790*38fd1498Szrj   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1791*38fd1498Szrj              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1792*38fd1498Szrj   DUMMY_STRINGOP_ALGS};
1793*38fd1498Szrj 
/* Cost table for the Pentium 4 tuning.  The initializer is positional, so
   the order of the values below has to match the field order of
   struct processor_costs, which is declared outside this part of the file.
   Values wrapped in COSTS_N_INSNS are expressed relative to an add; bare
   integers use the unit named in the comment next to them.  */
1794*38fd1498Szrj static const
1795*38fd1498Szrj struct processor_costs pentium4_cost = {
1796*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1797*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of a lea instruction */
1798*38fd1498Szrj   COSTS_N_INSNS (4),			/* variable shift costs */
1799*38fd1498Szrj   COSTS_N_INSNS (4),			/* constant shift costs */
1800*38fd1498Szrj   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
1801*38fd1498Szrj    COSTS_N_INSNS (15),			/*				 HI */
1802*38fd1498Szrj    COSTS_N_INSNS (15),			/*				 SI */
1803*38fd1498Szrj    COSTS_N_INSNS (15),			/*				 DI */
1804*38fd1498Szrj    COSTS_N_INSNS (15)},			/*			      other */
1805*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1806*38fd1498Szrj   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
1807*38fd1498Szrj    COSTS_N_INSNS (56),			/*			    HI */
1808*38fd1498Szrj    COSTS_N_INSNS (56),			/*			    SI */
1809*38fd1498Szrj    COSTS_N_INSNS (56),			/*			    DI */
1810*38fd1498Szrj    COSTS_N_INSNS (56)},			/*			    other */
1811*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1812*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1813*38fd1498Szrj   16,					/* "large" insn */
1814*38fd1498Szrj   6,					/* MOVE_RATIO */
1815*38fd1498Szrj 
1816*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1817*38fd1498Szrj      they are latency*2. */
1818*38fd1498Szrj   5,				     /* cost for loading QImode using movzbl */
1819*38fd1498Szrj   {4, 5, 4},				/* cost of loading integer registers
1820*38fd1498Szrj 					   in QImode, HImode and SImode.
1821*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1822*38fd1498Szrj   {2, 3, 2},				/* cost of storing integer registers */
1823*38fd1498Szrj   12,					/* cost of reg,reg fld/fst */
1824*38fd1498Szrj   {14, 14, 14},				/* cost of loading fp registers
1825*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1826*38fd1498Szrj   {14, 14, 14},				/* cost of storing fp registers
1827*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1828*38fd1498Szrj   12,					/* cost of moving MMX register */
1829*38fd1498Szrj   {16, 16},				/* cost of loading MMX registers
1830*38fd1498Szrj 					   in SImode and DImode */
1831*38fd1498Szrj   {16, 16},				/* cost of storing MMX registers
1832*38fd1498Szrj 					   in SImode and DImode */
1833*38fd1498Szrj   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
1834*38fd1498Szrj   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
1835*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1836*38fd1498Szrj   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
1837*38fd1498Szrj   {16, 16, 16, 32, 64},			/* cost of storing SSE registers
1838*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1839*38fd1498Szrj   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
1840*38fd1498Szrj   20, 12,				/* SSE->integer and integer->SSE moves */
1841*38fd1498Szrj   16, 16,				/* Gather load static, per_elt.  */
1842*38fd1498Szrj   16, 16,				/* Gather store static, per_elt.  */
1843*38fd1498Szrj   8,					/* size of l1 cache.  */
1844*38fd1498Szrj   256,					/* size of l2 cache.  */
1845*38fd1498Szrj   64,					/* size of prefetch block */
1846*38fd1498Szrj   6,					/* number of parallel prefetches */
1847*38fd1498Szrj   2,					/* Branch cost */
  /* x87 floating-point instruction costs.  */
1848*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1849*38fd1498Szrj   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
1850*38fd1498Szrj   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
1851*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1852*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1853*38fd1498Szrj   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
1854*38fd1498Szrj 
  /* Scalar SSE instruction costs.  */
1855*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1856*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1857*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1858*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1859*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1860*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1861*38fd1498Szrj   COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
1862*38fd1498Szrj   COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
1863*38fd1498Szrj   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
1864*38fd1498Szrj   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
1865*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
  /* String operation strategy tables defined just above this initializer.  */
1866*38fd1498Szrj   pentium4_memcpy,
1867*38fd1498Szrj   pentium4_memset,
1868*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1869*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1870*38fd1498Szrj };
1871*38fd1498Szrj 
/* Inline memcpy strategy table referenced by nocona_cost.  Two entries,
   each a leading algorithm plus ascending {size limit, algorithm, flag}
   steps closed by a -1 limit.  The first entry matches the first entry of
   pentium4_memcpy; the second one switches from rep_prefix_8_byte to an
   unrolled loop between the limits 20000 and 100000 before falling back to
   libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1872*38fd1498Szrj static stringop_algs nocona_memcpy[2] = {
1873*38fd1498Szrj   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1874*38fd1498Szrj   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1875*38fd1498Szrj              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1876*38fd1498Szrj 
/* Inline memset strategy table referenced by nocona_cost.  Two entries,
   each a leading algorithm plus ascending {size limit, algorithm, flag}
   steps closed by a -1 limit that selects libcall.  The first entry
   matches the first entry of pentium4_memset.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1877*38fd1498Szrj static stringop_algs nocona_memset[2] = {
1878*38fd1498Szrj   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1879*38fd1498Szrj              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1880*38fd1498Szrj   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1881*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1882*38fd1498Szrj 
/* Cost table for the Nocona tuning.  The initializer is positional, so the
   order of the values below has to match the field order of
   struct processor_costs, which is declared outside this part of the file.
   Values wrapped in COSTS_N_INSNS are expressed relative to an add; bare
   integers use the unit named in the comment next to them.  */
1883*38fd1498Szrj static const
1884*38fd1498Szrj struct processor_costs nocona_cost = {
1885*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
1886*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1887*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1888*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
1889*38fd1498Szrj   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
1890*38fd1498Szrj    COSTS_N_INSNS (10),			/*				 HI */
1891*38fd1498Szrj    COSTS_N_INSNS (10),			/*				 SI */
1892*38fd1498Szrj    COSTS_N_INSNS (10),			/*				 DI */
1893*38fd1498Szrj    COSTS_N_INSNS (10)},			/*			      other */
1894*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1895*38fd1498Szrj   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
1896*38fd1498Szrj    COSTS_N_INSNS (66),			/*			    HI */
1897*38fd1498Szrj    COSTS_N_INSNS (66),			/*			    SI */
1898*38fd1498Szrj    COSTS_N_INSNS (66),			/*			    DI */
1899*38fd1498Szrj    COSTS_N_INSNS (66)},			/*			    other */
1900*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1901*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1902*38fd1498Szrj   16,					/* "large" insn */
1903*38fd1498Szrj   17,					/* MOVE_RATIO */
1904*38fd1498Szrj 
1905*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1906*38fd1498Szrj      they are latency*2. */
1907*38fd1498Szrj   4,				     /* cost for loading QImode using movzbl */
1908*38fd1498Szrj   {4, 4, 4},				/* cost of loading integer registers
1909*38fd1498Szrj 					   in QImode, HImode and SImode.
1910*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1911*38fd1498Szrj   {4, 4, 4},				/* cost of storing integer registers */
1912*38fd1498Szrj   12,					/* cost of reg,reg fld/fst */
1913*38fd1498Szrj   {14, 14, 14},				/* cost of loading fp registers
1914*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1915*38fd1498Szrj   {14, 14, 14},				/* cost of storing fp registers
1916*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
1917*38fd1498Szrj   14,					/* cost of moving MMX register */
1918*38fd1498Szrj   {12, 12},				/* cost of loading MMX registers
1919*38fd1498Szrj 					   in SImode and DImode */
1920*38fd1498Szrj   {12, 12},				/* cost of storing MMX registers
1921*38fd1498Szrj 					   in SImode and DImode */
1922*38fd1498Szrj   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
1923*38fd1498Szrj   {12, 12, 12, 24, 48},			/* cost of loading SSE registers
1924*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1925*38fd1498Szrj   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
1926*38fd1498Szrj   {12, 12, 12, 24, 48},			/* cost of storing SSE registers
1927*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
1928*38fd1498Szrj   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
1929*38fd1498Szrj   20, 12,				/* SSE->integer and integer->SSE moves */
1930*38fd1498Szrj   12, 12,				/* Gather load static, per_elt.  */
1931*38fd1498Szrj   12, 12,				/* Gather store static, per_elt.  */
1932*38fd1498Szrj   8,					/* size of l1 cache.  */
1933*38fd1498Szrj   1024,					/* size of l2 cache.  */
1934*38fd1498Szrj   64,					/* size of prefetch block */
1935*38fd1498Szrj   8,					/* number of parallel prefetches */
1936*38fd1498Szrj   1,					/* Branch cost */
  /* x87 floating-point instruction costs.  */
1937*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1938*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1939*38fd1498Szrj   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
1940*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
1941*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
1942*38fd1498Szrj   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
1943*38fd1498Szrj 
  /* Scalar SSE instruction costs.  */
1944*38fd1498Szrj   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1945*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1946*38fd1498Szrj   COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
1947*38fd1498Szrj   COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
1948*38fd1498Szrj   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
1949*38fd1498Szrj   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
1950*38fd1498Szrj   COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
1951*38fd1498Szrj   COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
1952*38fd1498Szrj   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
1953*38fd1498Szrj   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
1954*38fd1498Szrj   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
  /* String operation strategy tables defined just above this initializer.  */
1955*38fd1498Szrj   nocona_memcpy,
1956*38fd1498Szrj   nocona_memset,
1957*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1958*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1959*38fd1498Szrj };
1960*38fd1498Szrj 
/* Inline memcpy strategy table referenced by atom_cost.  Two entries, each
   a leading algorithm plus ascending {size limit, algorithm, flag} steps
   closed by a -1 limit.  The first entry never reaches libcall through its
   steps (its -1 step selects rep_prefix_4_byte); the second does above
   8192.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1961*38fd1498Szrj static stringop_algs atom_memcpy[2] = {
1962*38fd1498Szrj   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1963*38fd1498Szrj   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1964*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Inline memset strategy table referenced by atom_cost.  Two entries, each
   a leading algorithm plus ascending {size limit, algorithm, flag} steps
   (loop, unrolled loop, rep-prefixed store) closed by a -1 limit that
   selects libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the declaration of stringop_algs.  */
1965*38fd1498Szrj static stringop_algs atom_memset[2] = {
1966*38fd1498Szrj   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1967*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1968*38fd1498Szrj   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1969*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for the Atom tuning.  The initializer is positional, so the
   order of the values below has to match the field order of
   struct processor_costs, which is declared outside this part of the file.
   Values wrapped in COSTS_N_INSNS are expressed relative to an add; bare
   integers use the unit named in the comment next to them.  */
1970*38fd1498Szrj static const
1971*38fd1498Szrj struct processor_costs atom_cost = {
1972*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
  /* With COSTS_N_INSNS (N) assumed to be (N)*4 (see the top of this file),
     the + 1 makes lea a quarter of an insn more expensive than add.  */
1973*38fd1498Szrj   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1974*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
1975*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
  /* NOTE(review): the "other" multiply cost below is lower than the QI..DI
     entries, unlike the neighbouring tables -- confirm it is intended.  */
1976*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1977*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
1978*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
1979*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
1980*38fd1498Szrj    COSTS_N_INSNS (2)},			/*			      other */
1981*38fd1498Szrj   0,					/* cost of multiply per each bit set */
1982*38fd1498Szrj   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1983*38fd1498Szrj    COSTS_N_INSNS (26),			/*			    HI */
1984*38fd1498Szrj    COSTS_N_INSNS (42),			/*			    SI */
1985*38fd1498Szrj    COSTS_N_INSNS (74),			/*			    DI */
1986*38fd1498Szrj    COSTS_N_INSNS (74)},			/*			    other */
1987*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
1988*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
1989*38fd1498Szrj   8,					/* "large" insn */
1990*38fd1498Szrj   17,					/* MOVE_RATIO */
1991*38fd1498Szrj 
1992*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
1993*38fd1498Szrj      they are latency*2. */
1994*38fd1498Szrj   6,					/* cost for loading QImode using movzbl */
1995*38fd1498Szrj   {6, 6, 6},				/* cost of loading integer registers
1996*38fd1498Szrj 					   in QImode, HImode and SImode.
1997*38fd1498Szrj 					   Relative to reg-reg move (2).  */
1998*38fd1498Szrj   {6, 6, 6},				/* cost of storing integer registers */
1999*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
2000*38fd1498Szrj   {6, 6, 18},				/* cost of loading fp registers
2001*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2002*38fd1498Szrj   {14, 14, 24},				/* cost of storing fp registers
2003*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2004*38fd1498Szrj   2,					/* cost of moving MMX register */
2005*38fd1498Szrj   {8, 8},				/* cost of loading MMX registers
2006*38fd1498Szrj 					   in SImode and DImode */
2007*38fd1498Szrj   {10, 10},				/* cost of storing MMX registers
2008*38fd1498Szrj 					   in SImode and DImode */
2009*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2010*38fd1498Szrj   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2011*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2012*38fd1498Szrj   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2013*38fd1498Szrj   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2014*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2015*38fd1498Szrj   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2016*38fd1498Szrj   8, 6,					/* SSE->integer and integer->SSE moves */
2017*38fd1498Szrj   8, 8,					/* Gather load static, per_elt.  */
2018*38fd1498Szrj   8, 8,					/* Gather store static, per_elt.  */
2019*38fd1498Szrj   32,					/* size of l1 cache.  */
2020*38fd1498Szrj   256,					/* size of l2 cache.  */
2021*38fd1498Szrj   64,					/* size of prefetch block */
2022*38fd1498Szrj   6,					/* number of parallel prefetches */
2023*38fd1498Szrj   3,					/* Branch cost */
  /* x87 floating-point instruction costs.  */
2024*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2025*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2026*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2027*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2028*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2029*38fd1498Szrj   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2030*38fd1498Szrj 
  /* Scalar SSE instruction costs.  */
2031*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2032*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2033*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2034*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2035*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2036*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2037*38fd1498Szrj   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
2038*38fd1498Szrj   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
2039*38fd1498Szrj   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
2040*38fd1498Szrj   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
2041*38fd1498Szrj   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
  /* String operation strategy tables defined just above this initializer.  */
2042*38fd1498Szrj   atom_memcpy,
2043*38fd1498Szrj   atom_memset,
2044*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2045*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2046*38fd1498Szrj };
2047*38fd1498Szrj 
/* memcpy expansion strategy table referenced by slm_cost below.
   Two entries, each of the form {ALG, {{MAX, ALG, FLAG}, ...}}; every
   inner list is ordered by increasing MAX and ends with a MAX of -1.
   NOTE(review): presumably entry [0] is used for 32-bit and entry [1]
   for 64-bit code, MAX is a block-size bound in bytes with -1 meaning
   "no bound", and the leading ALG covers sizes unknown at compile
   time -- confirm against the declaration of stringop_algs.  */
2048*38fd1498Szrj static stringop_algs slm_memcpy[2] = {
2049*38fd1498Szrj   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2050*38fd1498Szrj   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2051*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table referenced by slm_cost below.
   Same layout as the memcpy tables in this file: two entries of the
   form {ALG, {{MAX, ALG, FLAG}, ...}}, each inner list ordered by
   increasing MAX and ending with a MAX of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2052*38fd1498Szrj static stringop_algs slm_memset[2] = {
2053*38fd1498Szrj   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2054*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2055*38fd1498Szrj   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2056*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table slm_cost.
   This is a positional initializer for struct processor_costs, so the
   order of the entries must match the field order of that struct; each
   entry carries a trailing comment naming what it describes.
   COSTS_N_INSNS (N) entries are expressed relative to an add (the top
   of this file assumes COSTS_N_INSNS (N) is (N)*4); the bare integers
   in the move-cost section use the "latency*2" scale described in the
   comment inside the table.  lea is costed at COSTS_N_INSNS (1) + 1,
   i.e. marginally above an add.
   NOTE(review): "slm" presumably denotes Intel Silvermont -- confirm
   against the code that selects this table.  */
2057*38fd1498Szrj static const
2058*38fd1498Szrj struct processor_costs slm_cost = {
2059*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
2060*38fd1498Szrj   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2061*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
2062*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
2063*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2064*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 HI */
2065*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
2066*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
2067*38fd1498Szrj    COSTS_N_INSNS (2)},			/*			      other */
2068*38fd1498Szrj   0,					/* cost of multiply per each bit set */
2069*38fd1498Szrj   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2070*38fd1498Szrj    COSTS_N_INSNS (26),			/*			    HI */
2071*38fd1498Szrj    COSTS_N_INSNS (42),			/*			    SI */
2072*38fd1498Szrj    COSTS_N_INSNS (74),			/*			    DI */
2073*38fd1498Szrj    COSTS_N_INSNS (74)},			/*			    other */
2074*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
2075*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
2076*38fd1498Szrj   8,					/* "large" insn */
2077*38fd1498Szrj   17,					/* MOVE_RATIO */
2078*38fd1498Szrj 
2079*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
2080*38fd1498Szrj      they are latency*2. */
2081*38fd1498Szrj   8,					/* cost for loading QImode using movzbl */
2082*38fd1498Szrj   {8, 8, 8},				/* cost of loading integer registers
2083*38fd1498Szrj 					   in QImode, HImode and SImode.
2084*38fd1498Szrj 					   Relative to reg-reg move (2).  */
2085*38fd1498Szrj   {6, 6, 6},				/* cost of storing integer registers */
2086*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
2087*38fd1498Szrj   {8, 8, 18},				/* cost of loading fp registers
2088*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2089*38fd1498Szrj   {6, 6, 18},				/* cost of storing fp registers
2090*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2091*38fd1498Szrj   2,					/* cost of moving MMX register */
2092*38fd1498Szrj   {8, 8},				/* cost of loading MMX registers
2093*38fd1498Szrj 					   in SImode and DImode */
2094*38fd1498Szrj   {6, 6},				/* cost of storing MMX registers
2095*38fd1498Szrj 					   in SImode and DImode */
2096*38fd1498Szrj   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2097*38fd1498Szrj   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2098*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2099*38fd1498Szrj   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2100*38fd1498Szrj   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2101*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2102*38fd1498Szrj   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2103*38fd1498Szrj   8, 6,					/* SSE->integer and integer->SSE moves */
2104*38fd1498Szrj   8, 8,					/* Gather load static, per_elt.  */
2105*38fd1498Szrj   8, 8,					/* Gather store static, per_elt.  */
2106*38fd1498Szrj   32,					/* size of l1 cache.  */
2107*38fd1498Szrj   256,					/* size of l2 cache.  */
2108*38fd1498Szrj   64,					/* size of prefetch block */
2109*38fd1498Szrj   6,					/* number of parallel prefetches */
2110*38fd1498Szrj   3,					/* Branch cost */
2111*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2112*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2113*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2114*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2115*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2116*38fd1498Szrj   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2117*38fd1498Szrj 
2118*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2119*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2120*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2121*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2122*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2123*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2124*38fd1498Szrj   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
2125*38fd1498Szrj   COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
2126*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
2127*38fd1498Szrj   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
2128*38fd1498Szrj   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2129*38fd1498Szrj   slm_memcpy,
2130*38fd1498Szrj   slm_memset,
2131*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2132*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2133*38fd1498Szrj };
2134*38fd1498Szrj 
/* memcpy expansion strategy table referenced by intel_cost below.
   The values are currently identical to those of slm_memcpy.
   Two entries, each of the form {ALG, {{MAX, ALG, FLAG}, ...}}; every
   inner list is ordered by increasing MAX and ends with a MAX of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2135*38fd1498Szrj static stringop_algs intel_memcpy[2] = {
2136*38fd1498Szrj   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2137*38fd1498Szrj   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2138*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table referenced by intel_cost below.
   The values are currently identical to those of slm_memset.
   Two entries, each of the form {ALG, {{MAX, ALG, FLAG}, ...}}; every
   inner list is ordered by increasing MAX and ends with a MAX of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2139*38fd1498Szrj static stringop_algs intel_memset[2] = {
2140*38fd1498Szrj   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2141*38fd1498Szrj              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2142*38fd1498Szrj   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2143*38fd1498Szrj              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost.
   This is a positional initializer for struct processor_costs, so the
   order of the entries must match the field order of that struct; each
   entry carries a trailing comment naming what it describes.
   COSTS_N_INSNS (N) entries are expressed relative to an add (the top
   of this file assumes COSTS_N_INSNS (N) is (N)*4); the bare integers
   in the move-cost section use the "latency*2" scale described in the
   comment inside the table.  lea is costed at COSTS_N_INSNS (1) + 1,
   i.e. marginally above an add.  */
2144*38fd1498Szrj static const
2145*38fd1498Szrj struct processor_costs intel_cost = {
2146*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
2147*38fd1498Szrj   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2148*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
2149*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
2150*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2151*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 HI */
2152*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
2153*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
2154*38fd1498Szrj    COSTS_N_INSNS (2)},			/*			      other */
2155*38fd1498Szrj   0,					/* cost of multiply per each bit set */
2156*38fd1498Szrj   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2157*38fd1498Szrj    COSTS_N_INSNS (26),			/*			    HI */
2158*38fd1498Szrj    COSTS_N_INSNS (42),			/*			    SI */
2159*38fd1498Szrj    COSTS_N_INSNS (74),			/*			    DI */
2160*38fd1498Szrj    COSTS_N_INSNS (74)},			/*			    other */
2161*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
2162*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
2163*38fd1498Szrj   8,					/* "large" insn */
2164*38fd1498Szrj   17,					/* MOVE_RATIO */
2165*38fd1498Szrj 
2166*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
2167*38fd1498Szrj      they are latency*2. */
2168*38fd1498Szrj   6,				     /* cost for loading QImode using movzbl */
2169*38fd1498Szrj   {4, 4, 4},				/* cost of loading integer registers
2170*38fd1498Szrj 					   in QImode, HImode and SImode.
2171*38fd1498Szrj 					   Relative to reg-reg move (2).  */
2172*38fd1498Szrj   {6, 6, 6},				/* cost of storing integer registers */
2173*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
2174*38fd1498Szrj   {6, 6, 8},				/* cost of loading fp registers
2175*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2176*38fd1498Szrj   {6, 6, 10},				/* cost of storing fp registers
2177*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2178*38fd1498Szrj   2,					/* cost of moving MMX register */
2179*38fd1498Szrj   {6, 6},				/* cost of loading MMX registers
2180*38fd1498Szrj 					   in SImode and DImode */
2181*38fd1498Szrj   {6, 6},				/* cost of storing MMX registers
2182*38fd1498Szrj 					   in SImode and DImode */
2183*38fd1498Szrj   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
2184*38fd1498Szrj   {6, 6, 6, 6, 6},			/* cost of loading SSE registers
2185*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2186*38fd1498Szrj   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
2187*38fd1498Szrj   {6, 6, 6, 6, 6},			/* cost of storing SSE registers
2188*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2189*38fd1498Szrj   {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
2190*38fd1498Szrj   4, 4,					/* SSE->integer and integer->SSE moves */
2191*38fd1498Szrj   6, 6,					/* Gather load static, per_elt.  */
2192*38fd1498Szrj   6, 6,					/* Gather store static, per_elt.  */
2193*38fd1498Szrj   32,					/* size of l1 cache.  */
2194*38fd1498Szrj   256,					/* size of l2 cache.  */
2195*38fd1498Szrj   64,					/* size of prefetch block */
2196*38fd1498Szrj   6,					/* number of parallel prefetches */
2197*38fd1498Szrj   3,					/* Branch cost */
2198*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2199*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2200*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2201*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2202*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2203*38fd1498Szrj   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2204*38fd1498Szrj 
2205*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of cheap SSE instruction.  */
2206*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2207*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
2208*38fd1498Szrj   COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
2209*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2210*38fd1498Szrj   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2211*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
2212*38fd1498Szrj   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
2213*38fd1498Szrj   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
2214*38fd1498Szrj   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
2215*38fd1498Szrj   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2216*38fd1498Szrj   intel_memcpy,
2217*38fd1498Szrj   intel_memset,
2218*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2219*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2220*38fd1498Szrj };
2221*38fd1498Szrj 
2222*38fd1498Szrj /* Generic should produce code tuned for Core-i7 (and newer chips)
2223*38fd1498Szrj    and btver1 (and newer chips).  */
2224*38fd1498Szrj 
/* memcpy expansion strategy table referenced by generic_cost below.
   Two entries, each of the form {ALG, {{MAX, ALG, FLAG}, ...}}; every
   inner list is ordered by increasing MAX and ends with a MAX of -1.
   The two entries differ only in using rep_prefix_4_byte versus
   rep_prefix_8_byte for the middle range.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2225*38fd1498Szrj static stringop_algs generic_memcpy[2] = {
2226*38fd1498Szrj   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2227*38fd1498Szrj              {-1, libcall, false}}},
2228*38fd1498Szrj   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2229*38fd1498Szrj              {-1, libcall, false}}}};
/* memset expansion strategy table referenced by generic_cost below.
   The values are currently identical to those of generic_memcpy.
   Two entries, each of the form {ALG, {{MAX, ALG, FLAG}, ...}}; every
   inner list is ordered by increasing MAX and ends with a MAX of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2230*38fd1498Szrj static stringop_algs generic_memset[2] = {
2231*38fd1498Szrj   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2232*38fd1498Szrj              {-1, libcall, false}}},
2233*38fd1498Szrj   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2234*38fd1498Szrj              {-1, libcall, false}}}};
/* Cost table generic_cost.
   This is a positional initializer for struct processor_costs, so the
   order of the entries must match the field order of that struct; each
   entry carries a trailing comment naming what it describes.
   COSTS_N_INSNS (N) entries are expressed relative to an add (the top
   of this file assumes COSTS_N_INSNS (N) is (N)*4); the bare integers
   in the move-cost section use the "latency*2" scale described in the
   comment inside the table.  */
2235*38fd1498Szrj static const
2236*38fd1498Szrj struct processor_costs generic_cost = {
2237*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
2238*38fd1498Szrj   /* Setting cost to 2 makes our current implementation of synth_mult result in
2239*38fd1498Szrj      use of unnecessary temporary registers causing regression on several
2240*38fd1498Szrj      SPECfp benchmarks.  */
2241*38fd1498Szrj   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2242*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
2243*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
2244*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2245*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
2246*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
2247*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 DI */
2248*38fd1498Szrj    COSTS_N_INSNS (4)},			/*			      other */
2249*38fd1498Szrj   0,					/* cost of multiply per each bit set */
2250*38fd1498Szrj   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
2251*38fd1498Szrj    COSTS_N_INSNS (22),			/*			    HI */
2252*38fd1498Szrj    COSTS_N_INSNS (30),			/*			    SI */
2253*38fd1498Szrj    COSTS_N_INSNS (74),			/*			    DI */
2254*38fd1498Szrj    COSTS_N_INSNS (74)},			/*			    other */
2255*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
2256*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
2257*38fd1498Szrj   8,					/* "large" insn */
2258*38fd1498Szrj   17,					/* MOVE_RATIO */
2259*38fd1498Szrj 
2260*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
2261*38fd1498Szrj      they are latency*2. */
2262*38fd1498Szrj   6,				     /* cost for loading QImode using movzbl */
2263*38fd1498Szrj   {6, 6, 6},				/* cost of loading integer registers
2264*38fd1498Szrj 					   in QImode, HImode and SImode.
2265*38fd1498Szrj 					   Relative to reg-reg move (2).  */
2266*38fd1498Szrj   {6, 6, 6},				/* cost of storing integer registers */
2267*38fd1498Szrj   4,					/* cost of reg,reg fld/fst */
2268*38fd1498Szrj   {6, 6, 12},				/* cost of loading fp registers
2269*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2270*38fd1498Szrj   {6, 6, 12},				/* cost of storing fp registers
2271*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2272*38fd1498Szrj   2,					/* cost of moving MMX register */
2273*38fd1498Szrj   {6, 6},				/* cost of loading MMX registers
2274*38fd1498Szrj 					   in SImode and DImode */
2275*38fd1498Szrj   {6, 6},				/* cost of storing MMX registers
2276*38fd1498Szrj 					   in SImode and DImode */
2277*38fd1498Szrj   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2278*38fd1498Szrj   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
2279*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2280*38fd1498Szrj   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
2281*38fd1498Szrj   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
2282*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2283*38fd1498Szrj   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
2284*38fd1498Szrj   6, 6,					/* SSE->integer and integer->SSE moves */
2285*38fd1498Szrj   18, 6,				/* Gather load static, per_elt.  */
2286*38fd1498Szrj   18, 6,				/* Gather store static, per_elt.  */
2287*38fd1498Szrj   32,					/* size of l1 cache.  */
2288*38fd1498Szrj   512,					/* size of l2 cache.  */
2289*38fd1498Szrj   64,					/* size of prefetch block */
2290*38fd1498Szrj   6,					/* number of parallel prefetches */
2291*38fd1498Szrj   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2292*38fd1498Szrj      value is increased to perhaps more appropriate value of 5.  */
2293*38fd1498Szrj   3,					/* Branch cost */
2294*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2295*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2296*38fd1498Szrj   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
2297*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2298*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2299*38fd1498Szrj   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
2300*38fd1498Szrj 
2301*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2302*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2303*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2304*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2305*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2306*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2307*38fd1498Szrj   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2308*38fd1498Szrj   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2309*38fd1498Szrj   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2310*38fd1498Szrj   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2311*38fd1498Szrj   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
2312*38fd1498Szrj   generic_memcpy,
2313*38fd1498Szrj   generic_memset,
2314*38fd1498Szrj   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
2315*38fd1498Szrj   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
2316*38fd1498Szrj };
2317*38fd1498Szrj 
2318*38fd1498Szrj /* core_cost should produce code tuned for Core family of CPUs.  */
/* memcpy expansion strategy table referenced by core_cost below.
   Two entries, each of the form {ALG, {{MAX, ALG, FLAG}, ...}}; every
   inner list is ordered by increasing MAX and ends with a MAX of -1.
   Every size-bounded strategy here sets FLAG to true; the final
   catch-all libcall entries leave it false.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code, and FLAG is a "no alignment prologue" switch --
   confirm against the declaration of stringop_algs.  */
2319*38fd1498Szrj static stringop_algs core_memcpy[2] = {
2320*38fd1498Szrj   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2321*38fd1498Szrj   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2322*38fd1498Szrj              {-1, libcall, false}}}};
/* memset expansion strategy table referenced by core_cost below.
   Two entries, each of the form {ALG, {{MAX, ALG, FLAG}, ...}}; every
   inner list is ordered by increasing MAX and ends with a MAX of -1.
   Every size-bounded strategy here sets FLAG to true; the final
   catch-all libcall entries leave it false.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code, and FLAG is a "no alignment prologue" switch --
   confirm against the declaration of stringop_algs.  */
2323*38fd1498Szrj static stringop_algs core_memset[2] = {
2324*38fd1498Szrj   {libcall, {{6, loop_1_byte, true},
2325*38fd1498Szrj              {24, loop, true},
2326*38fd1498Szrj              {8192, rep_prefix_4_byte, true},
2327*38fd1498Szrj              {-1, libcall, false}}},
2328*38fd1498Szrj   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2329*38fd1498Szrj              {-1, libcall, false}}}};
2330*38fd1498Szrj 
/* Cost table core_cost.
   This is a positional initializer for struct processor_costs, so the
   order of the entries must match the field order of that struct; each
   entry carries a trailing comment naming what it describes.
   COSTS_N_INSNS (N) entries are expressed relative to an add (the top
   of this file assumes COSTS_N_INSNS (N) is (N)*4); the bare integers
   in the move-cost section use the "latency*2" scale described in the
   comment inside the table.  */
2331*38fd1498Szrj static const
2332*38fd1498Szrj struct processor_costs core_cost = {
2333*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of an add instruction */
2334*38fd1498Szrj   /* On all chips taken into consideration lea is 2 cycles and more.  With
2335*38fd1498Szrj      this cost however our current implementation of synth_mult results in
2336*38fd1498Szrj      use of unnecessary temporary registers causing regression on several
2337*38fd1498Szrj      SPECfp benchmarks.  */
2338*38fd1498Szrj   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2339*38fd1498Szrj   COSTS_N_INSNS (1),			/* variable shift costs */
2340*38fd1498Szrj   COSTS_N_INSNS (1),			/* constant shift costs */
2341*38fd1498Szrj   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2342*38fd1498Szrj    COSTS_N_INSNS (4),			/*				 HI */
2343*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 SI */
2344*38fd1498Szrj    /* Here we tune for Sandybridge or newer.  */
2345*38fd1498Szrj    COSTS_N_INSNS (3),			/*				 DI */
2346*38fd1498Szrj    COSTS_N_INSNS (3)},			/*			      other */
2347*38fd1498Szrj   0,					/* cost of multiply per each bit set */
2348*38fd1498Szrj   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2349*38fd1498Szrj      model is not realistic. We compensate by increasing the latencies a bit.  */
2350*38fd1498Szrj   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
2351*38fd1498Szrj    COSTS_N_INSNS (11),			/*			    HI */
2352*38fd1498Szrj    COSTS_N_INSNS (14),			/*			    SI */
2353*38fd1498Szrj    COSTS_N_INSNS (81),			/*			    DI */
2354*38fd1498Szrj    COSTS_N_INSNS (81)},			/*			    other */
2355*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movsx */
2356*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of movzx */
2357*38fd1498Szrj   8,					/* "large" insn */
2358*38fd1498Szrj   17,					/* MOVE_RATIO */
2359*38fd1498Szrj 
2360*38fd1498Szrj   /* All move costs are relative to integer->integer move times 2 and thus
2361*38fd1498Szrj      they are latency*2. */
2362*38fd1498Szrj   6,				     /* cost for loading QImode using movzbl */
2363*38fd1498Szrj   {4, 4, 4},				/* cost of loading integer registers
2364*38fd1498Szrj 					   in QImode, HImode and SImode.
2365*38fd1498Szrj 					   Relative to reg-reg move (2).  */
2366*38fd1498Szrj   {6, 6, 6},				/* cost of storing integer registers */
2367*38fd1498Szrj   2,					/* cost of reg,reg fld/fst */
2368*38fd1498Szrj   {6, 6, 8},				/* cost of loading fp registers
2369*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2370*38fd1498Szrj   {6, 6, 10},				/* cost of storing fp registers
2371*38fd1498Szrj 					   in SFmode, DFmode and XFmode */
2372*38fd1498Szrj   2,					/* cost of moving MMX register */
2373*38fd1498Szrj   {6, 6},				/* cost of loading MMX registers
2374*38fd1498Szrj 					   in SImode and DImode */
2375*38fd1498Szrj   {6, 6},				/* cost of storing MMX registers
2376*38fd1498Szrj 					   in SImode and DImode */
2377*38fd1498Szrj   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2378*38fd1498Szrj   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
2379*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2380*38fd1498Szrj   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
2381*38fd1498Szrj   {6, 6, 6, 6, 12},			/* cost of storing SSE registers
2382*38fd1498Szrj 					   in 32,64,128,256 and 512-bit */
2383*38fd1498Szrj   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
2384*38fd1498Szrj   2, 2,					/* SSE->integer and integer->SSE moves */
2385*38fd1498Szrj   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2386*38fd1498Szrj      rec. throughput 6.
2387*38fd1498Szrj      So 5 uops statically and one uop per load.
     NOTE(review): the same mnemonic is named twice above with different
     figures; one of the two is presumably a different gather form --
     confirm which before relying on these numbers.  */
2388*38fd1498Szrj   10, 6,				/* Gather load static, per_elt.  */
2389*38fd1498Szrj   10, 6,				/* Gather store static, per_elt.  */
2390*38fd1498Szrj   64,					/* size of l1 cache.  */
2391*38fd1498Szrj   512,					/* size of l2 cache.  */
2392*38fd1498Szrj   64,					/* size of prefetch block */
2393*38fd1498Szrj   6,					/* number of parallel prefetches */
2394*38fd1498Szrj   /* FIXME perhaps more appropriate value is 5.  */
2395*38fd1498Szrj   3,					/* Branch cost */
2396*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2397*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2398*38fd1498Szrj   /* 10-24 */
2399*38fd1498Szrj   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
2400*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2401*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2402*38fd1498Szrj   COSTS_N_INSNS (23),			/* cost of FSQRT instruction.  */
2403*38fd1498Szrj 
2404*38fd1498Szrj   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2405*38fd1498Szrj   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2406*38fd1498Szrj   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2407*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2408*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2409*38fd1498Szrj   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2410*38fd1498Szrj   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
2411*38fd1498Szrj   COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
2412*38fd1498Szrj   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
2413*38fd1498Szrj   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
2414*38fd1498Szrj   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2415*38fd1498Szrj   core_memcpy,
2416*38fd1498Szrj   core_memset,
2417*38fd1498Szrj   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2418*38fd1498Szrj   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2419*38fd1498Szrj };
2420*38fd1498Szrj 
2421