xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/i386/x86-tune-costs.h (revision 4c3eb207d36f67d31994830c0a694161fc1ca39b)
1 /* Costs of operations of individual x86 CPUs.
2    Copyright (C) 1988-2020 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10 
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19 
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 <http://www.gnu.org/licenses/>.  */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
/* COSTS_N_BYTES (N) expresses a cost as an instruction size of N bytes,
   scaled by 2.  Under the assumption stated above, a 2-byte add costs
   COSTS_N_BYTES (2) == 4, the same value as COSTS_N_INSNS (1).  It is used
   by the size-tuning table ix86_size_cost.  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27 
/* Placeholder stringop_algs initializer that names libcall in every
   algorithm slot.  The per-CPU memcpy/memset tables use it as their second
   array element.
   NOTE(review): -1 is presumably the "no upper size bound" marker; confirm
   against the stringop_algs declaration, which is not visible here.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29 
/* memcpy strategy table referenced by ix86_size_cost.  Both array elements
   name rep_prefix_1_byte in every algorithm slot.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants and -1 an unbounded size limit; confirm against the
   stringop_algs declaration and its users, which are not visible here.  */
30 static stringop_algs ix86_size_memcpy[2] = {
31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost.  Identical in content
   to ix86_size_memcpy: both array elements name rep_prefix_1_byte in every
   algorithm slot.
   NOTE(review): element meaning (presumably 32-bit / 64-bit) and the -1
   size marker should be confirmed against the stringop_algs declaration.  */
33 static stringop_algs ix86_size_memset[2] = {
34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36 
/* Cost table used when tuning for size.  Instruction costs are written with
   COSTS_N_BYTES, i.e. they are based on instruction size; every x87 and SSE
   arithmetic entry is COSTS_N_BYTES (2).  Cache sizes and prefetch
   parameters are 0 and all four alignment entries are NULL.  Unlike the
   per-CPU tables that follow, this one is not declared static.  The
   initializer is positional, so entry order must match the field order of
   struct processor_costs (declared elsewhere).  */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39   {
40   /* Start of register allocator costs.  integer->integer move cost is 2. */
41   2,				     /* cost for loading QImode using movzbl */
42   {2, 2, 2},				/* cost of loading integer registers
43 					   in QImode, HImode and SImode.
44 					   Relative to reg-reg move (2).  */
45   {2, 2, 2},				/* cost of storing integer registers */
46   2,					/* cost of reg,reg fld/fst */
47   {2, 2, 2},				/* cost of loading fp registers
48 					   in SFmode, DFmode and XFmode */
49   {2, 2, 2},				/* cost of storing fp registers
50 					   in SFmode, DFmode and XFmode */
51   3,					/* cost of moving MMX register */
52   {3, 3},				/* cost of loading MMX registers
53 					   in SImode and DImode */
54   {3, 3},				/* cost of storing MMX registers
55 					   in SImode and DImode */
56   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
57   {3, 3, 3, 3, 3},			/* cost of loading SSE registers
58 					   in 32,64,128,256 and 512-bit */
59   {3, 3, 3, 3, 3},			/* cost of storing SSE registers
60 					   in 32,64,128,256 and 512-bit */
61   3, 3,					/* SSE->integer and integer->SSE moves */
62   /* End of register allocator costs.  */
63   },
64 
65   COSTS_N_BYTES (2),			/* cost of an add instruction */
66   COSTS_N_BYTES (3),			/* cost of a lea instruction */
67   COSTS_N_BYTES (2),			/* variable shift costs */
68   COSTS_N_BYTES (3),			/* constant shift costs */
69   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
70    COSTS_N_BYTES (3),			/*				 HI */
71    COSTS_N_BYTES (3),			/*				 SI */
72    COSTS_N_BYTES (3),			/*				 DI */
73    COSTS_N_BYTES (5)},			/*			      other */
74   0,					/* cost of multiply per each bit set */
75   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
76    COSTS_N_BYTES (3),			/*			    HI */
77    COSTS_N_BYTES (3),			/*			    SI */
78    COSTS_N_BYTES (3),			/*			    DI */
79    COSTS_N_BYTES (5)},			/*			    other */
80   COSTS_N_BYTES (3),			/* cost of movsx */
81   COSTS_N_BYTES (3),			/* cost of movzx */
82   0,					/* "large" insn */
83   2,					/* MOVE_RATIO */
84   2,					/* CLEAR_RATIO */
85   {2, 2, 2},				/* cost of loading integer registers
86 					   in QImode, HImode and SImode.
87 					   Relative to reg-reg move (2).  */
88   {2, 2, 2},				/* cost of storing integer registers */
89   {3, 3, 3, 3, 3},			/* cost of loading SSE register
90 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
91   {3, 3, 3, 3, 3},			/* cost of storing SSE register
92 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
93   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
94 					   in 128bit, 256bit and 512bit.
					   NOTE(review): the row has five
					   entries, like the 32/64/128/256/
					   512-bit rows above, so the size
					   list in this comment looks stale.  */
95   {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
96 					   in 128bit, 256bit and 512bit.
					   NOTE(review): five entries here as
					   well; size list looks stale.  */
97   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
98   3,					/* cost of moving SSE register to integer.  */
99   5, 0,					/* Gather load static, per_elt.  */
100   5, 0,					/* Gather store static, per_elt.  */
101   0,					/* size of l1 cache  */
102   0,					/* size of l2 cache  */
103   0,					/* size of prefetch block */
104   0,					/* number of parallel prefetches */
105   2,					/* Branch cost */
106   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
107   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
108   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
109   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
110   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
111   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
112 
113   COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
114   COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
115   COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
116   COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
117   COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
118   COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
119   COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
120   COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
121   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
122   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
123   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
124   ix86_size_memcpy,
125   ix86_size_memset,
126   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
127   COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
128   NULL,					/* Loop alignment.  */
129   NULL,					/* Jump alignment.  */
130   NULL,					/* Label alignment.  */
131   NULL,					/* Func alignment.  */
132 };
133 
134 /* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost.  The first element names
   rep_prefix_1_byte in every algorithm slot; the second element is the
   libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants; confirm against the users of this table.  */
135 static stringop_algs i386_memcpy[2] = {
136   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
137   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost.  Same content as
   i386_memcpy: rep_prefix_1_byte in every slot of the first element, and
   the libcall-only DUMMY_STRINGOP_ALGS placeholder as the second element.  */
138 static stringop_algs i386_memset[2] = {
139   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140   DUMMY_STRINGOP_ALGS};
141 
/* Cost table for the 386.  Instruction costs are written with
   COSTS_N_INSNS; the register allocator costs in the first brace group are
   relative to an integer reg-reg move cost of 2 (see the comment inside).
   Cache sizes and prefetch parameters are 0.  Loop, jump and function
   alignment are "4"; label alignment is NULL.  The initializer is
   positional, so entry order must match the field order of struct
   processor_costs (declared elsewhere).  */
142 static const
143 struct processor_costs i386_cost = {	/* 386 specific costs */
144   {
145   /* Start of register allocator costs.  integer->integer move cost is 2. */
146   4,				     /* cost for loading QImode using movzbl */
147   {2, 4, 2},				/* cost of loading integer registers
148 					   in QImode, HImode and SImode.
149 					   Relative to reg-reg move (2).  */
150   {2, 4, 2},				/* cost of storing integer registers */
151   2,					/* cost of reg,reg fld/fst */
152   {8, 8, 8},				/* cost of loading fp registers
153 					   in SFmode, DFmode and XFmode */
154   {8, 8, 8},				/* cost of storing fp registers
155 					   in SFmode, DFmode and XFmode */
156   2,					/* cost of moving MMX register */
157   {4, 8},				/* cost of loading MMX registers
158 					   in SImode and DImode */
159   {4, 8},				/* cost of storing MMX registers
160 					   in SImode and DImode */
161   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
162   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
163 					   in 32,64,128,256 and 512-bit */
164   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
165 					   in 32,64,128,256 and 512-bit */
166   3, 3,					/* SSE->integer and integer->SSE moves */
167   /* End of register allocator costs.  */
168   },
169 
170   COSTS_N_INSNS (1),			/* cost of an add instruction */
171   COSTS_N_INSNS (1),			/* cost of a lea instruction */
172   COSTS_N_INSNS (3),			/* variable shift costs */
173   COSTS_N_INSNS (2),			/* constant shift costs */
174   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
175    COSTS_N_INSNS (6),			/*				 HI */
176    COSTS_N_INSNS (6),			/*				 SI */
177    COSTS_N_INSNS (6),			/*				 DI */
178    COSTS_N_INSNS (6)},			/*			      other */
179   COSTS_N_INSNS (1),			/* cost of multiply per each bit set.
					   NOTE(review): i486_cost gives a
					   bare 1 for this field and the other
					   tables give 0; confirm that the
					   COSTS_N_INSNS scaling is intended.  */
180   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
181    COSTS_N_INSNS (23),			/*			    HI */
182    COSTS_N_INSNS (23),			/*			    SI */
183    COSTS_N_INSNS (23),			/*			    DI */
184    COSTS_N_INSNS (23)},			/*			    other */
185   COSTS_N_INSNS (3),			/* cost of movsx */
186   COSTS_N_INSNS (2),			/* cost of movzx */
187   15,					/* "large" insn */
188   3,					/* MOVE_RATIO */
189   3,					/* CLEAR_RATIO */
190   {2, 4, 2},				/* cost of loading integer registers
191 					   in QImode, HImode and SImode.
192 					   Relative to reg-reg move (2).  */
193   {2, 4, 2},				/* cost of storing integer registers */
194   {4, 8, 16, 32, 64},			/* cost of loading SSE register
195 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
196   {4, 8, 16, 32, 64},			/* cost of storing SSE register
197 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
198   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
199   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
200   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
201   3,					/* cost of moving SSE register to integer.  */
202   4, 4,					/* Gather load static, per_elt.  */
203   4, 4,					/* Gather store static, per_elt.  */
204   0,					/* size of l1 cache  */
205   0,					/* size of l2 cache  */
206   0,					/* size of prefetch block */
207   0,					/* number of parallel prefetches */
208   1,					/* Branch cost */
209   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
210   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
211   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
212   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
213   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
214   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
215 
216   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
217   COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
218   COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
219   COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
220   COSTS_N_INSNS (27),			/* cost of FMA SS instruction.  */
221   COSTS_N_INSNS (27),			/* cost of FMA SD instruction.  */
222   COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
223   COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
224   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
225   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
226   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
227   i386_memcpy,
228   i386_memset,
229   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
230   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
231   "4",					/* Loop alignment.  */
232   "4",					/* Jump alignment.  */
233   NULL,					/* Label alignment.  */
234   "4",					/* Func alignment.  */
235 };
236 
/* memcpy strategy table referenced by i486_cost.  The first element names
   rep_prefix_4_byte in every algorithm slot; the second element is the
   libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants; confirm against the users of this table.  */
237 static stringop_algs i486_memcpy[2] = {
238   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
239   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost.  Same content as
   i486_memcpy: rep_prefix_4_byte in every slot of the first element, and
   the libcall-only DUMMY_STRINGOP_ALGS placeholder as the second element.  */
240 static stringop_algs i486_memset[2] = {
241   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
242   DUMMY_STRINGOP_ALGS};
243 
/* Cost table for the 486.  Instruction costs are written with
   COSTS_N_INSNS; the register allocator costs in the first brace group are
   relative to an integer reg-reg move cost of 2 (see the comment inside).
   Both cache size entries are 4 and the prefetch parameters are 0.  Loop,
   jump and function alignment are "16"; label alignment is "0:0:8".  The
   initializer is positional, so entry order must match the field order of
   struct processor_costs (declared elsewhere).  */
244 static const
245 struct processor_costs i486_cost = {	/* 486 specific costs */
246   {
247   /* Start of register allocator costs.  integer->integer move cost is 2. */
248   4,				     /* cost for loading QImode using movzbl */
249   {2, 4, 2},				/* cost of loading integer registers
250 					   in QImode, HImode and SImode.
251 					   Relative to reg-reg move (2).  */
252   {2, 4, 2},				/* cost of storing integer registers */
253   2,					/* cost of reg,reg fld/fst */
254   {8, 8, 8},				/* cost of loading fp registers
255 					   in SFmode, DFmode and XFmode */
256   {8, 8, 8},				/* cost of storing fp registers
257 					   in SFmode, DFmode and XFmode */
258   2,					/* cost of moving MMX register */
259   {4, 8},				/* cost of loading MMX registers
260 					   in SImode and DImode */
261   {4, 8},				/* cost of storing MMX registers
262 					   in SImode and DImode */
263   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
264   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
265 					   in 32,64,128,256 and 512-bit */
266   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
267 					   in 32,64,128,256 and 512-bit */
268   3, 3,					/* SSE->integer and integer->SSE moves */
269   /* End of register allocator costs.  */
270   },
271 
272   COSTS_N_INSNS (1),			/* cost of an add instruction */
273   COSTS_N_INSNS (1),			/* cost of a lea instruction */
274   COSTS_N_INSNS (3),			/* variable shift costs */
275   COSTS_N_INSNS (2),			/* constant shift costs */
276   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
277    COSTS_N_INSNS (12),			/*				 HI */
278    COSTS_N_INSNS (12),			/*				 SI */
279    COSTS_N_INSNS (12),			/*				 DI */
280    COSTS_N_INSNS (12)},			/*			      other */
281   1,					/* cost of multiply per each bit set */
282   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
283    COSTS_N_INSNS (40),			/*			    HI */
284    COSTS_N_INSNS (40),			/*			    SI */
285    COSTS_N_INSNS (40),			/*			    DI */
286    COSTS_N_INSNS (40)},			/*			    other */
287   COSTS_N_INSNS (3),			/* cost of movsx */
288   COSTS_N_INSNS (2),			/* cost of movzx */
289   15,					/* "large" insn */
290   3,					/* MOVE_RATIO */
291   3,					/* CLEAR_RATIO */
292   {2, 4, 2},				/* cost of loading integer registers
293 					   in QImode, HImode and SImode.
294 					   Relative to reg-reg move (2).  */
295   {2, 4, 2},				/* cost of storing integer registers */
296   {4, 8, 16, 32, 64},			/* cost of loading SSE register
297 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
298   {4, 8, 16, 32, 64},			/* cost of storing SSE register
299 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
300   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
301   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
302   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
303   3,					/* cost of moving SSE register to integer.  */
304   4, 4,					/* Gather load static, per_elt.  */
305   4, 4,					/* Gather store static, per_elt.  */
306   4,					/* size of l1 cache.  486 has 8kB cache
307 					   shared for code and data, so 4kB is
308 					   not really precise.  */
309   4,					/* size of l2 cache  */
310   0,					/* size of prefetch block */
311   0,					/* number of parallel prefetches */
312   1,					/* Branch cost */
313   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
314   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
315   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
316   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
317   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
318   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
319 
320   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
321   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
322   COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
323   COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
324   COSTS_N_INSNS (16),			/* cost of FMA SS instruction.  */
325   COSTS_N_INSNS (16),			/* cost of FMA SD instruction.  */
326   COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
327   COSTS_N_INSNS (74),			/* cost of DIVSD instruction.
					   NOTE(review): DIVSS and FDIV above
					   are both 73; confirm that 74 is
					   intended here.  */
328   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
329   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
330   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
331   i486_memcpy,
332   i486_memset,
333   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
334   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
335   "16",					/* Loop alignment.  */
336   "16",					/* Jump alignment.  */
337   "0:0:8",				/* Label alignment.  */
338   "16",					/* Func alignment.  */
339 };
340 
/* memcpy strategy table referenced by pentium_cost and lakemont_cost.  The
   first element has libcall as its leading algorithm, followed by the
   entries {256, rep_prefix_4_byte} and {-1, libcall}; the second element is
   the libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the numbers are presumably upper block-size bounds in
   bytes, with -1 meaning unbounded, and the leading algorithm the one for
   unknown sizes; confirm against the stringop_algs declaration.  */
341 static stringop_algs pentium_memcpy[2] = {
342   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
343   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium_cost and lakemont_cost.  The
   first element has libcall as its leading algorithm and a single entry
   {-1, rep_prefix_4_byte}; the second element is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): -1 is presumably the "no upper size bound" marker; confirm
   against the stringop_algs declaration.  */
344 static stringop_algs pentium_memset[2] = {
345   {libcall, {{-1, rep_prefix_4_byte, false}}},
346   DUMMY_STRINGOP_ALGS};
347 
/* Cost table for the Pentium.  Instruction costs are written with
   COSTS_N_INSNS; the register allocator costs in the first brace group are
   relative to an integer reg-reg move cost of 2 (see the comment inside).
   Both cache size entries are 8 and the prefetch parameters are 0.  Loop
   and jump alignment are "16:8:8", label alignment is "0:0:8" and function
   alignment is "16".  The initializer is positional, so entry order must
   match the field order of struct processor_costs (declared elsewhere).  */
348 static const
349 struct processor_costs pentium_cost = {
350   {
351   /* Start of register allocator costs.  integer->integer move cost is 2. */
352   6,				     /* cost for loading QImode using movzbl */
353   {2, 4, 2},				/* cost of loading integer registers
354 					   in QImode, HImode and SImode.
355 					   Relative to reg-reg move (2).  */
356   {2, 4, 2},				/* cost of storing integer registers */
357   2,					/* cost of reg,reg fld/fst */
358   {2, 2, 6},				/* cost of loading fp registers
359 					   in SFmode, DFmode and XFmode */
360   {4, 4, 6},				/* cost of storing fp registers
361 					   in SFmode, DFmode and XFmode */
362   8,					/* cost of moving MMX register */
363   {8, 8},				/* cost of loading MMX registers
364 					   in SImode and DImode */
365   {8, 8},				/* cost of storing MMX registers
366 					   in SImode and DImode */
367   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
368   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
369 					   in 32,64,128,256 and 512-bit */
370   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
371 					   in 32,64,128,256 and 512-bit */
372   3, 3,					/* SSE->integer and integer->SSE moves */
373   /* End of register allocator costs.  */
374   },
375 
376   COSTS_N_INSNS (1),			/* cost of an add instruction */
377   COSTS_N_INSNS (1),			/* cost of a lea instruction */
378   COSTS_N_INSNS (4),			/* variable shift costs */
379   COSTS_N_INSNS (1),			/* constant shift costs */
380   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
381    COSTS_N_INSNS (11),			/*				 HI */
382    COSTS_N_INSNS (11),			/*				 SI */
383    COSTS_N_INSNS (11),			/*				 DI */
384    COSTS_N_INSNS (11)},			/*			      other */
385   0,					/* cost of multiply per each bit set */
386   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
387    COSTS_N_INSNS (25),			/*			    HI */
388    COSTS_N_INSNS (25),			/*			    SI */
389    COSTS_N_INSNS (25),			/*			    DI */
390    COSTS_N_INSNS (25)},			/*			    other */
391   COSTS_N_INSNS (3),			/* cost of movsx */
392   COSTS_N_INSNS (2),			/* cost of movzx */
393   8,					/* "large" insn */
394   6,					/* MOVE_RATIO */
395   6,					/* CLEAR_RATIO */
396   {2, 4, 2},				/* cost of loading integer registers
397 					   in QImode, HImode and SImode.
398 					   Relative to reg-reg move (2).  */
399   {2, 4, 2},				/* cost of storing integer registers */
400   {4, 8, 16, 32, 64},			/* cost of loading SSE register
401 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
402   {4, 8, 16, 32, 64},			/* cost of storing SSE register
403 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
404   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
405   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
406   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
407   3,					/* cost of moving SSE register to integer.  */
408   4, 4,					/* Gather load static, per_elt.  */
409   4, 4,					/* Gather store static, per_elt.  */
410   8,					/* size of l1 cache.  */
411   8,					/* size of l2 cache  */
412   0,					/* size of prefetch block */
413   0,					/* number of parallel prefetches */
414   2,					/* Branch cost */
415   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
416   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
417   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
418   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
419   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
420   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
421 
422   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
423   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
424   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
425   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
426   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
427   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
428   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
429   COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
430   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
431   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
432   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
433   pentium_memcpy,
434   pentium_memset,
435   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
436   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
437   "16:8:8",				/* Loop alignment.  */
438   "16:8:8",				/* Jump alignment.  */
439   "0:0:8",				/* Label alignment.  */
440   "16",					/* Func alignment.  */
441 };
442 
/* Cost table for Lakemont.  Instruction costs are written with
   COSTS_N_INSNS; the register allocator costs in the first brace group are
   relative to an integer reg-reg move cost of 2 (see the comment inside).
   This table has no stringop tables of its own: it references
   pentium_memcpy and pentium_memset.  Both cache size entries are 8 and the
   prefetch parameters are 0.  The initializer is positional, so entry order
   must match the field order of struct processor_costs (declared
   elsewhere).  */
443 static const
444 struct processor_costs lakemont_cost = {
445   {
446   /* Start of register allocator costs.  integer->integer move cost is 2. */
447   6,				     /* cost for loading QImode using movzbl */
448   {2, 4, 2},				/* cost of loading integer registers
449 					   in QImode, HImode and SImode.
450 					   Relative to reg-reg move (2).  */
451   {2, 4, 2},				/* cost of storing integer registers */
452   2,					/* cost of reg,reg fld/fst */
453   {2, 2, 6},				/* cost of loading fp registers
454 					   in SFmode, DFmode and XFmode */
455   {4, 4, 6},				/* cost of storing fp registers
456 					   in SFmode, DFmode and XFmode */
457   8,					/* cost of moving MMX register */
458   {8, 8},				/* cost of loading MMX registers
459 					   in SImode and DImode */
460   {8, 8},				/* cost of storing MMX registers
461 					   in SImode and DImode */
462   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
463   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
464 					   in 32,64,128,256 and 512-bit */
465   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
466 					   in 32,64,128,256 and 512-bit */
467   3, 3,					/* SSE->integer and integer->SSE moves */
468   /* End of register allocator costs.  */
469   },
470 
471   COSTS_N_INSNS (1),			/* cost of an add instruction */
472   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction; one unit
					   above the cost of an add.  */
473   COSTS_N_INSNS (1),			/* variable shift costs */
474   COSTS_N_INSNS (1),			/* constant shift costs */
475   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
476    COSTS_N_INSNS (11),			/*				 HI */
477    COSTS_N_INSNS (11),			/*				 SI */
478    COSTS_N_INSNS (11),			/*				 DI */
479    COSTS_N_INSNS (11)},			/*			      other */
480   0,					/* cost of multiply per each bit set */
481   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
482    COSTS_N_INSNS (25),			/*			    HI */
483    COSTS_N_INSNS (25),			/*			    SI */
484    COSTS_N_INSNS (25),			/*			    DI */
485    COSTS_N_INSNS (25)},			/*			    other */
486   COSTS_N_INSNS (3),			/* cost of movsx */
487   COSTS_N_INSNS (2),			/* cost of movzx */
488   8,					/* "large" insn */
489   17,					/* MOVE_RATIO */
490   6,					/* CLEAR_RATIO */
491   {2, 4, 2},				/* cost of loading integer registers
492 					   in QImode, HImode and SImode.
493 					   Relative to reg-reg move (2).  */
494   {2, 4, 2},				/* cost of storing integer registers */
495   {4, 8, 16, 32, 64},			/* cost of loading SSE register
496 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
497   {4, 8, 16, 32, 64},			/* cost of storing SSE register
498 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
499   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
500   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
501   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
502   3,					/* cost of moving SSE register to integer.  */
503   4, 4,					/* Gather load static, per_elt.  */
504   4, 4,					/* Gather store static, per_elt.  */
505   8,					/* size of l1 cache.  */
506   8,					/* size of l2 cache  */
507   0,					/* size of prefetch block */
508   0,					/* number of parallel prefetches */
509   2,					/* Branch cost */
510   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
511   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
512   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
513   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
514   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
515   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
516 
517   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
518   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
519   COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
520   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
521   COSTS_N_INSNS (10),			/* cost of FMA SS instruction.  */
522   COSTS_N_INSNS (10),			/* cost of FMA SD instruction.  */
523   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
524   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
525   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
526   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
527   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
528   pentium_memcpy,
529   pentium_memset,
530   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
531   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
532   "16:8:8",				/* Loop alignment.  */
533   "16:8:8",				/* Jump alignment.  */
534   "0:0:8",				/* Label alignment.  */
535   "16",					/* Func alignment.  */
536 };
537 
538 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
539    (we ensure the alignment).  For small blocks an inline loop is still a
540    noticeable win; for bigger blocks either rep movsl or rep movsb is
541    the way to go.  Rep movsb apparently has a more expensive startup time in
542    the CPU, but after 4K the difference is down in the noise.
   The first element below lists, in order: loop up to 128, unrolled_loop
   up to 1024, rep_prefix_4_byte up to 8192, then rep_prefix_1_byte; the
   second element is the libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the table switches to rep_prefix_1_byte at 8192, not at
   the 4K mentioned above; confirm which is intended.  The numbers are
   presumably upper block-size bounds in bytes with -1 meaning unbounded;
   confirm against the stringop_algs declaration.  */
543 static stringop_algs pentiumpro_memcpy[2] = {
544   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
545                        {8192, rep_prefix_4_byte, false},
546                        {-1, rep_prefix_1_byte, false}}},
547   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  The first element
   lists, in order: unrolled_loop up to 1024, rep_prefix_4_byte up to 8192,
   then libcall; its leading algorithm is rep_prefix_4_byte.  The second
   element is the libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the numbers are presumably upper block-size bounds in
   bytes with -1 meaning unbounded; confirm against the stringop_algs
   declaration.  */
548 static stringop_algs pentiumpro_memset[2] = {
549   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
550                        {8192, rep_prefix_4_byte, false},
551                        {-1, libcall, false}}},
552   DUMMY_STRINGOP_ALGS};
/* Cost table for the PentiumPro.  Instruction costs are written with
   COSTS_N_INSNS; the register allocator costs in the first brace group are
   relative to an integer reg-reg move cost of 2 (see the comment inside).
   This is the first table in the file with non-zero prefetch parameters
   (block size 32, 6 parallel prefetches).  The initializer is positional,
   so entry order must match the field order of struct processor_costs
   (declared elsewhere).  */
553 static const
554 struct processor_costs pentiumpro_cost = {
555   {
556   /* Start of register allocator costs.  integer->integer move cost is 2. */
557   2,				     /* cost for loading QImode using movzbl */
558   {4, 4, 4},				/* cost of loading integer registers
559 					   in QImode, HImode and SImode.
560 					   Relative to reg-reg move (2).  */
561   {2, 2, 2},				/* cost of storing integer registers */
562   2,					/* cost of reg,reg fld/fst */
563   {2, 2, 6},				/* cost of loading fp registers
564 					   in SFmode, DFmode and XFmode */
565   {4, 4, 6},				/* cost of storing fp registers
566 					   in SFmode, DFmode and XFmode */
567   2,					/* cost of moving MMX register */
568   {2, 2},				/* cost of loading MMX registers
569 					   in SImode and DImode */
570   {2, 2},				/* cost of storing MMX registers
571 					   in SImode and DImode */
572   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
573   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
574 					   in 32,64,128,256 and 512-bit */
575   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
576 					   in 32,64,128,256 and 512-bit */
577   3, 3,					/* SSE->integer and integer->SSE moves */
578   /* End of register allocator costs.  */
579   },
580 
581   COSTS_N_INSNS (1),			/* cost of an add instruction */
582   COSTS_N_INSNS (1),			/* cost of a lea instruction */
583   COSTS_N_INSNS (1),			/* variable shift costs */
584   COSTS_N_INSNS (1),			/* constant shift costs */
585   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
586    COSTS_N_INSNS (4),			/*				 HI */
587    COSTS_N_INSNS (4),			/*				 SI */
588    COSTS_N_INSNS (4),			/*				 DI */
589    COSTS_N_INSNS (4)},			/*			      other */
590   0,					/* cost of multiply per each bit set */
591   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
592    COSTS_N_INSNS (17),			/*			    HI */
593    COSTS_N_INSNS (17),			/*			    SI */
594    COSTS_N_INSNS (17),			/*			    DI */
595    COSTS_N_INSNS (17)},			/*			    other */
596   COSTS_N_INSNS (1),			/* cost of movsx */
597   COSTS_N_INSNS (1),			/* cost of movzx */
598   8,					/* "large" insn */
599   6,					/* MOVE_RATIO */
600   6,					/* CLEAR_RATIO */
601   {4, 4, 4},				/* cost of loading integer registers
602 					   in QImode, HImode and SImode.
603 					   Relative to reg-reg move (2).  */
604   {2, 2, 2},				/* cost of storing integer registers */
605   {4, 8, 16, 32, 64},			/* cost of loading SSE register
606 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
607   {4, 8, 16, 32, 64},			/* cost of storing SSE register
608 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
609   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
610   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
611   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
612   3,					/* cost of moving SSE register to integer.  */
613   4, 4,					/* Gather load static, per_elt.  */
614   4, 4,					/* Gather store static, per_elt.  */
615   8,					/* size of l1 cache.  */
616   256,					/* size of l2 cache  */
617   32,					/* size of prefetch block */
618   6,					/* number of parallel prefetches */
619   2,					/* Branch cost */
620   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
621   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
622   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
623   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
624   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
625   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
626 
627   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
628   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
629   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
630   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
631   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
632   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
633   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
634   COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
635   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
636   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
637   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
638   pentiumpro_memcpy,
639   pentiumpro_memset,
640   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
641   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
642   "16",					/* Loop alignment.  */
643   "16:11:8",				/* Jump alignment.  */
644   "0:0:8",				/* Label alignment.  */
645   "16",					/* Func alignment.  */
646 };
647 
/* memcpy strategy table for the Geode cost table that follows.  The first
   element has libcall as its leading algorithm, followed by the entries
   {256, rep_prefix_4_byte} and {-1, libcall}; the second element is the
   libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the numbers are presumably upper block-size bounds in
   bytes with -1 meaning unbounded; confirm against the stringop_algs
   declaration.  */
648 static stringop_algs geode_memcpy[2] = {
649   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
650   DUMMY_STRINGOP_ALGS};
/* memset strategy table for the Geode cost table that follows.  Same
   content as geode_memcpy: libcall as the leading algorithm, then
   {256, rep_prefix_4_byte} and {-1, libcall}, with the libcall-only
   DUMMY_STRINGOP_ALGS placeholder as the second element.  */
651 static stringop_algs geode_memset[2] = {
652   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
653   DUMMY_STRINGOP_ALGS};
/* Operation cost table for Geode.  This is a positional initializer: the
   entries must stay in the field order of struct processor_costs, and each
   value is identified only by its trailing comment.  COSTS_N_INSNS values
   are instruction costs relative to an add (see the top of this file).  */
654 static const
655 struct processor_costs geode_cost = {
656   {
657   /* Start of register allocator costs.  integer->integer move cost is 2. */
658   2,				     /* cost for loading QImode using movzbl */
659   {2, 2, 2},				/* cost of loading integer registers
660 					   in QImode, HImode and SImode.
661 					   Relative to reg-reg move (2).  */
662   {2, 2, 2},				/* cost of storing integer registers */
663   2,					/* cost of reg,reg fld/fst */
664   {2, 2, 2},				/* cost of loading fp registers
665 					   in SFmode, DFmode and XFmode */
666   {4, 6, 6},				/* cost of storing fp registers
667 					   in SFmode, DFmode and XFmode */
668   2,					/* cost of moving MMX register */
669   {2, 2},				/* cost of loading MMX registers
670 					   in SImode and DImode */
671   {2, 2},				/* cost of storing MMX registers
672 					   in SImode and DImode */
673   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
674   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
675 					   in 32,64,128,256 and 512-bit */
676   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
677 					   in 32,64,128,256 and 512-bit */
678   6, 6,					/* SSE->integer and integer->SSE moves */
679   /* End of register allocator costs.  */
680   },
681 
682   COSTS_N_INSNS (1),			/* cost of an add instruction */
683   COSTS_N_INSNS (1),			/* cost of a lea instruction */
684   COSTS_N_INSNS (2),			/* variable shift costs */
685   COSTS_N_INSNS (1),			/* constant shift costs */
686   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
687    COSTS_N_INSNS (4),			/*				 HI */
688    COSTS_N_INSNS (7),			/*				 SI */
689    COSTS_N_INSNS (7),			/*				 DI */
690    COSTS_N_INSNS (7)},			/*			      other */
691   0,					/* cost of multiply per each bit set */
692   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
693    COSTS_N_INSNS (23),			/*			    HI */
694    COSTS_N_INSNS (39),			/*			    SI */
695    COSTS_N_INSNS (39),			/*			    DI */
696    COSTS_N_INSNS (39)},			/*			    other */
697   COSTS_N_INSNS (1),			/* cost of movsx */
698   COSTS_N_INSNS (1),			/* cost of movzx */
699   8,					/* "large" insn */
700   4,					/* MOVE_RATIO */
701   4,					/* CLEAR_RATIO */
      /* The integer and SSE load/store costs, the XMM/YMM/ZMM move costs and
         the SSE-to-integer move cost below repeat the register allocator
         values given at the start of this table.  */
702   {2, 2, 2},				/* cost of loading integer registers
703 					   in QImode, HImode and SImode.
704 					   Relative to reg-reg move (2).  */
705   {2, 2, 2},				/* cost of storing integer registers */
706   {2, 2, 8, 16, 32},			/* cost of loading SSE register
707 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
708   {2, 2, 8, 16, 32},			/* cost of storing SSE register
709 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
710   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
711   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
712   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
713   6,					/* cost of moving SSE register to integer.  */
714   2, 2,					/* Gather load static, per_elt.  */
715   2, 2,					/* Gather store static, per_elt.  */
716   64,					/* size of l1 cache.  */
717   128,					/* size of l2 cache.  */
718   32,					/* size of prefetch block */
719   1,					/* number of parallel prefetches */
720   1,					/* Branch cost */
721   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
722   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
723   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
724   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
725   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
726   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
727 
728   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
729   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
730   COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
731   COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
732   COSTS_N_INSNS (17),			/* cost of FMA SS instruction.  */
733   COSTS_N_INSNS (17),			/* cost of FMA SD instruction.  */
734   COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
735   COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
736   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
737   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
738   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
739   geode_memcpy,
740   geode_memset,
741   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
742   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
      /* Unlike the other tables in this file, no alignment strings are
         supplied here: all four entries are NULL.  */
743   NULL,					/* Loop alignment.  */
744   NULL,					/* Jump alignment.  */
745   NULL,					/* Label alignment.  */
746   NULL,					/* Func alignment.  */
747 };
748 
/* memcpy expansion strategies referenced by k6_cost.  Entry 0 pairs the
   bound 256 with rep_prefix_4_byte and the closing -1 entry with libcall;
   entry 1 is the DUMMY_STRINGOP_ALGS placeholder (libcall only).
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
749 static stringop_algs k6_memcpy[2] = {
750   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
751   DUMMY_STRINGOP_ALGS};
/* memset expansion strategies referenced by k6_cost; the contents are the
   same as k6_memcpy (256 -> rep_prefix_4_byte, -1 -> libcall, then the
   DUMMY_STRINGOP_ALGS placeholder).
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
752 static stringop_algs k6_memset[2] = {
753   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
754   DUMMY_STRINGOP_ALGS};
/* Operation cost table for K6.  This is a positional initializer: the
   entries must stay in the field order of struct processor_costs, and each
   value is identified only by its trailing comment.  COSTS_N_INSNS values
   are instruction costs relative to an add (see the top of this file).  */
755 static const
756 struct processor_costs k6_cost = {
757   {
758   /* Start of register allocator costs.  integer->integer move cost is 2. */
759   3,				     /* cost for loading QImode using movzbl */
760   {4, 5, 4},				/* cost of loading integer registers
761 					   in QImode, HImode and SImode.
762 					   Relative to reg-reg move (2).  */
763   {2, 3, 2},				/* cost of storing integer registers */
764   4,					/* cost of reg,reg fld/fst */
765   {6, 6, 6},				/* cost of loading fp registers
766 					   in SFmode, DFmode and XFmode */
767   {4, 4, 4},				/* cost of storing fp registers
768 					   in SFmode, DFmode and XFmode */
769   2,					/* cost of moving MMX register */
770   {2, 2},				/* cost of loading MMX registers
771 					   in SImode and DImode */
772   {2, 2},				/* cost of storing MMX registers
773 					   in SImode and DImode */
774   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
775   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
776 					   in 32,64,128,256 and 512-bit */
777   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
778 					   in 32,64,128,256 and 512-bit */
779   6, 6,					/* SSE->integer and integer->SSE moves */
780   /* End of register allocator costs.  */
781   },
782 
783   COSTS_N_INSNS (1),			/* cost of an add instruction */
784   COSTS_N_INSNS (2),			/* cost of a lea instruction */
785   COSTS_N_INSNS (1),			/* variable shift costs */
786   COSTS_N_INSNS (1),			/* constant shift costs */
787   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
788    COSTS_N_INSNS (3),			/*				 HI */
789    COSTS_N_INSNS (3),			/*				 SI */
790    COSTS_N_INSNS (3),			/*				 DI */
791    COSTS_N_INSNS (3)},			/*			      other */
792   0,					/* cost of multiply per each bit set */
793   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
794    COSTS_N_INSNS (18),			/*			    HI */
795    COSTS_N_INSNS (18),			/*			    SI */
796    COSTS_N_INSNS (18),			/*			    DI */
797    COSTS_N_INSNS (18)},			/*			    other */
798   COSTS_N_INSNS (2),			/* cost of movsx */
799   COSTS_N_INSNS (2),			/* cost of movzx */
800   8,					/* "large" insn */
801   4,					/* MOVE_RATIO */
802   4,					/* CLEAR_RATIO */
      /* The integer and SSE load/store costs, the XMM/YMM/ZMM move costs and
         the SSE-to-integer move cost below repeat the register allocator
         values given at the start of this table.  */
803   {4, 5, 4},				/* cost of loading integer registers
804 					   in QImode, HImode and SImode.
805 					   Relative to reg-reg move (2).  */
806   {2, 3, 2},				/* cost of storing integer registers */
807   {2, 2, 8, 16, 32},			/* cost of loading SSE register
808 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
809   {2, 2, 8, 16, 32},			/* cost of storing SSE register
810 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
811   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
812   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
813   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
814   6,					/* cost of moving SSE register to integer.  */
815   2, 2,					/* Gather load static, per_elt.  */
816   2, 2,					/* Gather store static, per_elt.  */
817   32,					/* size of l1 cache.  */
818   32,					/* size of l2 cache.  Some models
819 					   have integrated l2 cache, but
820 					   optimizing for k6 is not important
821 					   enough to worry about that.  */
822   32,					/* size of prefetch block */
823   1,					/* number of parallel prefetches */
824   1,					/* Branch cost */
825   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
826   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
827   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
828   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
829   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
830   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
831 
832   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
833   COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
834   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
835   COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
836   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
837   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
838   COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
839   COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
840   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
841   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
842   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
843   k6_memcpy,
844   k6_memset,
845   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
846   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
847   "32:8:8",				/* Loop alignment.  */
848   "32:8:8",				/* Jump alignment.  */
849   "0:0:8",				/* Label alignment.  */
850   "32",					/* Func alignment.  */
851 };
852 
853 /* For some reason, Athlon deals better with the REP prefix (relative to
854    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
855    and 128 bytes for memset.  */
/* memcpy expansion strategies referenced by athlon_cost.  Entry 0 pairs
   the bound 2048 with rep_prefix_4_byte and the closing -1 entry with
   libcall; entry 1 is the DUMMY_STRINGOP_ALGS placeholder (libcall only).
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
856 static stringop_algs athlon_memcpy[2] = {
857   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
858   DUMMY_STRINGOP_ALGS};
/* memset expansion strategies referenced by athlon_cost; the contents are
   the same as athlon_memcpy (2048 -> rep_prefix_4_byte, -1 -> libcall,
   then the DUMMY_STRINGOP_ALGS placeholder).
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
859 static stringop_algs athlon_memset[2] = {
860   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
861   DUMMY_STRINGOP_ALGS};
/* Operation cost table for Athlon.  This is a positional initializer: the
   entries must stay in the field order of struct processor_costs, and each
   value is identified only by its trailing comment.  COSTS_N_INSNS values
   are instruction costs relative to an add (see the top of this file).  */
862 static const
863 struct processor_costs athlon_cost = {
864   {
865   /* Start of register allocator costs.  integer->integer move cost is 2. */
866   4,				     /* cost for loading QImode using movzbl */
867   {3, 4, 3},				/* cost of loading integer registers
868 					   in QImode, HImode and SImode.
869 					   Relative to reg-reg move (2).  */
870   {3, 4, 3},				/* cost of storing integer registers */
871   4,					/* cost of reg,reg fld/fst */
872   {4, 4, 12},				/* cost of loading fp registers
873 					   in SFmode, DFmode and XFmode */
874   {6, 6, 8},				/* cost of storing fp registers
875 					   in SFmode, DFmode and XFmode */
876   2,					/* cost of moving MMX register */
877   {4, 4},				/* cost of loading MMX registers
878 					   in SImode and DImode */
879   {4, 4},				/* cost of storing MMX registers
880 					   in SImode and DImode */
881   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
882   {4, 4, 12, 12, 24},			/* cost of loading SSE registers
883 					   in 32,64,128,256 and 512-bit */
884   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
885 					   in 32,64,128,256 and 512-bit */
886   5, 5,					/* SSE->integer and integer->SSE moves */
887   /* End of register allocator costs.  */
888   },
889 
890   COSTS_N_INSNS (1),			/* cost of an add instruction */
891   COSTS_N_INSNS (2),			/* cost of a lea instruction */
892   COSTS_N_INSNS (1),			/* variable shift costs */
893   COSTS_N_INSNS (1),			/* constant shift costs */
894   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
895    COSTS_N_INSNS (5),			/*				 HI */
896    COSTS_N_INSNS (5),			/*				 SI */
897    COSTS_N_INSNS (5),			/*				 DI */
898    COSTS_N_INSNS (5)},			/*			      other */
899   0,					/* cost of multiply per each bit set */
900   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
901    COSTS_N_INSNS (26),			/*			    HI */
902    COSTS_N_INSNS (42),			/*			    SI */
903    COSTS_N_INSNS (74),			/*			    DI */
904    COSTS_N_INSNS (74)},			/*			    other */
905   COSTS_N_INSNS (1),			/* cost of movsx */
906   COSTS_N_INSNS (1),			/* cost of movzx */
907   8,					/* "large" insn */
908   9,					/* MOVE_RATIO */
909   6,					/* CLEAR_RATIO */
      /* The integer and SSE load/store costs, the XMM/YMM/ZMM move costs and
         the SSE-to-integer move cost below repeat the register allocator
         values given at the start of this table.  */
910   {3, 4, 3},				/* cost of loading integer registers
911 					   in QImode, HImode and SImode.
912 					   Relative to reg-reg move (2).  */
913   {3, 4, 3},				/* cost of storing integer registers */
914   {4, 4, 12, 12, 24},			/* cost of loading SSE register
915 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
916   {4, 4, 10, 10, 20},			/* cost of storing SSE register
917 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
918   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
919   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
920   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
921   5,					/* cost of moving SSE register to integer.  */
922   4, 4,					/* Gather load static, per_elt.  */
923   4, 4,					/* Gather store static, per_elt.  */
924   64,					/* size of l1 cache.  */
925   256,					/* size of l2 cache.  */
926   64,					/* size of prefetch block */
927   6,					/* number of parallel prefetches */
928   5,					/* Branch cost */
929   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
930   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
931   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
932   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
933   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
934   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
935 
936   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
937   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
938   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
939   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
940   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
941   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
942   /* 11-16  */
943   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
944   COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
945   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
946   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
947   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
948   athlon_memcpy,
949   athlon_memset,
950   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
951   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
952   "16:8:8",				/* Loop alignment.  */
953   "16:8:8",				/* Jump alignment.  */
954   "0:0:8",				/* Label alignment.  */
955   "16",					/* Func alignment.  */
956 };
957 
958 /* K8 has an optimized REP instruction for medium sized blocks, but for very
959    small blocks it is better to use a loop.  For large blocks, libcall can
960    do non-temporal accesses and beat inline expansion considerably.  */
/* memcpy expansion strategies referenced by k8_cost.  Each entry lists
   (bound, algorithm) pairs in ascending bound order and closes with a -1
   pair.  Entry 0 closes with rep_prefix_4_byte rather than libcall.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
961 static stringop_algs k8_memcpy[2] = {
962   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
963              {-1, rep_prefix_4_byte, false}}},
964   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
965              {-1, libcall, false}}}};
/* memset expansion strategies referenced by k8_cost.  Each entry lists
   (bound, algorithm) pairs in ascending bound order and closes with a
   -1 -> libcall pair.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
966 static stringop_algs k8_memset[2] = {
967   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
968              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
969   {libcall, {{48, unrolled_loop, false},
970              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation cost table for K8.  This is a positional initializer: the
   entries must stay in the field order of struct processor_costs, and each
   value is identified only by its trailing comment.  COSTS_N_INSNS values
   are instruction costs relative to an add (see the top of this file).  */
971 static const
972 struct processor_costs k8_cost = {
973   {
974   /* Start of register allocator costs.  integer->integer move cost is 2. */
975   4,				     /* cost for loading QImode using movzbl */
976   {3, 4, 3},				/* cost of loading integer registers
977 					   in QImode, HImode and SImode.
978 					   Relative to reg-reg move (2).  */
979   {3, 4, 3},				/* cost of storing integer registers */
980   4,					/* cost of reg,reg fld/fst */
981   {4, 4, 12},				/* cost of loading fp registers
982 					   in SFmode, DFmode and XFmode */
983   {6, 6, 8},				/* cost of storing fp registers
984 					   in SFmode, DFmode and XFmode */
985   2,					/* cost of moving MMX register */
986   {3, 3},				/* cost of loading MMX registers
987 					   in SImode and DImode */
988   {4, 4},				/* cost of storing MMX registers
989 					   in SImode and DImode */
990   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
991   {4, 3, 12, 12, 24},			/* cost of loading SSE registers
992 					   in 32,64,128,256 and 512-bit */
993   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
994 					   in 32,64,128,256 and 512-bit */
995   5, 5,					/* SSE->integer and integer->SSE moves */
996   /* End of register allocator costs.  */
997   },
998 
999   COSTS_N_INSNS (1),			/* cost of an add instruction */
1000   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1001   COSTS_N_INSNS (1),			/* variable shift costs */
1002   COSTS_N_INSNS (1),			/* constant shift costs */
1003   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1004    COSTS_N_INSNS (4),			/*				 HI */
1005    COSTS_N_INSNS (3),			/*				 SI */
1006    COSTS_N_INSNS (4),			/*				 DI */
1007    COSTS_N_INSNS (5)},			/*			      other */
1008   0,					/* cost of multiply per each bit set */
1009   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1010    COSTS_N_INSNS (26),			/*			    HI */
1011    COSTS_N_INSNS (42),			/*			    SI */
1012    COSTS_N_INSNS (74),			/*			    DI */
1013    COSTS_N_INSNS (74)},			/*			    other */
1014   COSTS_N_INSNS (1),			/* cost of movsx */
1015   COSTS_N_INSNS (1),			/* cost of movzx */
1016   8,					/* "large" insn */
1017   9,					/* MOVE_RATIO */
1018   6,					/* CLEAR_RATIO */
       /* The integer and SSE load/store costs, the XMM/YMM/ZMM move costs and
          the SSE-to-integer move cost below repeat the register allocator
          values given at the start of this table.  */
1019   {3, 4, 3},				/* cost of loading integer registers
1020 					   in QImode, HImode and SImode.
1021 					   Relative to reg-reg move (2).  */
1022   {3, 4, 3},				/* cost of storing integer registers */
1023   {4, 3, 12, 12, 24},			/* cost of loading SSE register
1024 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1025   {4, 4, 10, 10, 20},			/* cost of storing SSE register
1026 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1027   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
1028   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
1029   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1030   5,					/* cost of moving SSE register to integer.  */
1031   4, 4,					/* Gather load static, per_elt.  */
1032   4, 4,					/* Gather store static, per_elt.  */
1033   64,					/* size of l1 cache.  */
1034   512,					/* size of l2 cache.  */
1035   64,					/* size of prefetch block */
1036   /* New AMD processors never drop prefetches; if they cannot be performed
1037      immediately, they are queued.  We set number of simultaneous prefetches
1038      to a large constant to reflect this (it probably is not a good idea not
1039      to limit number of prefetches at all, as their execution also takes some
1040      time).  */
1041   100,					/* number of parallel prefetches */
1042   3,					/* Branch cost */
1043   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1044   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1045   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1046   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1047   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1048   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1049 
1050   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1051   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1052   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1053   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1054   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
1055   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
1056   /* 11-16  */
1057   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
1058   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
1059   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1060   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
1061   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1062   k8_memcpy,
1063   k8_memset,
1064   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1065   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1066   "16:8:8",				/* Loop alignment.  */
1067   "16:8:8",				/* Jump alignment.  */
1068   "0:0:8",				/* Label alignment.  */
1069   "16",					/* Func alignment.  */
1070 };
1071 
1072 /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
1073    very small blocks it is better to use a loop.  For large blocks, libcall can
1074    do non-temporal accesses and beat inline expansion considerably.  */
/* memcpy expansion strategies referenced by amdfam10_cost; the contents
   are the same as k8_memcpy.  Entry 0 closes with rep_prefix_4_byte rather
   than libcall.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
1075 static stringop_algs amdfam10_memcpy[2] = {
1076   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1077              {-1, rep_prefix_4_byte, false}}},
1078   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1079              {-1, libcall, false}}}};
/* memset expansion strategies referenced by amdfam10_cost; the contents
   are the same as k8_memset.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
1080 static stringop_algs amdfam10_memset[2] = {
1081   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1082              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1083   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1084              {-1, libcall, false}}}};
/* Operation cost table for AMDFAM10.  This is a positional initializer:
   the entries must stay in the field order of struct processor_costs, and
   each value is identified only by its trailing comment.  COSTS_N_INSNS
   values are instruction costs relative to an add (see the top of this
   file).
   NOTE(review): unlike geode_cost, k6_cost, athlon_cost and k8_cost
   (static const) and bdver_cost (const), this table is declared without
   const or static -- confirm that a writable, externally visible object
   is intended.  */
1085 struct processor_costs amdfam10_cost = {
1086   {
1087   /* Start of register allocator costs.  integer->integer move cost is 2. */
1088   4,				     /* cost for loading QImode using movzbl */
1089   {3, 4, 3},				/* cost of loading integer registers
1090 					   in QImode, HImode and SImode.
1091 					   Relative to reg-reg move (2).  */
1092   {3, 4, 3},				/* cost of storing integer registers */
1093   4,					/* cost of reg,reg fld/fst */
1094   {4, 4, 12},				/* cost of loading fp registers
1095 		   			   in SFmode, DFmode and XFmode */
1096   {6, 6, 8},				/* cost of storing fp registers
1097  		   			   in SFmode, DFmode and XFmode */
1098   2,					/* cost of moving MMX register */
1099   {3, 3},				/* cost of loading MMX registers
1100 					   in SImode and DImode */
1101   {4, 4},				/* cost of storing MMX registers
1102 					   in SImode and DImode */
1103   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1104   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
1105 					   in 32,64,128,256 and 512-bit */
1106   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
1107 					   in 32,64,128,256 and 512-bit */
1108   3, 3,					/* SSE->integer and integer->SSE moves */
1109 
1110   					/* On K8:
1111   					    MOVD reg64, xmmreg Double FSTORE 4
1112 					    MOVD reg32, xmmreg Double FSTORE 4
1113 					   On AMDFAM10:
1114 					    MOVD reg64, xmmreg Double FADD 3
1115 							       1/1  1/1
1116 					    MOVD reg32, xmmreg Double FADD 3
1117 							       1/1  1/1 */
1118   /* End of register allocator costs.  */
1119   },
1120 
1121   COSTS_N_INSNS (1),			/* cost of an add instruction */
1122   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1123   COSTS_N_INSNS (1),			/* variable shift costs */
1124   COSTS_N_INSNS (1),			/* constant shift costs */
1125   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1126    COSTS_N_INSNS (4),			/*				 HI */
1127    COSTS_N_INSNS (3),			/*				 SI */
1128    COSTS_N_INSNS (4),			/*				 DI */
1129    COSTS_N_INSNS (5)},			/*			      other */
1130   0,					/* cost of multiply per each bit set */
1131   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1132    COSTS_N_INSNS (35),			/*			    HI */
1133    COSTS_N_INSNS (51),			/*			    SI */
1134    COSTS_N_INSNS (83),			/*			    DI */
1135    COSTS_N_INSNS (83)},			/*			    other */
1136   COSTS_N_INSNS (1),			/* cost of movsx */
1137   COSTS_N_INSNS (1),			/* cost of movzx */
1138   8,					/* "large" insn */
1139   9,					/* MOVE_RATIO */
1140   6,					/* CLEAR_RATIO */
       /* The integer and SSE load/store costs, the XMM/YMM/ZMM move costs and
          the SSE-to-integer move cost below repeat the register allocator
          values given at the start of this table.  */
1141   {3, 4, 3},				/* cost of loading integer registers
1142 					   in QImode, HImode and SImode.
1143 					   Relative to reg-reg move (2).  */
1144   {3, 4, 3},				/* cost of storing integer registers */
1145   {4, 4, 3, 6, 12},			/* cost of loading SSE register
1146 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1147   {4, 4, 5, 10, 20},			/* cost of storing SSE register
1148 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1149   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
       /* NOTE(review): the 256-bit unaligned load cost (7) is the only entry
          that differs from the aligned SSE load costs above (6) -- confirm
          this is intended.  */
1150   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
1151   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1152   3,					/* cost of moving SSE register to integer.  */
1153   4, 4,					/* Gather load static, per_elt.  */
1154   4, 4,					/* Gather store static, per_elt.  */
1155   64,					/* size of l1 cache.  */
1156   512,					/* size of l2 cache.  */
1157   64,					/* size of prefetch block */
1158   /* New AMD processors never drop prefetches; if they cannot be performed
1159      immediately, they are queued.  We set number of simultaneous prefetches
1160      to a large constant to reflect this (it probably is not a good idea not
1161      to limit number of prefetches at all, as their execution also takes some
1162      time).  */
1163   100,					/* number of parallel prefetches */
1164   2,					/* Branch cost */
1165   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1166   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1167   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1168   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1169   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1170   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1171 
1172   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1173   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1174   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1175   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1176   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
1177   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
1178   /* 11-16  */
1179   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
1180   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
1181   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1182   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
1183   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1184   amdfam10_memcpy,
1185   amdfam10_memset,
1186   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1187   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1188   "32:25:8",				/* Loop alignment.  */
1189   "32:8:8",				/* Jump alignment.  */
1190   "0:0:8",				/* Label alignment.  */
1191   "32",					/* Func alignment.  */
1192 };
1193 
1194 /*  BDVER has an optimized REP instruction for medium sized blocks, but for
1195     very small blocks it is better to use a loop.  For large blocks, libcall
1196     can do non-temporal accesses and beat inline expansion considerably.  */
/* memcpy expansion strategies referenced by bdver_cost; the contents are
   the same as k8_memcpy.  Entry 0 closes with rep_prefix_4_byte rather
   than libcall.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
1197 static stringop_algs bdver_memcpy[2] = {
1198   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1199              {-1, rep_prefix_4_byte, false}}},
1200   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1201              {-1, libcall, false}}}};
/* memset expansion strategies referenced by bdver_cost; the contents are
   the same as k8_memset.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   tuning, as labelled on the znver1 tables in this file -- confirm.  */
1202 static stringop_algs bdver_memset[2] = {
1203   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1204              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1205   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1206              {-1, libcall, false}}}};
1207 
/* Operation cost table for BDVER.  This is a positional initializer: the
   entries must stay in the field order of struct processor_costs, and each
   value is identified only by its trailing comment.  COSTS_N_INSNS values
   are instruction costs relative to an add (see the top of this file).
   Declared const but, unlike most tables in this file, not static.  */
1208 const struct processor_costs bdver_cost = {
1209   {
1210   /* Start of register allocator costs.  integer->integer move cost is 2. */
1211   8,				     /* cost for loading QImode using movzbl */
1212   {8, 8, 8},				/* cost of loading integer registers
1213 					   in QImode, HImode and SImode.
1214 					   Relative to reg-reg move (2).  */
1215   {8, 8, 8},				/* cost of storing integer registers */
1216   4,					/* cost of reg,reg fld/fst */
1217   {12, 12, 28},				/* cost of loading fp registers
1218 		   			   in SFmode, DFmode and XFmode */
1219   {10, 10, 18},				/* cost of storing fp registers
1220  		   			   in SFmode, DFmode and XFmode */
1221   4,					/* cost of moving MMX register */
1222   {12, 12},				/* cost of loading MMX registers
1223 					   in SImode and DImode */
1224   {10, 10},				/* cost of storing MMX registers
1225 					   in SImode and DImode */
1226   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1227   {12, 12, 10, 40, 60},			/* cost of loading SSE registers
1228 					   in 32,64,128,256 and 512-bit */
1229   {10, 10, 10, 40, 60},			/* cost of storing SSE registers
1230 					   in 32,64,128,256 and 512-bit */
1231   16, 20,				/* SSE->integer and integer->SSE moves */
1232   /* End of register allocator costs.  */
1233   },
1234 
1235   COSTS_N_INSNS (1),			/* cost of an add instruction */
1236   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1237   COSTS_N_INSNS (1),			/* variable shift costs */
1238   COSTS_N_INSNS (1),			/* constant shift costs */
1239   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1240    COSTS_N_INSNS (4),			/*				 HI */
1241    COSTS_N_INSNS (4),			/*				 SI */
1242    COSTS_N_INSNS (6),			/*				 DI */
1243    COSTS_N_INSNS (6)},			/*			      other */
1244   0,					/* cost of multiply per each bit set */
1245   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1246    COSTS_N_INSNS (35),			/*			    HI */
1247    COSTS_N_INSNS (51),			/*			    SI */
1248    COSTS_N_INSNS (83),			/*			    DI */
1249    COSTS_N_INSNS (83)},			/*			    other */
1250   COSTS_N_INSNS (1),			/* cost of movsx */
1251   COSTS_N_INSNS (1),			/* cost of movzx */
1252   8,					/* "large" insn */
1253   9,					/* MOVE_RATIO */
1254   6,					/* CLEAR_RATIO */
       /* The integer and SSE load/store costs, the XMM/YMM/ZMM move costs and
          the SSE-to-integer move cost below repeat the register allocator
          values given at the start of this table.  */
1255   {8, 8, 8},				/* cost of loading integer registers
1256 					   in QImode, HImode and SImode.
1257 					   Relative to reg-reg move (2).  */
1258   {8, 8, 8},				/* cost of storing integer registers */
1259   {12, 12, 10, 40, 60},			/* cost of loading SSE register
1260 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1261   {10, 10, 10, 40, 60},			/* cost of storing SSE register
1262 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1263   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
1264   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
1265   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1266   16,					/* cost of moving SSE register to integer.  */
1267   12, 12,				/* Gather load static, per_elt.  */
1268   10, 10,				/* Gather store static, per_elt.  */
1269   16,					/* size of l1 cache.  */
1270   2048,					/* size of l2 cache.  */
1271   64,					/* size of prefetch block */
1272   /* New AMD processors never drop prefetches; if they cannot be performed
1273      immediately, they are queued.  We set number of simultaneous prefetches
1274      to a large constant to reflect this (it probably is not a good idea not
1275      to limit number of prefetches at all, as their execution also takes some
1276      time).  */
1277   100,					/* number of parallel prefetches */
1278   2,					/* Branch cost */
1279   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1280   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1281   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1282   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1283   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1284   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1285 
1286   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1287   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1288   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1289   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1290   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1291   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1292   /* 9-24  */
1293   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1294   /* 9-27  */
1295   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1296   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1297   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1298   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1299   bdver_memcpy,
1300   bdver_memset,
1301   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1302   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1303   "16:11:8",				/* Loop alignment.  */
1304   "16:8:8",				/* Jump alignment.  */
1305   "0:0:8",				/* Label alignment.  */
1306   "11",					/* Func alignment.  */
       /* NOTE(review): "11" is not a power of two, unlike the function
          alignments "16" and "32" used by the other tables in this file --
          confirm this value is intended.  */
1307 };
1308 
1309 
1310 /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1311     very small blocks it is better to use a loop.  For large blocks, libcall
1312     can do non-temporal accesses and beat inline code considerably.  */
1313 static stringop_algs znver1_memcpy[2] = {
1314   /* 32-bit tuning.  */
1315   {libcall, {{6, loop, false},
1316 	     {14, unrolled_loop, false},
1317 	     {-1, libcall, false}}},
1318   /* 64-bit tuning.  */
1319   {libcall, {{16, loop, false},
1320 	     {128, rep_prefix_8_byte, false},
1321 	     {-1, libcall, false}}}};
1322 static stringop_algs znver1_memset[2] = {	/* memset algorithm choices for ZNVER1.  */
1323   /* 32-bit tuning.  */
1324   {libcall, {{8, loop, false},
1325 	     {24, unrolled_loop, false},
1326 	     {128, rep_prefix_4_byte, false},
1327 	     {-1, libcall, false}}},
1328   /* 64-bit tuning.  */
1329   {libcall, {{48, unrolled_loop, false},
1330 	     {128, rep_prefix_8_byte, false},
1331 	     {-1, libcall, false}}}};
1332 struct processor_costs znver1_cost = {	/* Costs used when tuning for ZNVER1.  */
1333   {
1334   /* Start of register allocator costs.  integer->integer move cost is 2. */
1335 
1336   /* reg-reg moves are done by renaming and thus they are even cheaper than
1337      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1338      to doubles of latencies, we do not model this correctly.  It does not
1339      seem to make practical difference to bump prices up even more.  */
1340   6,					/* cost for loading QImode using
1341 					   movzbl.  */
1342   {6, 6, 6},				/* cost of loading integer registers
1343 					   in QImode, HImode and SImode.
1344 					   Relative to reg-reg move (2).  */
1345   {8, 8, 8},				/* cost of storing integer
1346 					   registers.  */
1347   2,					/* cost of reg,reg fld/fst.  */
1348   {6, 6, 16},				/* cost of loading fp registers
1349 					   in SFmode, DFmode and XFmode.  */
1350   {8, 8, 16},				/* cost of storing fp registers
1351 					   in SFmode, DFmode and XFmode.  */
1352   2,					/* cost of moving MMX register.  */
1353   {6, 6},				/* cost of loading MMX registers
1354 					   in SImode and DImode.  */
1355   {8, 8},				/* cost of storing MMX registers
1356 					   in SImode and DImode.  */
1357   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1358   {6, 6, 6, 12, 24},			/* cost of loading SSE registers
1359 					   in 32,64,128,256 and 512-bit.  */
1360   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
1361 					   in 32,64,128,256 and 512-bit.  */
1362   6, 6,					/* SSE->integer and integer->SSE moves.  */
1363   /* End of register allocator costs.  */
1364   },
1365 
1366   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1367   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1368   COSTS_N_INSNS (1),			/* variable shift costs.  */
1369   COSTS_N_INSNS (1),			/* constant shift costs.  */
1370   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1371    COSTS_N_INSNS (3),			/*				 HI.  */
1372    COSTS_N_INSNS (3),			/*				 SI.  */
1373    COSTS_N_INSNS (3),			/*				 DI.  */
1374    COSTS_N_INSNS (3)},			/*			      other.  */
1375   0,					/* cost of multiply per each bit
1376 					    set.  */
1377    /* Depending on parameters, idiv can get faster on ryzen.  This is an upper
1378       bound.  */
1379   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1380    COSTS_N_INSNS (22),			/*			    HI.  */
1381    COSTS_N_INSNS (30),			/*			    SI.  */
1382    COSTS_N_INSNS (45),			/*			    DI.  */
1383    COSTS_N_INSNS (45)},			/*			    other.  */
1384   COSTS_N_INSNS (1),			/* cost of movsx.  */
1385   COSTS_N_INSNS (1),			/* cost of movzx.  */
1386   8,					/* "large" insn.  */
1387   9,					/* MOVE_RATIO.  */
1388   6,					/* CLEAR_RATIO */
1389   {6, 6, 6},				/* cost of loading integer registers
1390 					   in QImode, HImode and SImode.
1391 					   Relative to reg-reg move (2).  */
1392   {8, 8, 8},				/* cost of storing integer
1393 					   registers.  */
1394   {6, 6, 6, 12, 24},			/* cost of loading SSE register
1395 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1396   {8, 8, 8, 16, 32},			/* cost of storing SSE register
1397 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1398   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
1399   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
1400   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1401   6,					/* cost of moving SSE register to integer.  */
1402   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1403      throughput 12.  Approx 9 uops do not depend on vector size and every load
1404      is 7 uops.  */
1405   18, 8,				/* Gather load static, per_elt.  */
1406   18, 10,				/* Gather store static, per_elt.  */
1407   32,					/* size of l1 cache.  */
1408   512,					/* size of l2 cache.  */
1409   64,					/* size of prefetch block.  */
1410   /* New AMD processors never drop prefetches; if they cannot be performed
1411      immediately, they are queued.  We set number of simultaneous prefetches
1412      to a large constant to reflect this (it probably is not a good idea not
1413      to limit number of prefetches at all, as their execution also takes some
1414      time).  */
1415   100,					/* number of parallel prefetches.  */
1416   3,					/* Branch cost.  */
1417   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1418   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1419   /* Latency of fdiv is 8-15.  */
1420   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1421   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1422   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1423   /* Latency of fsqrt is 4-10.  */
1424   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1425 
1426   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1427   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1428   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1429   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1430   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1431   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1432   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1433   /* 9-13  */
1434   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1435   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1436   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1437   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1438      and it can execute 2 integer additions and 2 multiplications thus
1439      reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
1440      that 4 works better than 6 probably due to register pressure.
1441 
1442      Integer vector operations are taken by FP unit and execute 3 vector
1443      plus/minus operations per cycle but only one multiply.  This is adjusted
1444      in ix86_reassociation_width.  */
1445   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1446   znver1_memcpy,			/* memcpy algorithm table (defined above).  */
1447   znver1_memset,			/* memset algorithm table (defined above).  */
1448   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1449   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1450   "16",					/* Loop alignment.  */
1451   "16",					/* Jump alignment.  */
1452   "0:0:8",				/* Label alignment.  */
1453   "16",					/* Func alignment.  */
1454 };
1455 
1456 /*  ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
1457     very small blocks it is better to use a loop.  For large blocks, libcall
1458     can do non-temporal accesses and beat inline code considerably.  */
1459 static stringop_algs znver2_memcpy[2] = {
1460   /* 32-bit tuning.  */
1461   {libcall, {{6, loop, false},
1462 	     {14, unrolled_loop, false},
1463 	     {-1, libcall, false}}},
1464   /* 64-bit tuning.  */
1465   {libcall, {{16, loop, false},
1466 	     {64, rep_prefix_4_byte, false},
1467 	     {-1, libcall, false}}}};
1468 static stringop_algs znver2_memset[2] = {	/* memset algorithm choices for ZNVER2.  */
1469   /* 32-bit tuning.  */
1470   {libcall, {{8, loop, false},
1471 	     {24, unrolled_loop, false},
1472 	     {128, rep_prefix_4_byte, false},
1473 	     {-1, libcall, false}}},
1474   /* 64-bit tuning.  */
1475   {libcall, {{24, rep_prefix_4_byte, false},
1476 	     {128, rep_prefix_8_byte, false},
1477 	     {-1, libcall, false}}}};
1478 
1479 struct processor_costs znver2_cost = {	/* Costs used when tuning for ZNVER2.  */
1480   {
1481   /* Start of register allocator costs.  integer->integer move cost is 2. */
1482 
1483   /* reg-reg moves are done by renaming and thus they are even cheaper than
1484      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1485      to doubles of latencies, we do not model this correctly.  It does not
1486      seem to make practical difference to bump prices up even more.  */
1487   6,					/* cost for loading QImode using
1488 					   movzbl.  */
1489   {6, 6, 6},				/* cost of loading integer registers
1490 					   in QImode, HImode and SImode.
1491 					   Relative to reg-reg move (2).  */
1492   {8, 8, 8},				/* cost of storing integer
1493 					   registers.  */
1494   2,					/* cost of reg,reg fld/fst.  */
1495   {6, 6, 16},				/* cost of loading fp registers
1496 					   in SFmode, DFmode and XFmode.  */
1497   {8, 8, 16},				/* cost of storing fp registers
1498 					   in SFmode, DFmode and XFmode.  */
1499   2,					/* cost of moving MMX register.  */
1500   {6, 6},				/* cost of loading MMX registers
1501 					   in SImode and DImode.  */
1502   {8, 8},				/* cost of storing MMX registers
1503 					   in SImode and DImode.  */
1504   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1505 					   register.  */
1506   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1507 					   in 32,64,128,256 and 512-bit.  */
1508   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1509 					   in 32,64,128,256 and 512-bit.  */
1510   6, 6,					/* SSE->integer and integer->SSE
1511 					   moves.  */
1512   /* End of register allocator costs.  */
1513   },
1514 
1515   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1516   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1517   COSTS_N_INSNS (1),			/* variable shift costs.  */
1518   COSTS_N_INSNS (1),			/* constant shift costs.  */
1519   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1520    COSTS_N_INSNS (3),			/*				 HI.  */
1521    COSTS_N_INSNS (3),			/*				 SI.  */
1522    COSTS_N_INSNS (3),			/*				 DI.  */
1523    COSTS_N_INSNS (3)},			/*			      other.  */
1524   0,					/* cost of multiply per each bit
1525 					   set.  */
1526    /* Depending on parameters, idiv can get faster on ryzen.  This is an upper
1527       bound.  */
1528   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1529    COSTS_N_INSNS (22),			/*			    HI.  */
1530    COSTS_N_INSNS (30),			/*			    SI.  */
1531    COSTS_N_INSNS (45),			/*			    DI.  */
1532    COSTS_N_INSNS (45)},			/*			    other.  */
1533   COSTS_N_INSNS (1),			/* cost of movsx.  */
1534   COSTS_N_INSNS (1),			/* cost of movzx.  */
1535   8,					/* "large" insn.  */
1536   9,					/* MOVE_RATIO.  */
1537   6,					/* CLEAR_RATIO */
1538   {6, 6, 6},				/* cost of loading integer registers
1539 					   in QImode, HImode and SImode.
1540 					   Relative to reg-reg move (2).  */
1541   {8, 8, 8},				/* cost of storing integer
1542 					   registers.  */
1543   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1544 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1545   {8, 8, 8, 8, 16},			/* cost of storing SSE register
1546 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1547   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
1548   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1549   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1550 					   register.  */
1551   6,					/* cost of moving SSE register to integer.  */
1552   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1553      throughput 12.  Approx 9 uops do not depend on vector size and every load
1554      is 7 uops.  */
1555   18, 8,				/* Gather load static, per_elt.  */
1556   18, 10,				/* Gather store static, per_elt.  */
1557   32,					/* size of l1 cache.  */
1558   512,					/* size of l2 cache.  */
1559   64,					/* size of prefetch block.  */
1560   /* New AMD processors never drop prefetches; if they cannot be performed
1561      immediately, they are queued.  We set number of simultaneous prefetches
1562      to a large constant to reflect this (it probably is not a good idea not
1563      to limit number of prefetches at all, as their execution also takes some
1564      time).  */
1565   100,					/* number of parallel prefetches.  */
1566   3,					/* Branch cost.  */
1567   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1568   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1569   /* Latency of fdiv is 8-15.  */
1570   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1571   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1572   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1573   /* Latency of fsqrt is 4-10.  */
1574   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1575 
1576   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1577   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1578   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1579   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1580   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1581   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1582   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1583   /* 9-13.  */
1584   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1585   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1586   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1587   /* Zen can execute 4 integer operations per cycle.  FP operations
1588      take 3 cycles and it can execute 2 integer additions and 2
1589      multiplications thus reassociation may make sense up to width of 6.
1590      SPEC2k6 benchmarks suggest
1591      that 4 works better than 6 probably due to register pressure.
1592 
1593      Integer vector operations are taken by FP unit and execute 3 vector
1594      plus/minus operations per cycle but only one multiply.  This is adjusted
1595      in ix86_reassociation_width.  */
1596   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1597   znver2_memcpy,			/* memcpy algorithm table (defined above).  */
1598   znver2_memset,			/* memset algorithm table (defined above).  */
1599   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1600   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1601   "16",					/* Loop alignment.  */
1602   "16",					/* Jump alignment.  */
1603   "0:0:8",				/* Label alignment.  */
1604   "16",					/* Func alignment.  */
1605 };
1606 
1607 struct processor_costs znver3_cost = {	/* Costs used when tuning for ZNVER3.  */
1608   {
1609   /* Start of register allocator costs.  integer->integer move cost is 2. */
1610 
1611   /* reg-reg moves are done by renaming and thus they are even cheaper than
1612      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1613      to doubles of latencies, we do not model this correctly.  It does not
1614      seem to make practical difference to bump prices up even more.  */
1615   6,					/* cost for loading QImode using
1616 					   movzbl.  */
1617   {6, 6, 6},				/* cost of loading integer registers
1618 					   in QImode, HImode and SImode.
1619 					   Relative to reg-reg move (2).  */
1620   {8, 8, 8},				/* cost of storing integer
1621 					   registers.  */
1622   2,					/* cost of reg,reg fld/fst.  */
1623   {6, 6, 16},				/* cost of loading fp registers
1624 					   in SFmode, DFmode and XFmode.  */
1625   {8, 8, 16},				/* cost of storing fp registers
1626 					   in SFmode, DFmode and XFmode.  */
1627   2,					/* cost of moving MMX register.  */
1628   {6, 6},				/* cost of loading MMX registers
1629 					   in SImode and DImode.  */
1630   {8, 8},				/* cost of storing MMX registers
1631 					   in SImode and DImode.  */
1632   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1633 					   register.  */
1634   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1635 					   in 32,64,128,256 and 512-bit.  */
1636   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1637 					   in 32,64,128,256 and 512-bit.  */
1638   6, 6,					/* SSE->integer and integer->SSE
1639 					   moves.  */
1640   /* End of register allocator costs.  */
1641   },
1642 
1643   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1644   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1645   COSTS_N_INSNS (1),			/* variable shift costs.  */
1646   COSTS_N_INSNS (1),			/* constant shift costs.  */
1647   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1648    COSTS_N_INSNS (3),			/*				 HI.  */
1649    COSTS_N_INSNS (3),			/*				 SI.  */
1650    COSTS_N_INSNS (3),			/*				 DI.  */
1651    COSTS_N_INSNS (3)},			/*			      other.  */
1652   0,					/* cost of multiply per each bit
1653 					   set.  */
1654   {COSTS_N_INSNS (9),			/* cost of a divide/mod for QI.  */
1655    COSTS_N_INSNS (10),			/*			    HI.  */
1656    COSTS_N_INSNS (12),			/*			    SI.  */
1657    COSTS_N_INSNS (17),			/*			    DI.  */
1658    COSTS_N_INSNS (17)},			/*			    other.  */
1659   COSTS_N_INSNS (1),			/* cost of movsx.  */
1660   COSTS_N_INSNS (1),			/* cost of movzx.  */
1661   8,					/* "large" insn.  */
1662   9,					/* MOVE_RATIO.  */
1663   6,					/* CLEAR_RATIO */
1664   {6, 6, 6},				/* cost of loading integer registers
1665 					   in QImode, HImode and SImode.
1666 					   Relative to reg-reg move (2).  */
1667   {8, 8, 8},				/* cost of storing integer
1668 					   registers.  */
1669   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1670 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1671   {8, 8, 8, 8, 16},			/* cost of storing SSE register
1672 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1673   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
1674   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1675   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1676 					   register.  */
1677   6,					/* cost of moving SSE register to integer.  */
1678   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1679      throughput 12.  Approx 9 uops do not depend on vector size and every load
1680      is 7 uops.  */
1681   18, 8,				/* Gather load static, per_elt.  */
1682   18, 10,				/* Gather store static, per_elt.  */
1683   32,					/* size of l1 cache.  */
1684   512,					/* size of l2 cache.  */
1685   64,					/* size of prefetch block.  */
1686   /* New AMD processors never drop prefetches; if they cannot be performed
1687      immediately, they are queued.  We set number of simultaneous prefetches
1688      to a large constant to reflect this (it probably is not a good idea not
1689      to limit number of prefetches at all, as their execution also takes some
1690      time).  */
1691   100,					/* number of parallel prefetches.  */
1692   3,					/* Branch cost.  */
1693   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1694   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1695   /* Latency of fdiv is 8-15.  */
1696   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1697   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1698   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1699   /* Latency of fsqrt is 4-10.  */
1700   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1701 
1702   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1703   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1704   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1705   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1706   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1707   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1708   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1709   /* 9-13.  */
1710   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1711   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1712   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1713   /* Zen can execute 4 integer operations per cycle.  FP operations
1714      take 3 cycles and it can execute 2 integer additions and 2
1715      multiplications thus reassociation may make sense up to width of 6.
1716      SPEC2k6 benchmarks suggest
1717      that 4 works better than 6 probably due to register pressure.
1718 
1719      Integer vector operations are taken by FP unit and execute 3 vector
1720      plus/minus operations per cycle but only one multiply.  This is adjusted
1721      in ix86_reassociation_width.  */
1722   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1723   znver2_memcpy,			/* memcpy algorithms: the znver2 table is reused.  */
1724   znver2_memset,			/* memset algorithms: the znver2 table is reused.  */
1725   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1726   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1727   "16",					/* Loop alignment.  */
1728   "16",					/* Jump alignment.  */
1729   "0:0:8",				/* Label alignment.  */
1730   "16",					/* Func alignment.  */
1731 };
1732 
1733 /* skylake_cost should produce code tuned for the Skylake family of CPUs.  */
1734 static stringop_algs skylake_memcpy[2] =   {
1735   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1736   {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1737              {-1, libcall, false}}}};
1738 
1739 static stringop_algs skylake_memset[2] = {	/* memset algorithm choices for Skylake.  */
1740   {libcall, {{6, loop_1_byte, true},
1741              {24, loop, true},
1742              {8192, rep_prefix_4_byte, true},
1743              {-1, libcall, false}}},
1744   {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1745              {-1, libcall, false}}}};
1746 
1747 static const
1748 struct processor_costs skylake_cost = {	/* Costs used when tuning for Skylake.  */
1749   {
1750   /* Start of register allocator costs.  integer->integer move cost is 2. */
1751   6,				     /* cost for loading QImode using movzbl */
1752   {4, 4, 4},				/* cost of loading integer registers
1753 					   in QImode, HImode and SImode.
1754 					   Relative to reg-reg move (2).  */
1755   {6, 6, 6},				/* cost of storing integer registers */
1756   2,					/* cost of reg,reg fld/fst */
1757   {6, 6, 8},				/* cost of loading fp registers
1758 					   in SFmode, DFmode and XFmode */
1759   {6, 6, 10},				/* cost of storing fp registers
1760 					   in SFmode, DFmode and XFmode */
1761   2,					/* cost of moving MMX register */
1762   {6, 6},				/* cost of loading MMX registers
1763 					   in SImode and DImode */
1764   {6, 6},				/* cost of storing MMX registers
1765 					   in SImode and DImode */
1766   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1767   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1768 					   in 32,64,128,256 and 512-bit */
1769   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
1770 					   in 32,64,128,256 and 512-bit */
1771   6, 6,					/* SSE->integer and integer->SSE moves */
1772   /* End of register allocator costs.  */
1773   },
1774 
1775   COSTS_N_INSNS (1),			/* cost of an add instruction */
1776   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
1777   COSTS_N_INSNS (1),			/* variable shift costs */
1778   COSTS_N_INSNS (1),			/* constant shift costs */
1779   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1780    COSTS_N_INSNS (4),			/*				 HI */
1781    COSTS_N_INSNS (3),			/*				 SI */
1782    COSTS_N_INSNS (3),			/*				 DI */
1783    COSTS_N_INSNS (3)},			/*			      other */
1784   0,					/* cost of multiply per each bit set */
1785   /* Expanding div/mod currently doesn't consider parallelism.  So the cost
1786      model is not realistic.  We compensate by increasing the latencies a bit.  */
1787   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
1788    COSTS_N_INSNS (11),			/*			    HI */
1789    COSTS_N_INSNS (14),			/*			    SI */
1790    COSTS_N_INSNS (76),			/*			    DI */
1791    COSTS_N_INSNS (76)},			/*			    other */
1792   COSTS_N_INSNS (1),			/* cost of movsx */
1793   COSTS_N_INSNS (0),			/* cost of movzx */
1794   8,					/* "large" insn */
1795   17,					/* MOVE_RATIO */
1796   6,					/* CLEAR_RATIO */
1797   {4, 4, 4},				/* cost of loading integer registers
1798 					   in QImode, HImode and SImode.
1799 					   Relative to reg-reg move (2).  */
1800   {6, 6, 6},				/* cost of storing integer registers */
1801   {6, 6, 6, 10, 20},			/* cost of loading SSE register
1802 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1803   {8, 8, 8, 12, 24},			/* cost of storing SSE register
1804 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1805   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
1806   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1807   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1808   2,					/* cost of moving SSE register to integer.  */
1809   20, 8,				/* Gather load static, per_elt.  */
1810   22, 10,				/* Gather store static, per_elt.  */
1811   64,					/* size of l1 cache.  */
1812   512,					/* size of l2 cache.  */
1813   64,					/* size of prefetch block */
1814   6,					/* number of parallel prefetches */
1815   3,					/* Branch cost */
1816   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
1817   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1818   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1819   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1820   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1821   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
1822 
1823   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1824   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1825   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1826   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1827   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
1828   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
1829   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
1830   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
1831   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
1832   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
1833   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
1834   skylake_memcpy,			/* memcpy algorithm table (defined above).  */
1835   skylake_memset,			/* memset algorithm table (defined above).  */
1836   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1837   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1838   "16:11:8",				/* Loop alignment.  */
1839   "16:11:8",				/* Jump alignment.  */
1840   "0:0:8",				/* Label alignment.  */
1841   "16",					/* Func alignment.  */
1842 };
1843   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1844      very small blocks it is better to use a loop.  For large blocks, libcall
1845      can do non-temporal accesses and beat inline code considerably.  */
1846 static stringop_algs btver1_memcpy[2] = {
1847   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1848              {-1, rep_prefix_4_byte, false}}},
1849   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1850              {-1, libcall, false}}}};
1851 static stringop_algs btver1_memset[2] = {	/* memset algorithm choices for BTVER1.  */
1852   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1853              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1854   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1855              {-1, libcall, false}}}};
1856 const struct processor_costs btver1_cost = {	/* Costs used when tuning for BTVER1.  */
1857   {
1858   /* Start of register allocator costs.  integer->integer move cost is 2. */
1859   8,				     /* cost for loading QImode using movzbl */
1860   {6, 8, 6},				/* cost of loading integer registers
1861 					   in QImode, HImode and SImode.
1862 					   Relative to reg-reg move (2).  */
1863   {6, 8, 6},				/* cost of storing integer registers */
1864   4,					/* cost of reg,reg fld/fst */
1865   {12, 12, 28},				/* cost of loading fp registers
1866 					   in SFmode, DFmode and XFmode */
1867   {12, 12, 38},				/* cost of storing fp registers
1868 					   in SFmode, DFmode and XFmode */
1869   4,					/* cost of moving MMX register */
1870   {10, 10},				/* cost of loading MMX registers
1871 					   in SImode and DImode */
1872   {12, 12},				/* cost of storing MMX registers
1873 					   in SImode and DImode */
1874   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1875   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
1876 					   in 32,64,128,256 and 512-bit */
1877   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
1878 					   in 32,64,128,256 and 512-bit */
1879   14, 14,				/* SSE->integer and integer->SSE moves */
1880   /* End of register allocator costs.  */
1881   },
1882 
1883   COSTS_N_INSNS (1),			/* cost of an add instruction */
1884   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1885   COSTS_N_INSNS (1),			/* variable shift costs */
1886   COSTS_N_INSNS (1),			/* constant shift costs */
1887   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1888    COSTS_N_INSNS (4),			/*				 HI */
1889    COSTS_N_INSNS (3),			/*				 SI */
1890    COSTS_N_INSNS (4),			/*				 DI */
1891    COSTS_N_INSNS (5)},			/*			      other */
1892   0,					/* cost of multiply per each bit set */
1893   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1894    COSTS_N_INSNS (35),			/*			    HI */
1895    COSTS_N_INSNS (51),			/*			    SI */
1896    COSTS_N_INSNS (83),			/*			    DI */
1897    COSTS_N_INSNS (83)},			/*			    other */
1898   COSTS_N_INSNS (1),			/* cost of movsx */
1899   COSTS_N_INSNS (1),			/* cost of movzx */
1900   8,					/* "large" insn */
1901   9,					/* MOVE_RATIO */
1902   6,					/* CLEAR_RATIO */
1903   {6, 8, 6},				/* cost of loading integer registers
1904 					   in QImode, HImode and SImode.
1905 					   Relative to reg-reg move (2).  */
1906   {6, 8, 6},				/* cost of storing integer registers */
1907   {10, 10, 12, 48, 96},			/* cost of loading SSE register
1908 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1909   {10, 10, 12, 48, 96},			/* cost of storing SSE register
1910 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1911   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
1912   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
1913   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1914   14,					/* cost of moving SSE register to integer.  */
1915   10, 10,				/* Gather load static, per_elt.  */
1916   10, 10,				/* Gather store static, per_elt.  */
1917   32,					/* size of l1 cache.  */
1918   512,					/* size of l2 cache.  */
1919   64,					/* size of prefetch block */
1920   100,					/* number of parallel prefetches */
1921   2,					/* Branch cost */
1922   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1923   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1924   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1925   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1926   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1927   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1928 
1929   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1930   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1931   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
1932   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1933   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1934   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1935   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1936   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
1937   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
1938   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
1939   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1940   btver1_memcpy,			/* memcpy algorithm table (defined above).  */
1941   btver1_memset,			/* memset algorithm table (defined above).  */
1942   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1943   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1944   "16:11:8",				/* Loop alignment.  */
1945   "16:8:8",				/* Jump alignment.  */
1946   "0:0:8",				/* Label alignment.  */
1947   "11",					/* Func alignment.  */
1948 };
1949 
/* String-operation strategy table used as the memcpy entry of btver2_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
1950 static stringop_algs btver2_memcpy[2] = {
1951   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1952              {-1, rep_prefix_4_byte, false}}},
1953   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1954              {-1, libcall, false}}}};
/* String-operation strategy table used as the memset entry of btver2_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
1955 static stringop_algs btver2_memset[2] = {
1956   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1957              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1958   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1959              {-1, libcall, false}}}};
/* Operation costs for the btver2 CPU.  The initializer is positional; the
   trailing comment on each value names the field it fills.  Values written
   with COSTS_N_INSNS are instruction costs relative to an add; the bare
   integers in the leading braced group are register allocator costs
   relative to an integer reg-reg move (2).  */
1960 const struct processor_costs btver2_cost = {
1961   {
1962   /* Start of register allocator costs.  integer->integer move cost is 2. */
1963   8,				     /* cost for loading QImode using movzbl */
1964   {8, 8, 6},				/* cost of loading integer registers
1965 					   in QImode, HImode and SImode.
1966 					   Relative to reg-reg move (2).  */
1967   {8, 8, 6},				/* cost of storing integer registers */
1968   4,					/* cost of reg,reg fld/fst */
1969   {12, 12, 28},				/* cost of loading fp registers
1970 					   in SFmode, DFmode and XFmode */
1971   {12, 12, 38},				/* cost of storing fp registers
1972 					   in SFmode, DFmode and XFmode */
1973   4,					/* cost of moving MMX register */
1974   {10, 10},				/* cost of loading MMX registers
1975 					   in SImode and DImode */
1976   {12, 12},				/* cost of storing MMX registers
1977 					   in SImode and DImode */
1978   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1979   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
1980 					   in 32,64,128,256 and 512-bit */
1981   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
1982 					   in 32,64,128,256 and 512-bit */
1983   14, 14,				/* SSE->integer and integer->SSE moves */
1984   /* End of register allocator costs.  */
1985   },
1986 
1987   COSTS_N_INSNS (1),			/* cost of an add instruction */
1988   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1989   COSTS_N_INSNS (1),			/* variable shift costs */
1990   COSTS_N_INSNS (1),			/* constant shift costs */
1991   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1992    COSTS_N_INSNS (4),			/*				 HI */
1993    COSTS_N_INSNS (3),			/*				 SI */
1994    COSTS_N_INSNS (4),			/*				 DI */
1995    COSTS_N_INSNS (5)},			/*			      other */
1996   0,					/* cost of multiply per each bit set */
1997   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1998    COSTS_N_INSNS (35),			/*			    HI */
1999    COSTS_N_INSNS (51),			/*			    SI */
2000    COSTS_N_INSNS (83),			/*			    DI */
2001    COSTS_N_INSNS (83)},			/*			    other */
2002   COSTS_N_INSNS (1),			/* cost of movsx */
2003   COSTS_N_INSNS (1),			/* cost of movzx */
2004   8,					/* "large" insn */
2005   9,					/* MOVE_RATIO */
2006   6,					/* CLEAR_RATIO */
  /* The integer and SSE load/store/move entries below repeat the values
     given in the register allocator group above.  */
2007   {8, 8, 6},				/* cost of loading integer registers
2008 					   in QImode, HImode and SImode.
2009 					   Relative to reg-reg move (2).  */
2010   {8, 8, 6},				/* cost of storing integer registers */
2011   {10, 10, 12, 48, 96},			/* cost of loading SSE register
2012 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2013   {10, 10, 12, 48, 96},			/* cost of storing SSE register
2014 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2015   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
2016   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
2017   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2018   14,					/* cost of moving SSE register to integer.  */
2019   10, 10,				/* Gather load static, per_elt.  */
2020   10, 10,				/* Gather store static, per_elt.  */
2021   32,					/* size of l1 cache.  */
2022   2048,					/* size of l2 cache.  */
2023   64,					/* size of prefetch block */
2024   100,					/* number of parallel prefetches */
2025   2,					/* Branch cost */
2026   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
2027   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2028   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
2029   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2030   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2031   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
2032 
2033   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2034   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2035   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
2036   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2037   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2038   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2039   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2040   COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
2041   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
2042   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
2043   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2044   btver2_memcpy,
2045   btver2_memset,
2046   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
2047   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2048   "16:11:8",				/* Loop alignment.  */
2049   "16:8:8",				/* Jump alignment.  */
2050   "0:0:8",				/* Label alignment.  */
2051   "11",					/* Func alignment.  */
2052 };
2053 
/* String-operation strategy table used as the memcpy entry of
   pentium4_cost.  The second element is DUMMY_STRINGOP_ALGS, i.e. libcall
   for every size.  NOTE(review): presumably element 0 is the 32-bit
   strategy and element 1 the 64-bit one -- confirm against the
   stringop_algs declaration.  */
2054 static stringop_algs pentium4_memcpy[2] = {
2055   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2056   DUMMY_STRINGOP_ALGS};
/* String-operation strategy table used as the memset entry of
   pentium4_cost.  The second element is DUMMY_STRINGOP_ALGS, i.e. libcall
   for every size.  NOTE(review): presumably element 0 is the 32-bit
   strategy and element 1 the 64-bit one -- confirm against the
   stringop_algs declaration.  */
2057 static stringop_algs pentium4_memset[2] = {
2058   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2059              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2060   DUMMY_STRINGOP_ALGS};
2061 
/* Operation costs for the pentium4 CPU.  The initializer is positional;
   the trailing comment on each value names the field it fills.  Values
   written with COSTS_N_INSNS are instruction costs relative to an add; the
   bare integers in the leading braced group are register allocator costs
   relative to an integer reg-reg move (2).  */
2062 static const
2063 struct processor_costs pentium4_cost = {
2064   {
2065   /* Start of register allocator costs.  integer->integer move cost is 2. */
2066   5,				     /* cost for loading QImode using movzbl */
2067   {4, 5, 4},				/* cost of loading integer registers
2068 					   in QImode, HImode and SImode.
2069 					   Relative to reg-reg move (2).  */
2070   {2, 3, 2},				/* cost of storing integer registers */
2071   12,					/* cost of reg,reg fld/fst */
2072   {14, 14, 14},				/* cost of loading fp registers
2073 					   in SFmode, DFmode and XFmode */
2074   {14, 14, 14},				/* cost of storing fp registers
2075 					   in SFmode, DFmode and XFmode */
2076   12,					/* cost of moving MMX register */
2077   {16, 16},				/* cost of loading MMX registers
2078 					   in SImode and DImode */
2079   {16, 16},				/* cost of storing MMX registers
2080 					   in SImode and DImode */
2081   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
2082   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
2083 					   in 32,64,128,256 and 512-bit */
2084   {16, 16, 16, 32, 64},			/* cost of storing SSE registers
2085 					   in 32,64,128,256 and 512-bit */
2086   20, 12,				/* SSE->integer and integer->SSE moves */
2087   /* End of register allocator costs.  */
2088   },
2089 
2090   COSTS_N_INSNS (1),			/* cost of an add instruction */
2091   COSTS_N_INSNS (3),			/* cost of a lea instruction */
2092   COSTS_N_INSNS (4),			/* variable shift costs */
2093   COSTS_N_INSNS (4),			/* constant shift costs */
2094   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
2095    COSTS_N_INSNS (15),			/*				 HI */
2096    COSTS_N_INSNS (15),			/*				 SI */
2097    COSTS_N_INSNS (15),			/*				 DI */
2098    COSTS_N_INSNS (15)},			/*			      other */
2099   0,					/* cost of multiply per each bit set */
2100   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
2101    COSTS_N_INSNS (56),			/*			    HI */
2102    COSTS_N_INSNS (56),			/*			    SI */
2103    COSTS_N_INSNS (56),			/*			    DI */
2104    COSTS_N_INSNS (56)},			/*			    other */
2105   COSTS_N_INSNS (1),			/* cost of movsx */
2106   COSTS_N_INSNS (1),			/* cost of movzx */
2107   16,					/* "large" insn */
2108   6,					/* MOVE_RATIO */
2109   6,					/* CLEAR_RATIO */
  /* The integer and SSE load/store/move entries below repeat the values
     given in the register allocator group above.  */
2110   {4, 5, 4},				/* cost of loading integer registers
2111 					   in QImode, HImode and SImode.
2112 					   Relative to reg-reg move (2).  */
2113   {2, 3, 2},				/* cost of storing integer registers */
2114   {16, 16, 16, 32, 64},			/* cost of loading SSE register
2115 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2116   {16, 16, 16, 32, 64},			/* cost of storing SSE register
2117 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2118   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
2119   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
2120   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
2121   20,					/* cost of moving SSE register to integer.  */
2122   16, 16,				/* Gather load static, per_elt.  */
2123   16, 16,				/* Gather store static, per_elt.  */
2124   8,					/* size of l1 cache.  */
2125   256,					/* size of l2 cache.  */
2126   64,					/* size of prefetch block */
2127   6,					/* number of parallel prefetches */
2128   2,					/* Branch cost */
2129   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
2130   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
2131   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
2132   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2133   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2134   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
2135 
2136   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
2137   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2138   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
2139   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
2140   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2141   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2142   COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
2143   COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
2144   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
2145   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
2146   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2147   pentium4_memcpy,
2148   pentium4_memset,
2149   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2150   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2151   NULL,					/* Loop alignment.  */
2152   NULL,					/* Jump alignment.  */
2153   NULL,					/* Label alignment.  */
2154   NULL,					/* Func alignment.  */
2155 };
2156 
/* String-operation strategy table used as the memcpy entry of nocona_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2157 static stringop_algs nocona_memcpy[2] = {
2158   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2159   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2160              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2161 
/* String-operation strategy table used as the memset entry of nocona_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2162 static stringop_algs nocona_memset[2] = {
2163   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2164              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2165   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2166              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2167 
/* Operation costs for the nocona CPU.  The initializer is positional; the
   trailing comment on each value names the field it fills.  Values written
   with COSTS_N_INSNS are instruction costs relative to an add; the bare
   integers in the leading braced group are register allocator costs
   relative to an integer reg-reg move (2).  */
2168 static const
2169 struct processor_costs nocona_cost = {
2170   {
2171   /* Start of register allocator costs.  integer->integer move cost is 2. */
2172   4,				     /* cost for loading QImode using movzbl */
2173   {4, 4, 4},				/* cost of loading integer registers
2174 					   in QImode, HImode and SImode.
2175 					   Relative to reg-reg move (2).  */
2176   {4, 4, 4},				/* cost of storing integer registers */
2177   12,					/* cost of reg,reg fld/fst */
2178   {14, 14, 14},				/* cost of loading fp registers
2179 					   in SFmode, DFmode and XFmode */
2180   {14, 14, 14},				/* cost of storing fp registers
2181 					   in SFmode, DFmode and XFmode */
2182   14,					/* cost of moving MMX register */
2183   {12, 12},				/* cost of loading MMX registers
2184 					   in SImode and DImode */
2185   {12, 12},				/* cost of storing MMX registers
2186 					   in SImode and DImode */
2187   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
2188   {12, 12, 12, 24, 48},			/* cost of loading SSE registers
2189 					   in 32,64,128,256 and 512-bit */
2190   {12, 12, 12, 24, 48},			/* cost of storing SSE registers
2191 					   in 32,64,128,256 and 512-bit */
2192   20, 12,				/* SSE->integer and integer->SSE moves */
2193   /* End of register allocator costs.  */
2194   },
2195 
2196   COSTS_N_INSNS (1),			/* cost of an add instruction */
2197   COSTS_N_INSNS (1),			/* cost of a lea instruction */
2198   COSTS_N_INSNS (1),			/* variable shift costs */
2199   COSTS_N_INSNS (1),			/* constant shift costs */
2200   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
2201    COSTS_N_INSNS (10),			/*				 HI */
2202    COSTS_N_INSNS (10),			/*				 SI */
2203    COSTS_N_INSNS (10),			/*				 DI */
2204    COSTS_N_INSNS (10)},			/*			      other */
2205   0,					/* cost of multiply per each bit set */
2206   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
2207    COSTS_N_INSNS (66),			/*			    HI */
2208    COSTS_N_INSNS (66),			/*			    SI */
2209    COSTS_N_INSNS (66),			/*			    DI */
2210    COSTS_N_INSNS (66)},			/*			    other */
2211   COSTS_N_INSNS (1),			/* cost of movsx */
2212   COSTS_N_INSNS (1),			/* cost of movzx */
2213   16,					/* "large" insn */
2214   17,					/* MOVE_RATIO */
2215   6,					/* CLEAR_RATIO */
  /* The integer and SSE load/store/move entries below repeat the values
     given in the register allocator group above.  */
2216   {4, 4, 4},				/* cost of loading integer registers
2217 					   in QImode, HImode and SImode.
2218 					   Relative to reg-reg move (2).  */
2219   {4, 4, 4},				/* cost of storing integer registers */
2220   {12, 12, 12, 24, 48},			/* cost of loading SSE register
2221 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2222   {12, 12, 12, 24, 48},			/* cost of storing SSE register
2223 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2224   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
2225   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
2226   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
2227   20,					/* cost of moving SSE register to integer.  */
2228   12, 12,				/* Gather load static, per_elt.  */
2229   12, 12,				/* Gather store static, per_elt.  */
2230   8,					/* size of l1 cache.  */
2231   1024,					/* size of l2 cache.  */
2232   64,					/* size of prefetch block */
2233   8,					/* number of parallel prefetches */
2234   1,					/* Branch cost */
2235   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
2236   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2237   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
2238   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
2239   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
2240   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
2241 
2242   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
2243   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2244   COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
2245   COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
2246   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
2247   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
2248   COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
2249   COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
2250   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
2251   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
2252   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2253   nocona_memcpy,
2254   nocona_memset,
2255   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2256   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2257   NULL,					/* Loop alignment.  */
2258   NULL,					/* Jump alignment.  */
2259   NULL,					/* Label alignment.  */
2260   NULL,					/* Func alignment.  */
2261 };
2262 
/* String-operation strategy table used as the memcpy entry of atom_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2263 static stringop_algs atom_memcpy[2] = {
2264   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2265   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2266              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table used as the memset entry of atom_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2267 static stringop_algs atom_memset[2] = {
2268   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2269              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2270   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2271              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation costs for the atom CPU.  The initializer is positional; the
   trailing comment on each value names the field it fills.  Values written
   with COSTS_N_INSNS are instruction costs relative to an add; the bare
   integers in the leading braced group are register allocator costs
   relative to an integer reg-reg move (2).  */
2272 static const
2273 struct processor_costs atom_cost = {
2274   {
2275   /* Start of register allocator costs.  integer->integer move cost is 2. */
2276   6,					/* cost for loading QImode using movzbl */
2277   {6, 6, 6},				/* cost of loading integer registers
2278 					   in QImode, HImode and SImode.
2279 					   Relative to reg-reg move (2).  */
2280   {6, 6, 6},				/* cost of storing integer registers */
2281   4,					/* cost of reg,reg fld/fst */
2282   {6, 6, 18},				/* cost of loading fp registers
2283 					   in SFmode, DFmode and XFmode */
2284   {14, 14, 24},				/* cost of storing fp registers
2285 					   in SFmode, DFmode and XFmode */
2286   2,					/* cost of moving MMX register */
2287   {8, 8},				/* cost of loading MMX registers
2288 					   in SImode and DImode */
2289   {10, 10},				/* cost of storing MMX registers
2290 					   in SImode and DImode */
2291   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2292   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2293 					   in 32,64,128,256 and 512-bit */
2294   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2295 					   in 32,64,128,256 and 512-bit */
2296   8, 6,					/* SSE->integer and integer->SSE moves */
2297   /* End of register allocator costs.  */
2298   },
2299 
2300   COSTS_N_INSNS (1),			/* cost of an add instruction */
2301   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2302   COSTS_N_INSNS (1),			/* variable shift costs */
2303   COSTS_N_INSNS (1),			/* constant shift costs */
2304   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2305    COSTS_N_INSNS (4),			/*				 HI */
2306    COSTS_N_INSNS (3),			/*				 SI */
2307    COSTS_N_INSNS (4),			/*				 DI */
2308    COSTS_N_INSNS (2)},			/*			      other */
2309   0,					/* cost of multiply per each bit set */
2310   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2311    COSTS_N_INSNS (26),			/*			    HI */
2312    COSTS_N_INSNS (42),			/*			    SI */
2313    COSTS_N_INSNS (74),			/*			    DI */
2314    COSTS_N_INSNS (74)},			/*			    other */
2315   COSTS_N_INSNS (1),			/* cost of movsx */
2316   COSTS_N_INSNS (1),			/* cost of movzx */
2317   8,					/* "large" insn */
2318   17,					/* MOVE_RATIO */
2319   6,					/* CLEAR_RATIO */
  /* The integer and SSE load/store/move entries below repeat the values
     given in the register allocator group above.  */
2320   {6, 6, 6},				/* cost of loading integer registers
2321 					   in QImode, HImode and SImode.
2322 					   Relative to reg-reg move (2).  */
2323   {6, 6, 6},				/* cost of storing integer registers */
2324   {8, 8, 8, 16, 32},			/* cost of loading SSE register
2325 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2326   {8, 8, 8, 16, 32},			/* cost of storing SSE register
2327 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2328   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2329   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2330   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2331   8,					/* cost of moving SSE register to integer.  */
2332   8, 8,					/* Gather load static, per_elt.  */
2333   8, 8,					/* Gather store static, per_elt.  */
2334   32,					/* size of l1 cache.  */
2335   256,					/* size of l2 cache.  */
2336   64,					/* size of prefetch block */
2337   6,					/* number of parallel prefetches */
2338   3,					/* Branch cost */
2339   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2340   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2341   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2342   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2343   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2344   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2345 
2346   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2347   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2348   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2349   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2350   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2351   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2352   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
2353   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
2354   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
2355   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
2356   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2357   atom_memcpy,
2358   atom_memset,
2359   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2360   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2361   "16",					/* Loop alignment.  */
2362   "16:8:8",				/* Jump alignment.  */
2363   "0:0:8",				/* Label alignment.  */
2364   "16",					/* Func alignment.  */
2365 };
2366 
/* String-operation strategy table used as the memcpy entry of slm_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2367 static stringop_algs slm_memcpy[2] = {
2368   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2369   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2370              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table used as the memset entry of slm_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2371 static stringop_algs slm_memset[2] = {
2372   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2373              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2374   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2375              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation costs for the slm CPU.  The initializer is positional; the
   trailing comment on each value names the field it fills.  Values written
   with COSTS_N_INSNS are instruction costs relative to an add; the bare
   integers in the leading braced group are register allocator costs
   relative to an integer reg-reg move (2).  */
2376 static const
2377 struct processor_costs slm_cost = {
2378   {
2379   /* Start of register allocator costs.  integer->integer move cost is 2. */
2380   8,					/* cost for loading QImode using movzbl */
2381   {8, 8, 8},				/* cost of loading integer registers
2382 					   in QImode, HImode and SImode.
2383 					   Relative to reg-reg move (2).  */
2384   {6, 6, 6},				/* cost of storing integer registers */
2385   2,					/* cost of reg,reg fld/fst */
2386   {8, 8, 18},				/* cost of loading fp registers
2387 					   in SFmode, DFmode and XFmode */
2388   {6, 6, 18},				/* cost of storing fp registers
2389 					   in SFmode, DFmode and XFmode */
2390   2,					/* cost of moving MMX register */
2391   {8, 8},				/* cost of loading MMX registers
2392 					   in SImode and DImode */
2393   {6, 6},				/* cost of storing MMX registers
2394 					   in SImode and DImode */
2395   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2396   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2397 					   in 32,64,128,256 and 512-bit */
2398   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2399 					   in 32,64,128,256 and 512-bit */
2400   8, 6,					/* SSE->integer and integer->SSE moves */
2401   /* End of register allocator costs.  */
2402   },
2403 
2404   COSTS_N_INSNS (1),			/* cost of an add instruction */
2405   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2406   COSTS_N_INSNS (1),			/* variable shift costs */
2407   COSTS_N_INSNS (1),			/* constant shift costs */
2408   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2409    COSTS_N_INSNS (3),			/*				 HI */
2410    COSTS_N_INSNS (3),			/*				 SI */
2411    COSTS_N_INSNS (4),			/*				 DI */
2412    COSTS_N_INSNS (2)},			/*			      other */
2413   0,					/* cost of multiply per each bit set */
2414   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2415    COSTS_N_INSNS (26),			/*			    HI */
2416    COSTS_N_INSNS (42),			/*			    SI */
2417    COSTS_N_INSNS (74),			/*			    DI */
2418    COSTS_N_INSNS (74)},			/*			    other */
2419   COSTS_N_INSNS (1),			/* cost of movsx */
2420   COSTS_N_INSNS (1),			/* cost of movzx */
2421   8,					/* "large" insn */
2422   17,					/* MOVE_RATIO */
2423   6,					/* CLEAR_RATIO */
  /* The integer and SSE load/store/move entries below repeat the values
     given in the register allocator group above.  */
2424   {8, 8, 8},				/* cost of loading integer registers
2425 					   in QImode, HImode and SImode.
2426 					   Relative to reg-reg move (2).  */
2427   {6, 6, 6},				/* cost of storing integer registers */
2428   {8, 8, 8, 16, 32},			/* cost of loading SSE register
2429 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2430   {8, 8, 8, 16, 32},			/* cost of storing SSE register
2431 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2432   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2433   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2434   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2435   8,					/* cost of moving SSE register to integer.  */
2436   8, 8,					/* Gather load static, per_elt.  */
2437   8, 8,					/* Gather store static, per_elt.  */
2438   32,					/* size of l1 cache.  */
2439   256,					/* size of l2 cache.  */
2440   64,					/* size of prefetch block */
2441   6,					/* number of parallel prefetches */
2442   3,					/* Branch cost */
2443   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2444   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2445   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2446   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2447   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2448   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2449 
2450   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2451   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2452   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2453   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2454   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2455   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2456   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
2457   COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
2458   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
2459   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
2460   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2461   slm_memcpy,
2462   slm_memset,
2463   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2464   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2465   "16",					/* Loop alignment.  */
2466   "16:8:8",				/* Jump alignment.  */
2467   "0:0:8",				/* Label alignment.  */
2468   "16",					/* Func alignment.  */
2469 };
2470 
/* String-operation strategy table used as the memcpy entry of intel_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2471 static stringop_algs intel_memcpy[2] = {
2472   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2473   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2474              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table used as the memset entry of intel_cost.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2475 static stringop_algs intel_memset[2] = {
2476   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2477              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2478   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2479              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation costs for the "intel" tuning target.  The initializer is
   positional; the trailing comment on each value names the field it fills.
   Values written with COSTS_N_INSNS are instruction costs relative to an
   add; the bare integers in the leading braced group are register
   allocator costs relative to an integer reg-reg move (2).  */
2480 static const
2481 struct processor_costs intel_cost = {
2482   {
2483   /* Start of register allocator costs.  integer->integer move cost is 2. */
2484   6,				     /* cost for loading QImode using movzbl */
2485   {4, 4, 4},				/* cost of loading integer registers
2486 					   in QImode, HImode and SImode.
2487 					   Relative to reg-reg move (2).  */
2488   {6, 6, 6},				/* cost of storing integer registers */
2489   2,					/* cost of reg,reg fld/fst */
2490   {6, 6, 8},				/* cost of loading fp registers
2491 					   in SFmode, DFmode and XFmode */
2492   {6, 6, 10},				/* cost of storing fp registers
2493 					   in SFmode, DFmode and XFmode */
2494   2,					/* cost of moving MMX register */
2495   {6, 6},				/* cost of loading MMX registers
2496 					   in SImode and DImode */
2497   {6, 6},				/* cost of storing MMX registers
2498 					   in SImode and DImode */
2499   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
2500   {6, 6, 6, 6, 6},			/* cost of loading SSE registers
2501 					   in 32,64,128,256 and 512-bit */
2502   {6, 6, 6, 6, 6},			/* cost of storing SSE registers
2503 					   in 32,64,128,256 and 512-bit */
2504   4, 4,					/* SSE->integer and integer->SSE moves */
2505   /* End of register allocator costs.  */
2506   },
2507 
2508   COSTS_N_INSNS (1),			/* cost of an add instruction */
2509   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2510   COSTS_N_INSNS (1),			/* variable shift costs */
2511   COSTS_N_INSNS (1),			/* constant shift costs */
2512   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2513    COSTS_N_INSNS (3),			/*				 HI */
2514    COSTS_N_INSNS (3),			/*				 SI */
2515    COSTS_N_INSNS (4),			/*				 DI */
2516    COSTS_N_INSNS (2)},			/*			      other */
2517   0,					/* cost of multiply per each bit set */
2518   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2519    COSTS_N_INSNS (26),			/*			    HI */
2520    COSTS_N_INSNS (42),			/*			    SI */
2521    COSTS_N_INSNS (74),			/*			    DI */
2522    COSTS_N_INSNS (74)},			/*			    other */
2523   COSTS_N_INSNS (1),			/* cost of movsx */
2524   COSTS_N_INSNS (1),			/* cost of movzx */
2525   8,					/* "large" insn */
2526   17,					/* MOVE_RATIO */
2527   6,					/* CLEAR_RATIO */
  /* The integer and SSE load/store/move entries below repeat the values
     given in the register allocator group above.  */
2528   {4, 4, 4},				/* cost of loading integer registers
2529 					   in QImode, HImode and SImode.
2530 					   Relative to reg-reg move (2).  */
2531   {6, 6, 6},				/* cost of storing integer registers */
2532   {6, 6, 6, 6, 6},			/* cost of loading SSE register
2533 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2534   {6, 6, 6, 6, 6},			/* cost of storing SSE register
2535 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2536   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
2537   {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
2538   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
2539   4,					/* cost of moving SSE register to integer.  */
2540   6, 6,					/* Gather load static, per_elt.  */
2541   6, 6,					/* Gather store static, per_elt.  */
2542   32,					/* size of l1 cache.  */
2543   256,					/* size of l2 cache.  */
2544   64,					/* size of prefetch block */
2545   6,					/* number of parallel prefetches */
2546   3,					/* Branch cost */
2547   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2548   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2549   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2550   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2551   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2552   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2553 
2554   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2555   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2556   COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
2557   COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
2558   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2559   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2560   COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
2561   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
2562   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
2563   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
2564   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2565   intel_memcpy,
2566   intel_memset,
2567   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2568   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2569   "16",					/* Loop alignment.  */
2570   "16:8:8",				/* Jump alignment.  */
2571   "0:0:8",				/* Label alignment.  */
2572   "16",					/* Func alignment.  */
2573 };
2574 
2575 /* Generic should produce code tuned for Core-i7 (and newer chips)
2576    and btver1 (and newer chips).  */
2577 
/* String-operation strategy table for memcpy under generic tuning.
   NOTE(review): stringop_algs is declared elsewhere; presumably element 0
   is the 32-bit strategy and element 1 the 64-bit one, each a leading
   algorithm for unknown sizes followed by {max size, algorithm, noalign}
   steps terminated by a max of -1 -- confirm against the declaration.  */
2578 static stringop_algs generic_memcpy[2] = {
2579   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2580              {-1, libcall, false}}},
2581   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2582              {-1, libcall, false}}}};
/* memset expansion strategies referenced by generic_cost.  The table has
   two entries that differ only in the rep-prefix width (4-byte vs 8-byte).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the users of stringop_algs.
   Each entry is a leading algorithm followed by {limit, algorithm, flag}
   triples, the last of which has a limit of -1.
   NOTE(review): limit is presumably the largest block size in bytes for
   which the algorithm is chosen -- verify against the stringop_algs
   declaration.  */
2583 static stringop_algs generic_memset[2] = {
2584   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2585              {-1, libcall, false}}},
2586   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2587              {-1, libcall, false}}}};
/* Processor cost table for generic tuning.  The initializer is positional,
   so entries must stay in the order expected by struct processor_costs
   (declared elsewhere).  The leading braced group holds the register
   allocator costs, expressed relative to a reg-reg move cost of 2; the
   remaining entries are the instruction costs, cache parameters, string
   operation strategies and alignment strings labelled below.  */
2588 static const
2589 struct processor_costs generic_cost = {
2590   {
2591   /* Start of register allocator costs.  integer->integer move cost is 2. */
2592   6,				     /* cost for loading QImode using movzbl */
2593   {6, 6, 6},				/* cost of loading integer registers
2594 					   in QImode, HImode and SImode.
2595 					   Relative to reg-reg move (2).  */
2596   {6, 6, 6},				/* cost of storing integer registers */
2597   4,					/* cost of reg,reg fld/fst */
2598   {6, 6, 12},				/* cost of loading fp registers
2599 					   in SFmode, DFmode and XFmode */
2600   {6, 6, 12},				/* cost of storing fp registers
2601 					   in SFmode, DFmode and XFmode */
2602   2,					/* cost of moving MMX register */
2603   {6, 6},				/* cost of loading MMX registers
2604 					   in SImode and DImode */
2605   {6, 6},				/* cost of storing MMX registers
2606 					   in SImode and DImode */
2607   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2608   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
2609 					   in 32,64,128,256 and 512-bit */
2610   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
2611 					   in 32,64,128,256 and 512-bit */
2612   6, 6,					/* SSE->integer and integer->SSE moves */
2613   /* End of register allocator costs.  */
2614   },
2615
2616   COSTS_N_INSNS (1),			/* cost of an add instruction */
2617   /* Setting cost to 2 makes our current implementation of synth_mult result in
2618      use of unnecessary temporary registers causing regression on several
2619      SPECfp benchmarks.  */
2620   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2621   COSTS_N_INSNS (1),			/* variable shift costs */
2622   COSTS_N_INSNS (1),			/* constant shift costs */
2623   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2624    COSTS_N_INSNS (4),			/*				 HI */
2625    COSTS_N_INSNS (3),			/*				 SI */
2626    COSTS_N_INSNS (4),			/*				 DI */
2627    COSTS_N_INSNS (4)},			/*			      other */
2628   0,					/* cost of multiply per each bit set */
2629   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
2630    COSTS_N_INSNS (22),			/*			    HI */
2631    COSTS_N_INSNS (30),			/*			    SI */
2632    COSTS_N_INSNS (74),			/*			    DI */
2633    COSTS_N_INSNS (74)},			/*			    other */
2634   COSTS_N_INSNS (1),			/* cost of movsx */
2635   COSTS_N_INSNS (1),			/* cost of movzx */
2636   8,					/* "large" insn */
2637   17,					/* MOVE_RATIO */
2638   6,					/* CLEAR_RATIO */
2639   {6, 6, 6},				/* cost of loading integer registers
2640 					   in QImode, HImode and SImode.
2641 					   Relative to reg-reg move (2).  */
2642   {6, 6, 6},				/* cost of storing integer registers */
2643   {6, 6, 6, 10, 15},			/* cost of loading SSE register
2644 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2645   {6, 6, 6, 10, 15},			/* cost of storing SSE register
2646 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2647   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
2648   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
2649   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2650   6,					/* cost of moving SSE register to integer.  */
2651   18, 6,				/* Gather load static, per_elt.  */
2652   18, 6,				/* Gather store static, per_elt.  */
2653   32,					/* size of l1 cache.  */
2654   512,					/* size of l2 cache.  */
2655   64,					/* size of prefetch block */
2656   6,					/* number of parallel prefetches */
2657   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2658      value is increased to perhaps more appropriate value of 5.  */
2659   3,					/* Branch cost */
2660   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2661   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2662   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
2663   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2664   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2665   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
2666
2667   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2668   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2669   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2670   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2671   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2672   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2673   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2674   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2675   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2676   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2677   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
2678   generic_memcpy,
2679   generic_memset,
2680   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
2681   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
2682   "16:11:8",				/* Loop alignment.  */
2683   "16:11:8",				/* Jump alignment.  */
2684   "0:0:8",				/* Label alignment.  */
2685   "16",					/* Func alignment.  */
2686 };
2687 
2688 /* core_cost should produce code tuned for Core family of CPUs.  */
/* memcpy expansion strategies referenced by core_cost.  The table has two
   entries; the first uses rep_prefix_4_byte and the second a loop followed
   by rep_prefix_8_byte.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the users of stringop_algs.
   Each entry is a leading algorithm followed by {limit, algorithm, flag}
   triples, the last of which has a limit of -1.
   NOTE(review): limit is presumably the largest block size in bytes for
   which the algorithm is chosen; the meaning of the boolean flag (true
   for every non-libcall strategy here) is not visible in this file --
   verify against the stringop_algs declaration.  */
2689 static stringop_algs core_memcpy[2] = {
2690   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2691   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2692              {-1, libcall, false}}}};
/* memset expansion strategies referenced by core_cost.  The table has two
   entries; the first steps through loop_1_byte, loop and rep_prefix_4_byte,
   the second through loop and rep_prefix_8_byte.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the users of stringop_algs.
   Each entry is a leading algorithm followed by {limit, algorithm, flag}
   triples, the last of which has a limit of -1.
   NOTE(review): limit is presumably the largest block size in bytes for
   which the algorithm is chosen; the meaning of the boolean flag is not
   visible in this file -- verify against the stringop_algs declaration.  */
2693 static stringop_algs core_memset[2] = {
2694   {libcall, {{6, loop_1_byte, true},
2695              {24, loop, true},
2696              {8192, rep_prefix_4_byte, true},
2697              {-1, libcall, false}}},
2698   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2699              {-1, libcall, false}}}};
2700 
/* Processor cost table for the Core family of CPUs.  The initializer is
   positional, so entries must stay in the order expected by struct
   processor_costs (declared elsewhere).  The leading braced group holds
   the register allocator costs, expressed relative to a reg-reg move cost
   of 2; the remaining entries are the instruction costs, cache parameters,
   string operation strategies and alignment strings labelled below.  */
2701 static const
2702 struct processor_costs core_cost = {
2703   {
2704   /* Start of register allocator costs.  integer->integer move cost is 2. */
2705   6,				     /* cost for loading QImode using movzbl */
2706   {4, 4, 4},				/* cost of loading integer registers
2707 					   in QImode, HImode and SImode.
2708 					   Relative to reg-reg move (2).  */
2709   {6, 6, 6},				/* cost of storing integer registers */
2710   2,					/* cost of reg,reg fld/fst */
2711   {6, 6, 8},				/* cost of loading fp registers
2712 					   in SFmode, DFmode and XFmode */
2713   {6, 6, 10},				/* cost of storing fp registers
2714 					   in SFmode, DFmode and XFmode */
2715   2,					/* cost of moving MMX register */
2716   {6, 6},				/* cost of loading MMX registers
2717 					   in SImode and DImode */
2718   {6, 6},				/* cost of storing MMX registers
2719 					   in SImode and DImode */
2720   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2721   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
2722 					   in 32,64,128,256 and 512-bit */
2723   {6, 6, 6, 6, 12},			/* cost of storing SSE registers
2724 					   in 32,64,128,256 and 512-bit */
2725   6, 6,					/* SSE->integer and integer->SSE moves */
2726   /* End of register allocator costs.  */
2727   },
2728
2729   COSTS_N_INSNS (1),			/* cost of an add instruction */
2730   /* On all chips taken into consideration lea is 2 cycles and more.  With
2731      this cost however our current implementation of synth_mult results in
2732      use of unnecessary temporary registers causing regression on several
2733      SPECfp benchmarks.  */
2734   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2735   COSTS_N_INSNS (1),			/* variable shift costs */
2736   COSTS_N_INSNS (1),			/* constant shift costs */
2737   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2738    COSTS_N_INSNS (4),			/*				 HI */
2739    COSTS_N_INSNS (3),			/*				 SI */
2740    /* Here we tune for Sandybridge or newer.  */
2741    COSTS_N_INSNS (3),			/*				 DI */
2742    COSTS_N_INSNS (3)},			/*			      other */
2743   0,					/* cost of multiply per each bit set */
2744   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2745      model is not realistic. We compensate by increasing the latencies a bit.  */
2746   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
2747    COSTS_N_INSNS (11),			/*			    HI */
2748    COSTS_N_INSNS (14),			/*			    SI */
2749    COSTS_N_INSNS (81),			/*			    DI */
2750    COSTS_N_INSNS (81)},			/*			    other */
2751   COSTS_N_INSNS (1),			/* cost of movsx */
2752   COSTS_N_INSNS (1),			/* cost of movzx */
2753   8,					/* "large" insn */
2754   17,					/* MOVE_RATIO */
2755   6,					/* CLEAR_RATIO */
2756   {4, 4, 4},				/* cost of loading integer registers
2757 					   in QImode, HImode and SImode.
2758 					   Relative to reg-reg move (2).  */
2759   {6, 6, 6},				/* cost of storing integer registers */
2760   {6, 6, 6, 6, 12},			/* cost of loading SSE register
2761 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2762   {6, 6, 6, 6, 12},			/* cost of storing SSE register
2763 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2764   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
2765   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
2766   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2767   2,					/* cost of moving SSE register to integer.  */
2768   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2769      rec. throughput 6.
2770      So 5 uops statically and one uop per load.  */
  /* NOTE(review): the comment above names VGATHERDPD twice; the second
     mention presumably refers to a different gather variant -- confirm
     against the instruction tables the figures were taken from.  */
2771   10, 6,				/* Gather load static, per_elt.  */
2772   10, 6,				/* Gather store static, per_elt.  */
2773   64,					/* size of l1 cache.  */
2774   512,					/* size of l2 cache.  */
2775   64,					/* size of prefetch block */
2776   6,					/* number of parallel prefetches */
2777   /* FIXME perhaps more appropriate value is 5.  */
2778   3,					/* Branch cost */
2779   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2780   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2781   /* 10-24; NOTE(review): presumably the FDIV latency range, of which the
     upper bound is the value used below -- confirm.  */
2782   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
2783   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2784   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2785   COSTS_N_INSNS (23),			/* cost of FSQRT instruction.  */
2786
2787   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2788   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2789   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2790   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2791   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2792   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2793   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
2794   COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
2795   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
2796   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
2797   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2798   core_memcpy,
2799   core_memset,
2800   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2801   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2802   "16:11:8",				/* Loop alignment.  */
2803   "16:11:8",				/* Jump alignment.  */
2804   "0:0:8",				/* Label alignment.  */
2805   "16",					/* Func alignment.  */
2806 };
2807 
2808