xref: /netbsd-src/external/gpl3/gcc/dist/gcc/config/i386/x86-tune-costs.h (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /* Costs of operations of individual x86 CPUs.
2    Copyright (C) 1988-2022 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10 
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19 
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 <http://www.gnu.org/licenses/>.  */
24 /* Processor costs (relative to an add) */
25 /* Cost of N bytes of code.  We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes, so COSTS_N_BYTES (2) matches COSTS_N_INSNS (1).  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27 
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}	/* placeholder stringop_algs initializer that names only libcall */
29 
30 static stringop_algs ix86_size_memcpy[2] = {	/* memcpy algorithms used by ix86_size_cost */
31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* entry 0: rep_prefix_1_byte only */
32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};	/* entry 1: same as entry 0 */
33 static stringop_algs ix86_size_memset[2] = {	/* memset algorithms used by ix86_size_cost */
34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* entry 0: rep_prefix_1_byte only */
35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};	/* entry 1: same as entry 0 */
36 
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size; insn costs below are given with COSTS_N_BYTES */
39   {
40   /* Start of register allocator costs.  integer->integer move cost is 2. */
41   2,				     /* cost for loading QImode using movzbl */
42   {2, 2, 2},				/* cost of loading integer registers
43 					   in QImode, HImode and SImode.
44 					   Relative to reg-reg move (2).  */
45   {2, 2, 2},				/* cost of storing integer registers */
46   2,					/* cost of reg,reg fld/fst */
47   {2, 2, 2},				/* cost of loading fp registers
48 					   in SFmode, DFmode and XFmode */
49   {2, 2, 2},				/* cost of storing fp registers
50 					   in SFmode, DFmode and XFmode */
51   3,					/* cost of moving MMX register */
52   {3, 3},				/* cost of loading MMX registers
53 					   in SImode and DImode */
54   {3, 3},				/* cost of storing MMX registers
55 					   in SImode and DImode */
56   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
57   {3, 3, 3, 3, 3},			/* cost of loading SSE registers
58 					   in 32,64,128,256 and 512-bit */
59   {3, 3, 3, 3, 3},			/* cost of storing SSE registers
60 					   in 32,64,128,256 and 512-bit */
61   3, 3,				/* SSE->integer and integer->SSE moves */
62   3, 3,				/* mask->integer and integer->mask moves */
63   {2, 2, 2},				/* cost of loading mask register
64 					   in QImode, HImode, SImode.  */
65   {2, 2, 2},				/* cost of storing mask register
66 					   in QImode, HImode, SImode.  */
67   2,					/* cost of moving mask register.  */
68   /* End of register allocator costs.  */
69   },
70 
71   COSTS_N_BYTES (2),			/* cost of an add instruction */
72   COSTS_N_BYTES (3),			/* cost of a lea instruction */
73   COSTS_N_BYTES (2),			/* variable shift costs */
74   COSTS_N_BYTES (3),			/* constant shift costs */
75   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
76    COSTS_N_BYTES (3),			/*				 HI */
77    COSTS_N_BYTES (3),			/*				 SI */
78    COSTS_N_BYTES (3),			/*				 DI */
79    COSTS_N_BYTES (5)},			/*			      other */
80   0,					/* cost of multiply per each bit set */
81   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
82    COSTS_N_BYTES (3),			/*			    HI */
83    COSTS_N_BYTES (3),			/*			    SI */
84    COSTS_N_BYTES (3),			/*			    DI */
85    COSTS_N_BYTES (5)},			/*			    other */
86   COSTS_N_BYTES (3),			/* cost of movsx */
87   COSTS_N_BYTES (3),			/* cost of movzx */
88   0,					/* "large" insn */
89   2,					/* MOVE_RATIO */
90   2,					/* CLEAR_RATIO */
91   {2, 2, 2},				/* cost of loading integer registers
92 					   in QImode, HImode and SImode.
93 					   Relative to reg-reg move (2).  */
94   {2, 2, 2},				/* cost of storing integer registers */
95   {3, 3, 3, 3, 3},			/* cost of loading SSE register
96 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
97   {3, 3, 3, 3, 3},			/* cost of storing SSE register
98 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
99   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
100 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
101   {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
102 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
103   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
104   3,					/* cost of moving SSE register to integer.  */
105   5, 0,					/* Gather load static, per_elt.  */
106   5, 0,					/* Gather store static, per_elt.  */
107   0,					/* size of l1 cache  */
108   0,					/* size of l2 cache  */
109   0,					/* size of prefetch block */
110   0,					/* number of parallel prefetches */
111   2,					/* Branch cost */
112   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
113   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
114   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
115   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
116   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
117   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
118 
119   COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
120   COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
121   COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
122   COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
123   COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
124   COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
125   COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
126   COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
127   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
128   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
129   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
130   ix86_size_memcpy,			/* memcpy stringop_algs table.  */
131   ix86_size_memset,			/* memset stringop_algs table.  */
132   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
133   COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
134   NULL,					/* Loop alignment.  */
135   NULL,					/* Jump alignment.  */
136   NULL,					/* Label alignment.  */
137   NULL,					/* Func alignment.  */
138 };
139 
140 /* Processor costs (relative to an add) */
141 static stringop_algs i386_memcpy[2] = {	/* memcpy algorithms used by i386_cost */
142   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* entry 0: rep_prefix_1_byte only */
143   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
144 static stringop_algs i386_memset[2] = {	/* memset algorithms used by i386_cost */
145   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},	/* entry 0: rep_prefix_1_byte only */
146   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
147 
148 static const
149 struct processor_costs i386_cost = {	/* 386 specific costs; insn costs below are given with COSTS_N_INSNS */
150   {
151   /* Start of register allocator costs.  integer->integer move cost is 2. */
152   4,				     /* cost for loading QImode using movzbl */
153   {2, 4, 2},				/* cost of loading integer registers
154 					   in QImode, HImode and SImode.
155 					   Relative to reg-reg move (2).  */
156   {2, 4, 2},				/* cost of storing integer registers */
157   2,					/* cost of reg,reg fld/fst */
158   {8, 8, 8},				/* cost of loading fp registers
159 					   in SFmode, DFmode and XFmode */
160   {8, 8, 8},				/* cost of storing fp registers
161 					   in SFmode, DFmode and XFmode */
162   2,					/* cost of moving MMX register */
163   {4, 8},				/* cost of loading MMX registers
164 					   in SImode and DImode */
165   {4, 8},				/* cost of storing MMX registers
166 					   in SImode and DImode */
167   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
168   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
169 					   in 32,64,128,256 and 512-bit */
170   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
171 					   in 32,64,128,256 and 512-bit */
172   3, 3,				/* SSE->integer and integer->SSE moves */
173   3, 3,				/* mask->integer and integer->mask moves */
174   {2, 4, 2},				/* cost of loading mask register
175 					   in QImode, HImode, SImode.  */
176   {2, 4, 2},				/* cost of storing mask register
177 					   in QImode, HImode, SImode.  */
178   2,					/* cost of moving mask register.  */
179   /* End of register allocator costs.  */
180   },
181 
182   COSTS_N_INSNS (1),			/* cost of an add instruction */
183   COSTS_N_INSNS (1),			/* cost of a lea instruction */
184   COSTS_N_INSNS (3),			/* variable shift costs */
185   COSTS_N_INSNS (2),			/* constant shift costs */
186   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
187    COSTS_N_INSNS (6),			/*				 HI */
188    COSTS_N_INSNS (6),			/*				 SI */
189    COSTS_N_INSNS (6),			/*				 DI */
190    COSTS_N_INSNS (6)},			/*			      other */
191   COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
192   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
193    COSTS_N_INSNS (23),			/*			    HI */
194    COSTS_N_INSNS (23),			/*			    SI */
195    COSTS_N_INSNS (23),			/*			    DI */
196    COSTS_N_INSNS (23)},			/*			    other */
197   COSTS_N_INSNS (3),			/* cost of movsx */
198   COSTS_N_INSNS (2),			/* cost of movzx */
199   15,					/* "large" insn */
200   3,					/* MOVE_RATIO */
201   3,					/* CLEAR_RATIO */
202   {2, 4, 2},				/* cost of loading integer registers
203 					   in QImode, HImode and SImode.
204 					   Relative to reg-reg move (2).  */
205   {2, 4, 2},				/* cost of storing integer registers */
206   {4, 8, 16, 32, 64},			/* cost of loading SSE register
207 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
208   {4, 8, 16, 32, 64},			/* cost of storing SSE register
209 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
210   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
211   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
212   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
213   3,					/* cost of moving SSE register to integer.  */
214   4, 4,					/* Gather load static, per_elt.  */
215   4, 4,					/* Gather store static, per_elt.  */
216   0,					/* size of l1 cache  */
217   0,					/* size of l2 cache  */
218   0,					/* size of prefetch block */
219   0,					/* number of parallel prefetches */
220   1,					/* Branch cost */
221   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
222   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
223   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
224   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
225   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
226   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
227 
228   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
229   COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
230   COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
231   COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
232   COSTS_N_INSNS (27),			/* cost of FMA SS instruction.  */
233   COSTS_N_INSNS (27),			/* cost of FMA SD instruction.  */
234   COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
235   COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
236   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
237   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
238   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
239   i386_memcpy,				/* memcpy stringop_algs table.  */
240   i386_memset,				/* memset stringop_algs table.  */
241   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
242   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
243   "4",					/* Loop alignment.  */
244   "4",					/* Jump alignment.  */
245   NULL,					/* Label alignment.  */
246   "4",					/* Func alignment.  */
247 };
248 
249 static stringop_algs i486_memcpy[2] = {	/* memcpy algorithms used by i486_cost */
250   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},	/* entry 0: rep_prefix_4_byte only */
251   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
252 static stringop_algs i486_memset[2] = {	/* memset algorithms used by i486_cost */
253   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},	/* entry 0: rep_prefix_4_byte only */
254   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
255 
256 static const
257 struct processor_costs i486_cost = {	/* 486 specific costs; insn costs below are given with COSTS_N_INSNS */
258   {
259   /* Start of register allocator costs.  integer->integer move cost is 2. */
260   4,				     /* cost for loading QImode using movzbl */
261   {2, 4, 2},				/* cost of loading integer registers
262 					   in QImode, HImode and SImode.
263 					   Relative to reg-reg move (2).  */
264   {2, 4, 2},				/* cost of storing integer registers */
265   2,					/* cost of reg,reg fld/fst */
266   {8, 8, 8},				/* cost of loading fp registers
267 					   in SFmode, DFmode and XFmode */
268   {8, 8, 8},				/* cost of storing fp registers
269 					   in SFmode, DFmode and XFmode */
270   2,					/* cost of moving MMX register */
271   {4, 8},				/* cost of loading MMX registers
272 					   in SImode and DImode */
273   {4, 8},				/* cost of storing MMX registers
274 					   in SImode and DImode */
275   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
276   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
277 					   in 32,64,128,256 and 512-bit */
278   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
279 					   in 32,64,128,256 and 512-bit */
280   3, 3,				/* SSE->integer and integer->SSE moves */
281   3, 3,				/* mask->integer and integer->mask moves */
282   {2, 4, 2},				/* cost of loading mask register
283 					   in QImode, HImode, SImode.  */
284   {2, 4, 2},				/* cost of storing mask register
285 					   in QImode, HImode, SImode.  */
286   2,					/* cost of moving mask register.  */
287   /* End of register allocator costs.  */
288   },
289 
290   COSTS_N_INSNS (1),			/* cost of an add instruction */
291   COSTS_N_INSNS (1),			/* cost of a lea instruction */
292   COSTS_N_INSNS (3),			/* variable shift costs */
293   COSTS_N_INSNS (2),			/* constant shift costs */
294   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
295    COSTS_N_INSNS (12),			/*				 HI */
296    COSTS_N_INSNS (12),			/*				 SI */
297    COSTS_N_INSNS (12),			/*				 DI */
298    COSTS_N_INSNS (12)},			/*			      other */
299   1,					/* cost of multiply per each bit set.  NOTE(review): bare 1 here, while i386_cost uses COSTS_N_INSNS (1) -- confirm intended.  */
300   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
301    COSTS_N_INSNS (40),			/*			    HI */
302    COSTS_N_INSNS (40),			/*			    SI */
303    COSTS_N_INSNS (40),			/*			    DI */
304    COSTS_N_INSNS (40)},			/*			    other */
305   COSTS_N_INSNS (3),			/* cost of movsx */
306   COSTS_N_INSNS (2),			/* cost of movzx */
307   15,					/* "large" insn */
308   3,					/* MOVE_RATIO */
309   3,					/* CLEAR_RATIO */
310   {2, 4, 2},				/* cost of loading integer registers
311 					   in QImode, HImode and SImode.
312 					   Relative to reg-reg move (2).  */
313   {2, 4, 2},				/* cost of storing integer registers */
314   {4, 8, 16, 32, 64},			/* cost of loading SSE register
315 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
316   {4, 8, 16, 32, 64},			/* cost of storing SSE register
317 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
318   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
319   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
320   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
321   3,					/* cost of moving SSE register to integer.  */
322   4, 4,					/* Gather load static, per_elt.  */
323   4, 4,					/* Gather store static, per_elt.  */
324   4,					/* size of l1 cache.  486 has 8kB cache
325 					   shared for code and data, so 4kB is
326 					   not really precise.  */
327   4,					/* size of l2 cache  */
328   0,					/* size of prefetch block */
329   0,					/* number of parallel prefetches */
330   1,					/* Branch cost */
331   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
332   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
333   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
334   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
335   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
336   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
337 
338   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
339   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
340   COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
341   COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
342   COSTS_N_INSNS (16),			/* cost of FMA SS instruction.  */
343   COSTS_N_INSNS (16),			/* cost of FMA SD instruction.  */
344   COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
345   COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  */
346   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
347   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
348   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
349   i486_memcpy,				/* memcpy stringop_algs table.  */
350   i486_memset,				/* memset stringop_algs table.  */
351   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
352   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
353   "16",					/* Loop alignment.  */
354   "16",					/* Jump alignment.  */
355   "0:0:8",				/* Label alignment.  */
356   "16",					/* Func alignment.  */
357 };
358 
359 static stringop_algs pentium_memcpy[2] = {	/* memcpy algorithms used by pentium_cost and lakemont_cost */
360   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* entry 0; presumably rep_prefix_4_byte up to size 256 and libcall otherwise -- confirm against the stringop_algs declaration */
361   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
362 static stringop_algs pentium_memset[2] = {	/* memset algorithms used by pentium_cost and lakemont_cost */
363   {libcall, {{-1, rep_prefix_4_byte, false}}},	/* entry 0: leading algorithm is libcall, the single size entry names rep_prefix_4_byte */
364   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
365 
366 static const
367 struct processor_costs pentium_cost = {	/* Pentium specific costs; insn costs below are given with COSTS_N_INSNS */
368   {
369   /* Start of register allocator costs.  integer->integer move cost is 2. */
370   6,				     /* cost for loading QImode using movzbl */
371   {2, 4, 2},				/* cost of loading integer registers
372 					   in QImode, HImode and SImode.
373 					   Relative to reg-reg move (2).  */
374   {2, 4, 2},				/* cost of storing integer registers */
375   2,					/* cost of reg,reg fld/fst */
376   {2, 2, 6},				/* cost of loading fp registers
377 					   in SFmode, DFmode and XFmode */
378   {4, 4, 6},				/* cost of storing fp registers
379 					   in SFmode, DFmode and XFmode */
380   8,					/* cost of moving MMX register */
381   {8, 8},				/* cost of loading MMX registers
382 					   in SImode and DImode */
383   {8, 8},				/* cost of storing MMX registers
384 					   in SImode and DImode */
385   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
386   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
387 					   in 32,64,128,256 and 512-bit */
388   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
389 					   in 32,64,128,256 and 512-bit */
390   3, 3,				/* SSE->integer and integer->SSE moves */
391   3, 3,				/* mask->integer and integer->mask moves */
392   {2, 4, 2},				/* cost of loading mask register
393 					   in QImode, HImode, SImode.  */
394   {2, 4, 2},				/* cost of storing mask register
395 					   in QImode, HImode, SImode.  */
396   2,					/* cost of moving mask register.  */
397   /* End of register allocator costs.  */
398   },
399 
400   COSTS_N_INSNS (1),			/* cost of an add instruction */
401   COSTS_N_INSNS (1),			/* cost of a lea instruction */
402   COSTS_N_INSNS (4),			/* variable shift costs */
403   COSTS_N_INSNS (1),			/* constant shift costs */
404   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
405    COSTS_N_INSNS (11),			/*				 HI */
406    COSTS_N_INSNS (11),			/*				 SI */
407    COSTS_N_INSNS (11),			/*				 DI */
408    COSTS_N_INSNS (11)},			/*			      other */
409   0,					/* cost of multiply per each bit set */
410   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
411    COSTS_N_INSNS (25),			/*			    HI */
412    COSTS_N_INSNS (25),			/*			    SI */
413    COSTS_N_INSNS (25),			/*			    DI */
414    COSTS_N_INSNS (25)},			/*			    other */
415   COSTS_N_INSNS (3),			/* cost of movsx */
416   COSTS_N_INSNS (2),			/* cost of movzx */
417   8,					/* "large" insn */
418   6,					/* MOVE_RATIO */
419   6,					/* CLEAR_RATIO */
420   {2, 4, 2},				/* cost of loading integer registers
421 					   in QImode, HImode and SImode.
422 					   Relative to reg-reg move (2).  */
423   {2, 4, 2},				/* cost of storing integer registers */
424   {4, 8, 16, 32, 64},			/* cost of loading SSE register
425 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
426   {4, 8, 16, 32, 64},			/* cost of storing SSE register
427 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
428   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
429   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
430   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
431   3,					/* cost of moving SSE register to integer.  */
432   4, 4,					/* Gather load static, per_elt.  */
433   4, 4,					/* Gather store static, per_elt.  */
434   8,					/* size of l1 cache.  */
435   8,					/* size of l2 cache  */
436   0,					/* size of prefetch block */
437   0,					/* number of parallel prefetches */
438   2,					/* Branch cost */
439   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
440   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
441   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
442   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
443   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
444   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
445 
446   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
447   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
448   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
449   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
450   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
451   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
452   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
453   COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
454   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
455   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
456   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
457   pentium_memcpy,			/* memcpy stringop_algs table.  */
458   pentium_memset,			/* memset stringop_algs table.  */
459   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
460   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
461   "16:8:8",				/* Loop alignment.  */
462   "16:8:8",				/* Jump alignment.  */
463   "0:0:8",				/* Label alignment.  */
464   "16",					/* Func alignment.  */
465 };
466 
467 static const
468 struct processor_costs lakemont_cost = {	/* Lakemont specific costs; insn costs below are given with COSTS_N_INSNS */
469   {
470   /* Start of register allocator costs.  integer->integer move cost is 2. */
471   6,				     /* cost for loading QImode using movzbl */
472   {2, 4, 2},				/* cost of loading integer registers
473 					   in QImode, HImode and SImode.
474 					   Relative to reg-reg move (2).  */
475   {2, 4, 2},				/* cost of storing integer registers */
476   2,					/* cost of reg,reg fld/fst */
477   {2, 2, 6},				/* cost of loading fp registers
478 					   in SFmode, DFmode and XFmode */
479   {4, 4, 6},				/* cost of storing fp registers
480 					   in SFmode, DFmode and XFmode */
481   8,					/* cost of moving MMX register */
482   {8, 8},				/* cost of loading MMX registers
483 					   in SImode and DImode */
484   {8, 8},				/* cost of storing MMX registers
485 					   in SImode and DImode */
486   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
487   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
488 					   in 32,64,128,256 and 512-bit */
489   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
490 					   in 32,64,128,256 and 512-bit */
491   3, 3,				/* SSE->integer and integer->SSE moves */
492   3, 3,				/* mask->integer and integer->mask moves */
493   {2, 4, 2},				/* cost of loading mask register
494 					   in QImode, HImode, SImode.  */
495   {2, 4, 2},				/* cost of storing mask register
496 					   in QImode, HImode, SImode.  */
497   2,					/* cost of moving mask register.  */
498   /* End of register allocator costs.  */
499   },
500 
501   COSTS_N_INSNS (1),			/* cost of an add instruction */
502   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction; one unit above the add cost */
503   COSTS_N_INSNS (1),			/* variable shift costs */
504   COSTS_N_INSNS (1),			/* constant shift costs */
505   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
506    COSTS_N_INSNS (11),			/*				 HI */
507    COSTS_N_INSNS (11),			/*				 SI */
508    COSTS_N_INSNS (11),			/*				 DI */
509    COSTS_N_INSNS (11)},			/*			      other */
510   0,					/* cost of multiply per each bit set */
511   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
512    COSTS_N_INSNS (25),			/*			    HI */
513    COSTS_N_INSNS (25),			/*			    SI */
514    COSTS_N_INSNS (25),			/*			    DI */
515    COSTS_N_INSNS (25)},			/*			    other */
516   COSTS_N_INSNS (3),			/* cost of movsx */
517   COSTS_N_INSNS (2),			/* cost of movzx */
518   8,					/* "large" insn */
519   17,					/* MOVE_RATIO */
520   6,					/* CLEAR_RATIO */
521   {2, 4, 2},				/* cost of loading integer registers
522 					   in QImode, HImode and SImode.
523 					   Relative to reg-reg move (2).  */
524   {2, 4, 2},				/* cost of storing integer registers */
525   {4, 8, 16, 32, 64},			/* cost of loading SSE register
526 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
527   {4, 8, 16, 32, 64},			/* cost of storing SSE register
528 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
529   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
530   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
531   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
532   3,					/* cost of moving SSE register to integer.  */
533   4, 4,					/* Gather load static, per_elt.  */
534   4, 4,					/* Gather store static, per_elt.  */
535   8,					/* size of l1 cache.  */
536   8,					/* size of l2 cache  */
537   0,					/* size of prefetch block */
538   0,					/* number of parallel prefetches */
539   2,					/* Branch cost */
540   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
541   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
542   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
543   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
544   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
545   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
546 
547   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
548   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
549   COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
550   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
551   COSTS_N_INSNS (10),			/* cost of FMA SS instruction.  */
552   COSTS_N_INSNS (10),			/* cost of FMA SD instruction.  */
553   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
554   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
555   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
556   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
557   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
558   pentium_memcpy,			/* memcpy stringop_algs table, shared with pentium_cost.  */
559   pentium_memset,			/* memset stringop_algs table, shared with pentium_cost.  */
560   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
561   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
562   "16:8:8",				/* Loop alignment.  */
563   "16:8:8",				/* Jump alignment.  */
564   "0:0:8",				/* Label alignment.  */
565   "16",					/* Func alignment.  */
566 };
567 
568 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
569    (we ensure the alignment).  For small blocks an inline loop is still a
570    noticeable win; for bigger blocks either rep movsl or rep movsb is the
571    way to go.  Rep movsb apparently has a more expensive startup time in the
572    CPU, but after 4K the difference is down in the noise.  */
573 static stringop_algs pentiumpro_memcpy[2] = {	/* memcpy algorithms used by pentiumpro_cost */
574   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
575                        {8192, rep_prefix_4_byte, false},
576                        {-1, rep_prefix_1_byte, false}}},
577   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
578 static stringop_algs pentiumpro_memset[2] = {	/* memset algorithms used by pentiumpro_cost */
579   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
580                        {8192, rep_prefix_4_byte, false},
581                        {-1, libcall, false}}},
582   DUMMY_STRINGOP_ALGS};	/* entry 1: placeholder, libcall only */
583 static const
584 struct processor_costs pentiumpro_cost = {	/* PentiumPro specific costs; insn costs below are given with COSTS_N_INSNS */
585   {
586   /* Start of register allocator costs.  integer->integer move cost is 2. */
587   2,				     /* cost for loading QImode using movzbl */
588   {4, 4, 4},				/* cost of loading integer registers
589 					   in QImode, HImode and SImode.
590 					   Relative to reg-reg move (2).  */
591   {2, 2, 2},				/* cost of storing integer registers */
592   2,					/* cost of reg,reg fld/fst */
593   {2, 2, 6},				/* cost of loading fp registers
594 					   in SFmode, DFmode and XFmode */
595   {4, 4, 6},				/* cost of storing fp registers
596 					   in SFmode, DFmode and XFmode */
597   2,					/* cost of moving MMX register */
598   {2, 2},				/* cost of loading MMX registers
599 					   in SImode and DImode */
600   {2, 2},				/* cost of storing MMX registers
601 					   in SImode and DImode */
602   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
603   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
604 					   in 32,64,128,256 and 512-bit */
605   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
606 					   in 32,64,128,256 and 512-bit */
607   3, 3,				/* SSE->integer and integer->SSE moves */
608   3, 3,				/* mask->integer and integer->mask moves */
609   {4, 4, 4},				/* cost of loading mask register
610 					   in QImode, HImode, SImode.  */
611   {2, 2, 2},				/* cost of storing mask register
612 					   in QImode, HImode, SImode.  */
613   2,					/* cost of moving mask register.  */
614   /* End of register allocator costs.  */
615   },
616 
617   COSTS_N_INSNS (1),			/* cost of an add instruction */
618   COSTS_N_INSNS (1),			/* cost of a lea instruction */
619   COSTS_N_INSNS (1),			/* variable shift costs */
620   COSTS_N_INSNS (1),			/* constant shift costs */
621   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
622    COSTS_N_INSNS (4),			/*				 HI */
623    COSTS_N_INSNS (4),			/*				 SI */
624    COSTS_N_INSNS (4),			/*				 DI */
625    COSTS_N_INSNS (4)},			/*			      other */
626   0,					/* cost of multiply per each bit set */
627   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
628    COSTS_N_INSNS (17),			/*			    HI */
629    COSTS_N_INSNS (17),			/*			    SI */
630    COSTS_N_INSNS (17),			/*			    DI */
631    COSTS_N_INSNS (17)},			/*			    other */
632   COSTS_N_INSNS (1),			/* cost of movsx */
633   COSTS_N_INSNS (1),			/* cost of movzx */
634   8,					/* "large" insn */
635   6,					/* MOVE_RATIO */
636   6,					/* CLEAR_RATIO */
637   {4, 4, 4},				/* cost of loading integer registers
638 					   in QImode, HImode and SImode.
639 					   Relative to reg-reg move (2).  */
640   {2, 2, 2},				/* cost of storing integer registers */
641   {4, 8, 16, 32, 64},			/* cost of loading SSE register
642 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
643   {4, 8, 16, 32, 64},			/* cost of storing SSE register
644 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
645   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
646   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
647   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
648   3,					/* cost of moving SSE register to integer.  */
649   4, 4,					/* Gather load static, per_elt.  */
650   4, 4,					/* Gather store static, per_elt.  */
651   8,					/* size of l1 cache.  */
652   256,					/* size of l2 cache  */
653   32,					/* size of prefetch block */
654   6,					/* number of parallel prefetches */
655   2,					/* Branch cost */
656   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
657   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
658   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
659   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
660   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
661   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
662 
663   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
664   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
665   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
666   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
667   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
668   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
669   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
670   COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
671   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
672   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
673   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
674   pentiumpro_memcpy,			/* memcpy stringop_algs table.  */
675   pentiumpro_memset,			/* memset stringop_algs table.  */
676   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
677   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
678   "16",					/* Loop alignment.  */
679   "16:11:8",				/* Jump alignment.  */
680   "0:0:8",				/* Label alignment.  */
681   "16",					/* Func alignment.  */
682 };
683 
/* memcpy expansion strategies referenced by geode_cost.  Entry [0] lists
   rep_prefix_4_byte with a bound of 256 followed by a libcall catch-all
   (-1); entry [1] is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the bound is presumably a maximum block size in bytes and
   -1 "no upper bound" -- confirm against the stringop_algs definition.  */
684 static stringop_algs geode_memcpy[2] = {
685   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
686   DUMMY_STRINGOP_ALGS};
/* memset expansion strategies referenced by geode_cost; the entries are
   identical to those of geode_memcpy (rep_prefix_4_byte with a bound of
   256, then libcall; second entry is the DUMMY_STRINGOP_ALGS
   placeholder).  */
687 static stringop_algs geode_memset[2] = {
688   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
689   DUMMY_STRINGOP_ALGS};
/* Cost table geode_cost.  The first brace group holds the register
   allocator move costs, expressed relative to an integer reg-reg move
   of 2; the COSTS_N_INSNS entries that follow are instruction costs
   relative to an add.  The initializer is positional, so the order of
   the entries must not be changed.  */
690 static const
691 struct processor_costs geode_cost = {
692   {
693   /* Start of register allocator costs.  integer->integer move cost is 2. */
694   2,				     /* cost for loading QImode using movzbl */
695   {2, 2, 2},				/* cost of loading integer registers
696 					   in QImode, HImode and SImode.
697 					   Relative to reg-reg move (2).  */
698   {2, 2, 2},				/* cost of storing integer registers */
699   2,					/* cost of reg,reg fld/fst */
700   {2, 2, 2},				/* cost of loading fp registers
701 					   in SFmode, DFmode and XFmode */
702   {4, 6, 6},				/* cost of storing fp registers
703 					   in SFmode, DFmode and XFmode */
704   2,					/* cost of moving MMX register */
705   {2, 2},				/* cost of loading MMX registers
706 					   in SImode and DImode */
707   {2, 2},				/* cost of storing MMX registers
708 					   in SImode and DImode */
709   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
710   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
711 					   in 32,64,128,256 and 512-bit */
712   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
713 					   in 32,64,128,256 and 512-bit */
714   6, 6,				/* SSE->integer and integer->SSE moves */
715   6, 6,				/* mask->integer and integer->mask moves */
716   {2, 2, 2},				/* cost of loading mask register
717 					   in QImode, HImode, SImode.  */
718   {2, 2, 2},				/* cost of storing mask register
719 					   in QImode, HImode, SImode.  */
720   2,					/* cost of moving mask register.  */
721   /* End of register allocator costs.  */
722   },
723 
724   COSTS_N_INSNS (1),			/* cost of an add instruction */
725   COSTS_N_INSNS (1),			/* cost of a lea instruction */
726   COSTS_N_INSNS (2),			/* variable shift costs */
727   COSTS_N_INSNS (1),			/* constant shift costs */
728   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
729    COSTS_N_INSNS (4),			/*				 HI */
730    COSTS_N_INSNS (7),			/*				 SI */
731    COSTS_N_INSNS (7),			/*				 DI */
732    COSTS_N_INSNS (7)},			/*			      other */
733   0,					/* cost of multiply per each bit set */
734   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
735    COSTS_N_INSNS (23),			/*			    HI */
736    COSTS_N_INSNS (39),			/*			    SI */
737    COSTS_N_INSNS (39),			/*			    DI */
738    COSTS_N_INSNS (39)},			/*			    other */
739   COSTS_N_INSNS (1),			/* cost of movsx */
740   COSTS_N_INSNS (1),			/* cost of movzx */
741   8,					/* "large" insn */
742   4,					/* MOVE_RATIO */
743   4,					/* CLEAR_RATIO */
744   {2, 2, 2},				/* cost of loading integer registers
745 					   in QImode, HImode and SImode.
746 					   Relative to reg-reg move (2).  */
747   {2, 2, 2},				/* cost of storing integer registers */
748   {2, 2, 8, 16, 32},			/* cost of loading SSE register
749 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
750   {2, 2, 8, 16, 32},			/* cost of storing SSE register
751 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
752   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
753   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
754   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
755   6,					/* cost of moving SSE register to integer.  */
756   2, 2,					/* Gather load static, per_elt.  */
757   2, 2,					/* Gather store static, per_elt.  */
758   64,					/* size of l1 cache.  */
759   128,					/* size of l2 cache.  */
760   32,					/* size of prefetch block */
761   1,					/* number of parallel prefetches */
762   1,					/* Branch cost */
763   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
764   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
765   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
766   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
767   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
768   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
769 
770   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
771   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
772   COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
773   COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
774   COSTS_N_INSNS (17),			/* cost of FMA SS instruction.  */
775   COSTS_N_INSNS (17),			/* cost of FMA SD instruction.  */
776   COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
777   COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
778   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
779   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
780   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
781   geode_memcpy,				/* memcpy stringop_algs table.  */
782   geode_memset,				/* memset stringop_algs table.  */
783   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
784   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
785   NULL,					/* Loop alignment.  */
786   NULL,					/* Jump alignment.  */
787   NULL,					/* Label alignment.  */
788   NULL,					/* Func alignment.  */
789 };
790 
/* memcpy expansion strategies referenced by k6_cost.  Entry [0] lists
   rep_prefix_4_byte with a bound of 256 followed by a libcall catch-all
   (-1); entry [1] is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the bound is presumably a maximum block size in bytes --
   confirm against the stringop_algs definition.  */
791 static stringop_algs k6_memcpy[2] = {
792   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
793   DUMMY_STRINGOP_ALGS};
/* memset expansion strategies referenced by k6_cost; the entries are
   identical to those of k6_memcpy (rep_prefix_4_byte with a bound of 256,
   then libcall; second entry is the DUMMY_STRINGOP_ALGS placeholder).  */
794 static stringop_algs k6_memset[2] = {
795   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
796   DUMMY_STRINGOP_ALGS};
/* Cost table k6_cost.  The first brace group holds the register
   allocator move costs, expressed relative to an integer reg-reg move
   of 2; the COSTS_N_INSNS entries that follow are instruction costs
   relative to an add.  The initializer is positional, so the order of
   the entries must not be changed.  */
797 static const
798 struct processor_costs k6_cost = {
799   {
800   /* Start of register allocator costs.  integer->integer move cost is 2. */
801   3,				     /* cost for loading QImode using movzbl */
802   {4, 5, 4},				/* cost of loading integer registers
803 					   in QImode, HImode and SImode.
804 					   Relative to reg-reg move (2).  */
805   {2, 3, 2},				/* cost of storing integer registers */
806   4,					/* cost of reg,reg fld/fst */
807   {6, 6, 6},				/* cost of loading fp registers
808 					   in SFmode, DFmode and XFmode */
809   {4, 4, 4},				/* cost of storing fp registers
810 					   in SFmode, DFmode and XFmode */
811   2,					/* cost of moving MMX register */
812   {2, 2},				/* cost of loading MMX registers
813 					   in SImode and DImode */
814   {2, 2},				/* cost of storing MMX registers
815 					   in SImode and DImode */
816   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
817   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
818 					   in 32,64,128,256 and 512-bit */
819   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
820 					   in 32,64,128,256 and 512-bit */
821   6, 6,				/* SSE->integer and integer->SSE moves */
822   6, 6,				/* mask->integer and integer->mask moves */
823   {4, 5, 4},				/* cost of loading mask register
824 					   in QImode, HImode, SImode.  */
825   {2, 3, 2},				/* cost of storing mask register
826 					   in QImode, HImode, SImode.  */
827   2,					/* cost of moving mask register.  */
828   /* End of register allocator costs.  */
829   },
830 
831   COSTS_N_INSNS (1),			/* cost of an add instruction */
832   COSTS_N_INSNS (2),			/* cost of a lea instruction */
833   COSTS_N_INSNS (1),			/* variable shift costs */
834   COSTS_N_INSNS (1),			/* constant shift costs */
835   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
836    COSTS_N_INSNS (3),			/*				 HI */
837    COSTS_N_INSNS (3),			/*				 SI */
838    COSTS_N_INSNS (3),			/*				 DI */
839    COSTS_N_INSNS (3)},			/*			      other */
840   0,					/* cost of multiply per each bit set */
841   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
842    COSTS_N_INSNS (18),			/*			    HI */
843    COSTS_N_INSNS (18),			/*			    SI */
844    COSTS_N_INSNS (18),			/*			    DI */
845    COSTS_N_INSNS (18)},			/*			    other */
846   COSTS_N_INSNS (2),			/* cost of movsx */
847   COSTS_N_INSNS (2),			/* cost of movzx */
848   8,					/* "large" insn */
849   4,					/* MOVE_RATIO */
850   4,					/* CLEAR_RATIO */
851   {4, 5, 4},				/* cost of loading integer registers
852 					   in QImode, HImode and SImode.
853 					   Relative to reg-reg move (2).  */
854   {2, 3, 2},				/* cost of storing integer registers */
855   {2, 2, 8, 16, 32},			/* cost of loading SSE register
856 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
857   {2, 2, 8, 16, 32},			/* cost of storing SSE register
858 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
859   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
860   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
861   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
862   6,					/* cost of moving SSE register to integer.  */
863   2, 2,					/* Gather load static, per_elt.  */
864   2, 2,					/* Gather store static, per_elt.  */
865   32,					/* size of l1 cache.  */
866   32,					/* size of l2 cache.  Some models
867 					   have integrated l2 cache, but
868 					   optimizing for k6 is not important
869 					   enough to worry about that.  */
870   32,					/* size of prefetch block */
871   1,					/* number of parallel prefetches */
872   1,					/* Branch cost */
873   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
874   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
875   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
876   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
877   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
878   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
879 
880   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
881   COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
882   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
883   COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
884   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
885   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
886   COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
887   COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
888   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
889   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
890   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
891   k6_memcpy,				/* memcpy stringop_algs table.  */
892   k6_memset,				/* memset stringop_algs table.  */
893   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
894   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
895   "32:8:8",				/* Loop alignment.  */
896   "32:8:8",				/* Jump alignment.  */
897   "0:0:8",				/* Label alignment.  */
898   "32",					/* Func alignment.  */
899 };
900 
901 /* For some reason, Athlon deals better with REP prefix (relative to loops)
902    compared to K8. Alignment becomes important after 8 bytes for memcpy and
903    128 bytes for memset.  */
/* memcpy expansion strategies referenced by athlon_cost.  Entry [0] lists
   rep_prefix_4_byte with a bound of 2048 followed by a libcall catch-all
   (-1); entry [1] is the DUMMY_STRINGOP_ALGS placeholder.  */
904 static stringop_algs athlon_memcpy[2] = {
905   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
906   DUMMY_STRINGOP_ALGS};
/* memset expansion strategies referenced by athlon_cost; the entries are
   identical to those of athlon_memcpy (rep_prefix_4_byte with a bound of
   2048, then libcall; second entry is the DUMMY_STRINGOP_ALGS
   placeholder).  */
907 static stringop_algs athlon_memset[2] = {
908   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
909   DUMMY_STRINGOP_ALGS};
/* Cost table athlon_cost.  The first brace group holds the register
   allocator move costs, expressed relative to an integer reg-reg move
   of 2; the COSTS_N_INSNS entries that follow are instruction costs
   relative to an add.  The initializer is positional, so the order of
   the entries must not be changed.  */
910 static const
911 struct processor_costs athlon_cost = {
912   {
913   /* Start of register allocator costs.  integer->integer move cost is 2. */
914   4,				     /* cost for loading QImode using movzbl */
915   {3, 4, 3},				/* cost of loading integer registers
916 					   in QImode, HImode and SImode.
917 					   Relative to reg-reg move (2).  */
918   {3, 4, 3},				/* cost of storing integer registers */
919   4,					/* cost of reg,reg fld/fst */
920   {4, 4, 12},				/* cost of loading fp registers
921 					   in SFmode, DFmode and XFmode */
922   {6, 6, 8},				/* cost of storing fp registers
923 					   in SFmode, DFmode and XFmode */
924   2,					/* cost of moving MMX register */
925   {4, 4},				/* cost of loading MMX registers
926 					   in SImode and DImode */
927   {4, 4},				/* cost of storing MMX registers
928 					   in SImode and DImode */
929   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
930   {4, 4, 12, 12, 24},			/* cost of loading SSE registers
931 					   in 32,64,128,256 and 512-bit */
932   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
933 					   in 32,64,128,256 and 512-bit */
934   5, 5,				/* SSE->integer and integer->SSE moves */
935   5, 5,				/* mask->integer and integer->mask moves */
936   {3, 4, 3},				/* cost of loading mask register
937 					   in QImode, HImode, SImode.  */
938   {3, 4, 3},				/* cost of storing mask register
939 					   in QImode, HImode, SImode.  */
940   2,					/* cost of moving mask register.  */
941   /* End of register allocator costs.  */
942   },
943 
944   COSTS_N_INSNS (1),			/* cost of an add instruction */
945   COSTS_N_INSNS (2),			/* cost of a lea instruction */
946   COSTS_N_INSNS (1),			/* variable shift costs */
947   COSTS_N_INSNS (1),			/* constant shift costs */
948   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
949    COSTS_N_INSNS (5),			/*				 HI */
950    COSTS_N_INSNS (5),			/*				 SI */
951    COSTS_N_INSNS (5),			/*				 DI */
952    COSTS_N_INSNS (5)},			/*			      other */
953   0,					/* cost of multiply per each bit set */
954   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
955    COSTS_N_INSNS (26),			/*			    HI */
956    COSTS_N_INSNS (42),			/*			    SI */
957    COSTS_N_INSNS (74),			/*			    DI */
958    COSTS_N_INSNS (74)},			/*			    other */
959   COSTS_N_INSNS (1),			/* cost of movsx */
960   COSTS_N_INSNS (1),			/* cost of movzx */
961   8,					/* "large" insn */
962   9,					/* MOVE_RATIO */
963   6,					/* CLEAR_RATIO */
964   {3, 4, 3},				/* cost of loading integer registers
965 					   in QImode, HImode and SImode.
966 					   Relative to reg-reg move (2).  */
967   {3, 4, 3},				/* cost of storing integer registers */
968   {4, 4, 12, 12, 24},			/* cost of loading SSE register
969 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
970   {4, 4, 10, 10, 20},			/* cost of storing SSE register
971 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
972   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
973   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
974   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
975   5,					/* cost of moving SSE register to integer.  */
976   4, 4,					/* Gather load static, per_elt.  */
977   4, 4,					/* Gather store static, per_elt.  */
978   64,					/* size of l1 cache.  */
979   256,					/* size of l2 cache.  */
980   64,					/* size of prefetch block */
981   6,					/* number of parallel prefetches */
982   5,					/* Branch cost */
983   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
984   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
985   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
986   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
987   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
988   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
989 
990   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
991   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
992   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
993   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
994   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
995   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
996   /* 11-16  */
997   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
998   COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
999   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1000   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
1001   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1002   athlon_memcpy,			/* memcpy stringop_algs table.  */
1003   athlon_memset,			/* memset stringop_algs table.  */
1004   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1005   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1006   "16:8:8",				/* Loop alignment.  */
1007   "16:8:8",				/* Jump alignment.  */
1008   "0:0:8",				/* Label alignment.  */
1009   "16",					/* Func alignment.  */
1010 };
1011 
1012 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1013    small blocks it is better to use a loop.  For large blocks, libcall can
1014    do non-temporal accesses and beat inline considerably.  */
/* memcpy expansion strategies referenced by k8_cost.  Entry [0] steps
   through loop, unrolled_loop and rep_prefix_4_byte; entry [1] through
   loop, rep_prefix_8_byte and libcall.
   NOTE(review): entry [0] is presumably used for 32-bit and entry [1]
   for 64-bit code, and each leading number is presumably the largest
   block size handled by that algorithm -- confirm against the
   stringop_algs definition.  */
1015 static stringop_algs k8_memcpy[2] = {
1016   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1017              {-1, rep_prefix_4_byte, false}}},
1018   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1019              {-1, libcall, false}}}};
/* memset expansion strategies referenced by k8_cost.  Entry [0] steps
   through loop, unrolled_loop, rep_prefix_4_byte and libcall; entry [1]
   through unrolled_loop, rep_prefix_8_byte and libcall.
   NOTE(review): entry [0] is presumably used for 32-bit and entry [1]
   for 64-bit code -- confirm against the stringop_algs definition.  */
1020 static stringop_algs k8_memset[2] = {
1021   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1022              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1023   {libcall, {{48, unrolled_loop, false},
1024              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table k8_cost.  The first brace group holds the register
   allocator move costs, expressed relative to an integer reg-reg move
   of 2; the COSTS_N_INSNS entries that follow are instruction costs
   relative to an add.  The initializer is positional, so the order of
   the entries must not be changed.  */
1025 static const
1026 struct processor_costs k8_cost = {
1027   {
1028   /* Start of register allocator costs.  integer->integer move cost is 2. */
1029   4,				     /* cost for loading QImode using movzbl */
1030   {3, 4, 3},				/* cost of loading integer registers
1031 					   in QImode, HImode and SImode.
1032 					   Relative to reg-reg move (2).  */
1033   {3, 4, 3},				/* cost of storing integer registers */
1034   4,					/* cost of reg,reg fld/fst */
1035   {4, 4, 12},				/* cost of loading fp registers
1036 					   in SFmode, DFmode and XFmode */
1037   {6, 6, 8},				/* cost of storing fp registers
1038 					   in SFmode, DFmode and XFmode */
1039   2,					/* cost of moving MMX register */
1040   {3, 3},				/* cost of loading MMX registers
1041 					   in SImode and DImode */
1042   {4, 4},				/* cost of storing MMX registers
1043 					   in SImode and DImode */
1044   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1045   {4, 3, 12, 12, 24},			/* cost of loading SSE registers
1046 					   in 32,64,128,256 and 512-bit */
1047   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
1048 					   in 32,64,128,256 and 512-bit */
1049   5, 5,				/* SSE->integer and integer->SSE moves */
1050   5, 5,				/* mask->integer and integer->mask moves */
1051   {3, 4, 3},				/* cost of loading mask register
1052 					   in QImode, HImode, SImode.  */
1053   {3, 4, 3},				/* cost of storing mask register
1054 					   in QImode, HImode, SImode.  */
1055   2,					/* cost of moving mask register.  */
1056   /* End of register allocator costs.  */
1057   },
1058 
1059   COSTS_N_INSNS (1),			/* cost of an add instruction */
1060   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1061   COSTS_N_INSNS (1),			/* variable shift costs */
1062   COSTS_N_INSNS (1),			/* constant shift costs */
1063   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1064    COSTS_N_INSNS (4),			/*				 HI */
1065    COSTS_N_INSNS (3),			/*				 SI */
1066    COSTS_N_INSNS (4),			/*				 DI */
1067    COSTS_N_INSNS (5)},			/*			      other */
1068   0,					/* cost of multiply per each bit set */
1069   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1070    COSTS_N_INSNS (26),			/*			    HI */
1071    COSTS_N_INSNS (42),			/*			    SI */
1072    COSTS_N_INSNS (74),			/*			    DI */
1073    COSTS_N_INSNS (74)},			/*			    other */
1074   COSTS_N_INSNS (1),			/* cost of movsx */
1075   COSTS_N_INSNS (1),			/* cost of movzx */
1076   8,					/* "large" insn */
1077   9,					/* MOVE_RATIO */
1078   6,					/* CLEAR_RATIO */
1079   {3, 4, 3},				/* cost of loading integer registers
1080 					   in QImode, HImode and SImode.
1081 					   Relative to reg-reg move (2).  */
1082   {3, 4, 3},				/* cost of storing integer registers */
1083   {4, 3, 12, 12, 24},			/* cost of loading SSE register
1084 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1085   {4, 4, 10, 10, 20},			/* cost of storing SSE register
1086 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1087   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
1088   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
1089   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1090   5,					/* cost of moving SSE register to integer.  */
1091   4, 4,					/* Gather load static, per_elt.  */
1092   4, 4,					/* Gather store static, per_elt.  */
1093   64,					/* size of l1 cache.  */
1094   512,					/* size of l2 cache.  */
1095   64,					/* size of prefetch block */
1096   /* New AMD processors never drop prefetches; if they cannot be performed
1097      immediately, they are queued.  We set number of simultaneous prefetches
1098      to a large constant to reflect this (it probably is not a good idea not
1099      to limit number of prefetches at all, as their execution also takes some
1100      time).  */
1101   100,					/* number of parallel prefetches */
1102   3,					/* Branch cost */
1103   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1104   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1105   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1106   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1107   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1108   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1109 
1110   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1111   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1112   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1113   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1114   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
1115   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
1116   /* 11-16  */
1117   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
1118   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
1119   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1120   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
1121   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1122   k8_memcpy,				/* memcpy stringop_algs table.  */
1123   k8_memset,				/* memset stringop_algs table.  */
1124   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1125   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1126   "16:8:8",				/* Loop alignment.  */
1127   "16:8:8",				/* Jump alignment.  */
1128   "0:0:8",				/* Label alignment.  */
1129   "16",					/* Func alignment.  */
1130 };
1131 
1132 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1133    very small blocks it is better to use a loop.  For large blocks, libcall can
1134    do non-temporal accesses and beat inline considerably.  */
/* memcpy expansion strategies referenced by amdfam10_cost.  Entry [0]
   steps through loop, unrolled_loop and rep_prefix_4_byte; entry [1]
   through loop, rep_prefix_8_byte and libcall.
   NOTE(review): entry [0] is presumably used for 32-bit and entry [1]
   for 64-bit code -- confirm against the stringop_algs definition.  */
1135 static stringop_algs amdfam10_memcpy[2] = {
1136   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1137              {-1, rep_prefix_4_byte, false}}},
1138   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1139              {-1, libcall, false}}}};
/* memset expansion strategies referenced by amdfam10_cost.  Entry [0]
   steps through loop, unrolled_loop, rep_prefix_4_byte and libcall;
   entry [1] through unrolled_loop, rep_prefix_8_byte and libcall.
   NOTE(review): entry [0] is presumably used for 32-bit and entry [1]
   for 64-bit code -- confirm against the stringop_algs definition.  */
1140 static stringop_algs amdfam10_memset[2] = {
1141   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1142              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1143   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1144              {-1, libcall, false}}}};
/* Cost table amdfam10_cost.  The first brace group holds the register
   allocator move costs, expressed relative to an integer reg-reg move
   of 2; the COSTS_N_INSNS entries that follow are instruction costs
   relative to an add.  The initializer is positional, so the order of
   the entries must not be changed.
   NOTE(review): unlike the neighbouring cost tables, this definition
   carries neither `static' nor `const' -- confirm whether the external
   linkage and mutability are intentional before changing it.  */
1145 struct processor_costs amdfam10_cost = {
1146   {
1147   /* Start of register allocator costs.  integer->integer move cost is 2. */
1148   4,				     /* cost for loading QImode using movzbl */
1149   {3, 4, 3},				/* cost of loading integer registers
1150 					   in QImode, HImode and SImode.
1151 					   Relative to reg-reg move (2).  */
1152   {3, 4, 3},				/* cost of storing integer registers */
1153   4,					/* cost of reg,reg fld/fst */
1154   {4, 4, 12},				/* cost of loading fp registers
1155 					   in SFmode, DFmode and XFmode */
1156   {6, 6, 8},				/* cost of storing fp registers
1157 					   in SFmode, DFmode and XFmode */
1158   2,					/* cost of moving MMX register */
1159   {3, 3},				/* cost of loading MMX registers
1160 					   in SImode and DImode */
1161   {4, 4},				/* cost of storing MMX registers
1162 					   in SImode and DImode */
1163   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1164   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
1165 					   in 32,64,128,256 and 512-bit */
1166   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
1167 					   in 32,64,128,256 and 512-bit */
1168   3, 3,				/* SSE->integer and integer->SSE moves */
1169   3, 3,				/* mask->integer and integer->mask moves */
1170   {3, 4, 3},				/* cost of loading mask register
1171 					   in QImode, HImode, SImode.  */
1172   {3, 4, 3},				/* cost of storing mask register
1173 					   in QImode, HImode, SImode.  */
1174   2,					/* cost of moving mask register.  */
1175 
  /* NOTE(review): the MOVD note below presumably justifies the
     SSE<->integer move cost of 3 given above -- confirm.  */
1176   					/* On K8:
1177   					    MOVD reg64, xmmreg Double FSTORE 4
1178 					    MOVD reg32, xmmreg Double FSTORE 4
1179 					   On AMDFAM10:
1180 					    MOVD reg64, xmmreg Double FADD 3
1181 							       1/1  1/1
1182 					    MOVD reg32, xmmreg Double FADD 3
1183 							       1/1  1/1 */
1184   /* End of register allocator costs.  */
1185   },
1186 
1187   COSTS_N_INSNS (1),			/* cost of an add instruction */
1188   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1189   COSTS_N_INSNS (1),			/* variable shift costs */
1190   COSTS_N_INSNS (1),			/* constant shift costs */
1191   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1192    COSTS_N_INSNS (4),			/*				 HI */
1193    COSTS_N_INSNS (3),			/*				 SI */
1194    COSTS_N_INSNS (4),			/*				 DI */
1195    COSTS_N_INSNS (5)},			/*			      other */
1196   0,					/* cost of multiply per each bit set */
1197   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1198    COSTS_N_INSNS (35),			/*			    HI */
1199    COSTS_N_INSNS (51),			/*			    SI */
1200    COSTS_N_INSNS (83),			/*			    DI */
1201    COSTS_N_INSNS (83)},			/*			    other */
1202   COSTS_N_INSNS (1),			/* cost of movsx */
1203   COSTS_N_INSNS (1),			/* cost of movzx */
1204   8,					/* "large" insn */
1205   9,					/* MOVE_RATIO */
1206   6,					/* CLEAR_RATIO */
1207   {3, 4, 3},				/* cost of loading integer registers
1208 					   in QImode, HImode and SImode.
1209 					   Relative to reg-reg move (2).  */
1210   {3, 4, 3},				/* cost of storing integer registers */
1211   {4, 4, 3, 6, 12},			/* cost of loading SSE register
1212 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1213   {4, 4, 5, 10, 20},			/* cost of storing SSE register
1214 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1215   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
1216   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
1217   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1218   3,					/* cost of moving SSE register to integer.  */
1219   4, 4,					/* Gather load static, per_elt.  */
1220   4, 4,					/* Gather store static, per_elt.  */
1221   64,					/* size of l1 cache.  */
1222   512,					/* size of l2 cache.  */
1223   64,					/* size of prefetch block */
1224   /* New AMD processors never drop prefetches; if they cannot be performed
1225      immediately, they are queued.  We set number of simultaneous prefetches
1226      to a large constant to reflect this (it probably is not a good idea not
1227      to limit number of prefetches at all, as their execution also takes some
1228      time).  */
1229   100,					/* number of parallel prefetches */
1230   2,					/* Branch cost */
1231   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1232   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1233   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1234   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1235   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1236   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1237 
1238   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1239   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1240   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1241   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1242   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
1243   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
1244   /* 11-16  */
1245   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
1246   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
1247   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1248   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
1249   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1250   amdfam10_memcpy,			/* memcpy stringop_algs table.  */
1251   amdfam10_memset,			/* memset stringop_algs table.  */
1252   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1253   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1254   "32:25:8",				/* Loop alignment.  */
1255   "32:8:8",				/* Jump alignment.  */
1256   "0:0:8",				/* Label alignment.  */
1257   "32",					/* Func alignment.  */
1258 };
1259 
1260 /*  BDVER has an optimized REP instruction for medium-sized blocks, but for
1261     very small blocks it is better to use a loop.  For large blocks, libcall
1262     can do non-temporal accesses and beat inline considerably.  */
/* memcpy expansion strategy table for BDVER.  Entry [0] steps through
   loop, unrolled_loop and rep_prefix_4_byte; entry [1] through loop,
   rep_prefix_8_byte and libcall.
   NOTE(review): entry [0] is presumably used for 32-bit and entry [1]
   for 64-bit code -- confirm against the stringop_algs definition.  */
1263 static stringop_algs bdver_memcpy[2] = {
1264   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1265              {-1, rep_prefix_4_byte, false}}},
1266   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1267              {-1, libcall, false}}}};
/* memset expansion strategy table for BDVER.  Entry [0] steps through
   loop, unrolled_loop, rep_prefix_4_byte and libcall; entry [1] through
   unrolled_loop, rep_prefix_8_byte and libcall.
   NOTE(review): entry [0] is presumably used for 32-bit and entry [1]
   for 64-bit code -- confirm against the stringop_algs definition.  */
1268 static stringop_algs bdver_memset[2] = {
1269   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1270              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1271   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1272              {-1, libcall, false}}}};
1273 
/* Processor cost table for the bdver tuning.  The first brace group holds
   the register allocator move costs (relative to a reg-reg move of 2).  All
   entries are positional, so their order must not change; the comment on
   each line names the field it initializes.  */
1274 const struct processor_costs bdver_cost = {
1275   {
1276   /* Start of register allocator costs.  integer->integer move cost is 2. */
1277   8,				     /* cost for loading QImode using movzbl */
1278   {8, 8, 8},				/* cost of loading integer registers
1279 					   in QImode, HImode and SImode.
1280 					   Relative to reg-reg move (2).  */
1281   {8, 8, 8},				/* cost of storing integer registers */
1282   4,					/* cost of reg,reg fld/fst */
1283   {12, 12, 28},				/* cost of loading fp registers
1284 		   			   in SFmode, DFmode and XFmode */
1285   {10, 10, 18},				/* cost of storing fp registers
1286  		   			   in SFmode, DFmode and XFmode */
1287   4,					/* cost of moving MMX register */
1288   {12, 12},				/* cost of loading MMX registers
1289 					   in SImode and DImode */
1290   {10, 10},				/* cost of storing MMX registers
1291 					   in SImode and DImode */
1292   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1293   {12, 12, 10, 40, 60},			/* cost of loading SSE registers
1294 					   in 32,64,128,256 and 512-bit */
1295   {10, 10, 10, 40, 60},			/* cost of storing SSE registers
1296 					   in 32,64,128,256 and 512-bit */
1297   16, 20,				/* SSE->integer and integer->SSE moves */
1298   16, 20,				/* mask->integer and integer->mask moves */
1299   {8, 8, 8},				/* cost of loading mask register
1300 					   in QImode, HImode, SImode.  */
1301   {8, 8, 8},				/* cost of storing mask register
1302 					   in QImode, HImode, SImode.  */
1303   2,					/* cost of moving mask register.  */
1304   /* End of register allocator costs.  */
1305   },
1306 
1307   COSTS_N_INSNS (1),			/* cost of an add instruction */
1308   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1309   COSTS_N_INSNS (1),			/* variable shift costs */
1310   COSTS_N_INSNS (1),			/* constant shift costs */
1311   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1312    COSTS_N_INSNS (4),			/*				 HI */
1313    COSTS_N_INSNS (4),			/*				 SI */
1314    COSTS_N_INSNS (6),			/*				 DI */
1315    COSTS_N_INSNS (6)},			/*			      other */
1316   0,					/* cost of multiply per each bit set */
1317   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1318    COSTS_N_INSNS (35),			/*			    HI */
1319    COSTS_N_INSNS (51),			/*			    SI */
1320    COSTS_N_INSNS (83),			/*			    DI */
1321    COSTS_N_INSNS (83)},			/*			    other */
1322   COSTS_N_INSNS (1),			/* cost of movsx */
1323   COSTS_N_INSNS (1),			/* cost of movzx */
1324   8,					/* "large" insn */
1325   9,					/* MOVE_RATIO */
1326   6,					/* CLEAR_RATIO */
1327   {8, 8, 8},				/* cost of loading integer registers
1328 					   in QImode, HImode and SImode.
1329 					   Relative to reg-reg move (2).  */
1330   {8, 8, 8},				/* cost of storing integer registers */
1331   {12, 12, 10, 40, 60},			/* cost of loading SSE register
1332 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1333   {10, 10, 10, 40, 60},			/* cost of storing SSE register
1334 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1335   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
1336   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
1337   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1338   16,					/* cost of moving SSE register to integer.  */
1339   12, 12,				/* Gather load static, per_elt.  */
1340   10, 10,				/* Gather store static, per_elt.  */
1341   16,					/* size of l1 cache.  */
1342   2048,					/* size of l2 cache.  */
1343   64,					/* size of prefetch block */
1344   /* New AMD processors never drop prefetches; if they cannot be performed
1345      immediately, they are queued.  We set number of simultaneous prefetches
1346      to a large constant to reflect this (it probably is not a good idea not
1347      to limit number of prefetches at all, as their execution also takes some
1348      time).  */
1349   100,					/* number of parallel prefetches */
1350   2,					/* Branch cost */
1351   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1352   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1353   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1354   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1355   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1356   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1357 
1358   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1359   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1360   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1361   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1362   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1363   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1364   /* 9-24  */
1365   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1366   /* 9-27  */
1367   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1368   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1369   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1370   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1371   bdver_memcpy,
1372   bdver_memset,
1373   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1374   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1375   "16:11:8",				/* Loop alignment.  */
1376   "16:8:8",				/* Jump alignment.  */
1377   "0:0:8",				/* Label alignment.  */
1378   "11",					/* Func alignment.  */
1379 };
1380 
1381 
1382 /*  ZNVER1 has optimized REP instruction for medium sized blocks, but for
1383     very small blocks it is better to use loop.  For large blocks, libcall
1384     can do non-temporal accesses and beat inline considerably.  */
1385 static stringop_algs znver1_memcpy[2] = {
1386   /* 32-bit tuning.  */
1387   {libcall, {{6, loop, false},
1388 	     {14, unrolled_loop, false},
1389 	     {-1, libcall, false}}},
1390   /* 64-bit tuning.  */
1391   {libcall, {{16, loop, false},
1392 	     {128, rep_prefix_8_byte, false},
1393 	     {-1, libcall, false}}}};
/* memset expansion strategy for the znver1 tuning; referenced from
   znver1_cost.  Element [0] is the 32-bit tuning, element [1] the 64-bit
   tuning.  */
1394 static stringop_algs znver1_memset[2] = {
1395   /* 32-bit tuning.  */
1396   {libcall, {{8, loop, false},
1397 	     {24, unrolled_loop, false},
1398 	     {128, rep_prefix_4_byte, false},
1399 	     {-1, libcall, false}}},
1400   /* 64-bit tuning.  */
1401   {libcall, {{48, unrolled_loop, false},
1402 	     {128, rep_prefix_8_byte, false},
1403 	     {-1, libcall, false}}}};
/* Processor cost table for the znver1 tuning.  The first brace group holds
   the register allocator move costs (relative to a reg-reg move of 2).  All
   entries are positional, so their order must not change; the comment on
   each line names the field it initializes.  */
1404 struct processor_costs znver1_cost = {
1405   {
1406   /* Start of register allocator costs.  integer->integer move cost is 2. */
1407 
1408   /* reg-reg moves are done by renaming and thus they are even cheaper than
1409      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1410      to doubles of latencies, we do not model this correctly.  It does not
1411      seem to make practical difference to bump prices up even more.  */
1412   6,					/* cost for loading QImode using
1413 					   movzbl.  */
1414   {6, 6, 6},				/* cost of loading integer registers
1415 					   in QImode, HImode and SImode.
1416 					   Relative to reg-reg move (2).  */
1417   {8, 8, 8},				/* cost of storing integer
1418 					   registers.  */
1419   2,					/* cost of reg,reg fld/fst.  */
1420   {6, 6, 16},				/* cost of loading fp registers
1421 		   			   in SFmode, DFmode and XFmode.  */
1422   {8, 8, 16},				/* cost of storing fp registers
1423  		   			   in SFmode, DFmode and XFmode.  */
1424   2,					/* cost of moving MMX register.  */
1425   {6, 6},				/* cost of loading MMX registers
1426 					   in SImode and DImode.  */
1427   {8, 8},				/* cost of storing MMX registers
1428 					   in SImode and DImode.  */
1429   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1430   {6, 6, 6, 12, 24},			/* cost of loading SSE registers
1431 					   in 32,64,128,256 and 512-bit.  */
1432   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
1433 					   in 32,64,128,256 and 512-bit.  */
1434   6, 6,				/* SSE->integer and integer->SSE moves.  */
1435   8, 8,				/* mask->integer and integer->mask moves */
1436   {6, 6, 6},				/* cost of loading mask register
1437 					   in QImode, HImode, SImode.  */
1438   {8, 8, 8},				/* cost of storing mask register
1439 					   in QImode, HImode, SImode.  */
1440   2,					/* cost of moving mask register.  */
1441   /* End of register allocator costs.  */
1442   },
1443 
1444   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1445   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1446   COSTS_N_INSNS (1),			/* variable shift costs.  */
1447   COSTS_N_INSNS (1),			/* constant shift costs.  */
1448   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1449    COSTS_N_INSNS (3),			/*				 HI.  */
1450    COSTS_N_INSNS (3),			/*				 SI.  */
1451    COSTS_N_INSNS (3),			/*				 DI.  */
1452    COSTS_N_INSNS (3)},			/*			      other.  */
1453   0,					/* cost of multiply per each bit
1454 					    set.  */
1455    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1456       bound.  */
1457   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1458    COSTS_N_INSNS (22),			/*			    HI.  */
1459    COSTS_N_INSNS (30),			/*			    SI.  */
1460    COSTS_N_INSNS (45),			/*			    DI.  */
1461    COSTS_N_INSNS (45)},			/*			    other.  */
1462   COSTS_N_INSNS (1),			/* cost of movsx.  */
1463   COSTS_N_INSNS (1),			/* cost of movzx.  */
1464   8,					/* "large" insn.  */
1465   9,					/* MOVE_RATIO.  */
1466   6,					/* CLEAR_RATIO */
1467   {6, 6, 6},				/* cost of loading integer registers
1468 					   in QImode, HImode and SImode.
1469 					   Relative to reg-reg move (2).  */
1470   {8, 8, 8},				/* cost of storing integer
1471 					   registers.  */
1472   {6, 6, 6, 12, 24},			/* cost of loading SSE register
1473 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1474   {8, 8, 8, 16, 32},			/* cost of storing SSE register
1475 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1476   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
1477   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
1478   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1479   6,					/* cost of moving SSE register to integer.  */
1480   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
1481      throughput 12.  Approx 9 uops do not depend on vector size and every load
1482      is 7 uops.  */
1483   18, 8,				/* Gather load static, per_elt.  */
1484   18, 10,				/* Gather store static, per_elt.  */
1485   32,					/* size of l1 cache.  */
1486   512,					/* size of l2 cache.  */
1487   64,					/* size of prefetch block.  */
1488   /* New AMD processors never drop prefetches; if they cannot be performed
1489      immediately, they are queued.  We set number of simultaneous prefetches
1490      to a large constant to reflect this (it probably is not a good idea not
1491      to limit number of prefetches at all, as their execution also takes some
1492      time).  */
1493   100,					/* number of parallel prefetches.  */
1494   3,					/* Branch cost.  */
1495   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1496   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1497   /* Latency of fdiv is 8-15.  */
1498   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1499   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1500   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1501   /* Latency of fsqrt is 4-10.  */
1502   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1503 
1504   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1505   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1506   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1507   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1508   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1509   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1510   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1511   /* 9-13  */
1512   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1513   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1514   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1515   /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1516      and it can execute 2 integer additions and 2 multiplications thus
1517      reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks suggest
1518      that 4 works better than 6 probably due to register pressure.
1519 
1520      Integer vector operations are taken by FP unit and execute 3 vector
1521      plus/minus operations per cycle but only one multiply.  This is adjusted
1522      in ix86_reassociation_width.  */
1523   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1524   znver1_memcpy,
1525   znver1_memset,
1526   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1527   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1528   "16",					/* Loop alignment.  */
1529   "16",					/* Jump alignment.  */
1530   "0:0:8",				/* Label alignment.  */
1531   "16",					/* Func alignment.  */
1532 };
1533 
1534 /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
1535     very small blocks it is better to use loop.  For large blocks, libcall
1536     can do non-temporal accesses and beat inline considerably.  */
1537 static stringop_algs znver2_memcpy[2] = {
1538   /* 32-bit tuning.  */
1539   {libcall, {{6, loop, false},
1540 	     {14, unrolled_loop, false},
1541 	     {-1, libcall, false}}},
1542   /* 64-bit tuning.  */
1543   {libcall, {{16, loop, false},
1544 	     {64, rep_prefix_4_byte, false},
1545 	     {-1, libcall, false}}}};
/* memset expansion strategy for the znver2 tuning.  It is referenced from
   znver2_cost and is also reused by the znver3_cost and znver4_cost tables
   in this file.  Element [0] is the 32-bit tuning, element [1] the 64-bit
   tuning.  */
1546 static stringop_algs znver2_memset[2] = {
1547   /* 32-bit tuning.  */
1548   {libcall, {{8, loop, false},
1549 	     {24, unrolled_loop, false},
1550 	     {128, rep_prefix_4_byte, false},
1551 	     {-1, libcall, false}}},
1552   /* 64-bit tuning.  */
1553   {libcall, {{24, rep_prefix_4_byte, false},
1554 	     {128, rep_prefix_8_byte, false},
1555 	     {-1, libcall, false}}}};
1556 
/* Processor cost table for the znver2 tuning.  The first brace group holds
   the register allocator move costs (relative to a reg-reg move of 2).  All
   entries are positional, so their order must not change; the comment on
   each line names the field it initializes.  */
1557 struct processor_costs znver2_cost = {
1558   {
1559   /* Start of register allocator costs.  integer->integer move cost is 2. */
1560 
1561   /* reg-reg moves are done by renaming and thus they are even cheaper than
1562      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1563      to doubles of latencies, we do not model this correctly.  It does not
1564      seem to make practical difference to bump prices up even more.  */
1565   6,					/* cost for loading QImode using
1566 					   movzbl.  */
1567   {6, 6, 6},				/* cost of loading integer registers
1568 					   in QImode, HImode and SImode.
1569 					   Relative to reg-reg move (2).  */
1570   {8, 8, 8},				/* cost of storing integer
1571 					   registers.  */
1572   2,					/* cost of reg,reg fld/fst.  */
1573   {6, 6, 16},				/* cost of loading fp registers
1574 					   in SFmode, DFmode and XFmode.  */
1575   {8, 8, 16},				/* cost of storing fp registers
1576 					   in SFmode, DFmode and XFmode.  */
1577   2,					/* cost of moving MMX register.  */
1578   {6, 6},				/* cost of loading MMX registers
1579 					   in SImode and DImode.  */
1580   {8, 8},				/* cost of storing MMX registers
1581 					   in SImode and DImode.  */
1582   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1583 					   register.  */
1584   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1585 					   in 32,64,128,256 and 512-bit.  */
1586   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1587 					   in 32,64,128,256 and 512-bit.  */
1588   6, 6,					/* SSE->integer and integer->SSE
1589 					   moves.  */
1590   8, 8,				/* mask->integer and integer->mask moves */
1591   {6, 6, 6},				/* cost of loading mask register
1592 					   in QImode, HImode, SImode.  */
1593   {8, 8, 8},				/* cost of storing mask register
1594 					   in QImode, HImode, SImode.  */
1595   2,					/* cost of moving mask register.  */
1596   /* End of register allocator costs.  */
1597   },
1598 
1599   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1600   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1601   COSTS_N_INSNS (1),			/* variable shift costs.  */
1602   COSTS_N_INSNS (1),			/* constant shift costs.  */
1603   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1604    COSTS_N_INSNS (3),			/* 				 HI.  */
1605    COSTS_N_INSNS (3),			/*				 SI.  */
1606    COSTS_N_INSNS (3),			/*				 DI.  */
1607    COSTS_N_INSNS (3)},			/*			other.  */
1608   0,					/* cost of multiply per each bit
1609 					   set.  */
1610    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1611       bound.  */
1612   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1613    COSTS_N_INSNS (22),			/* 			    HI.  */
1614    COSTS_N_INSNS (30),			/*			    SI.  */
1615    COSTS_N_INSNS (45),			/*			    DI.  */
1616    COSTS_N_INSNS (45)},			/*			    other.  */
1617   COSTS_N_INSNS (1),			/* cost of movsx.  */
1618   COSTS_N_INSNS (1),			/* cost of movzx.  */
1619   8,					/* "large" insn.  */
1620   9,					/* MOVE_RATIO.  */
1621   6,					/* CLEAR_RATIO */
1622   {6, 6, 6},				/* cost of loading integer registers
1623 					   in QImode, HImode and SImode.
1624 					   Relative to reg-reg move (2).  */
1625   {8, 8, 8},				/* cost of storing integer
1626 					   registers.  */
1627   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1628 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1629   {8, 8, 8, 8, 16},			/* cost of storing SSE register
1630 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1631   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
1632   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1633   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1634 					   register.  */
1635   6,					/* cost of moving SSE register to integer.  */
1636   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
1637      throughput 12.  Approx 9 uops do not depend on vector size and every load
1638      is 7 uops.  */
1639   18, 8,				/* Gather load static, per_elt.  */
1640   18, 10,				/* Gather store static, per_elt.  */
1641   32,					/* size of l1 cache.  */
1642   512,					/* size of l2 cache.  */
1643   64,					/* size of prefetch block.  */
1644   /* New AMD processors never drop prefetches; if they cannot be performed
1645      immediately, they are queued.  We set number of simultaneous prefetches
1646      to a large constant to reflect this (it probably is not a good idea not
1647      to limit number of prefetches at all, as their execution also takes some
1648      time).  */
1649   100,					/* number of parallel prefetches.  */
1650   3,					/* Branch cost.  */
1651   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1652   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1653   /* Latency of fdiv is 8-15.  */
1654   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1655   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1656   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1657   /* Latency of fsqrt is 4-10.  */
1658   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1659 
1660   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1661   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1662   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1663   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1664   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1665   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1666   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1667   /* 9-13.  */
1668   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1669   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1670   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1671   /* Zen can execute 4 integer operations per cycle.  FP operations
1672      take 3 cycles and it can execute 2 integer additions and 2
1673      multiplications thus reassociation may make sense up to a width of 6.
1674      SPEC2k6 benchmarks suggest
1675      that 4 works better than 6 probably due to register pressure.
1676 
1677      Integer vector operations are taken by FP unit and execute 3 vector
1678      plus/minus operations per cycle but only one multiply.  This is adjusted
1679      in ix86_reassociation_width.  */
1680   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1681   znver2_memcpy,
1682   znver2_memset,
1683   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1684   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1685   "16",					/* Loop alignment.  */
1686   "16",					/* Jump alignment.  */
1687   "0:0:8",				/* Label alignment.  */
1688   "16",					/* Func alignment.  */
1689 };
1690 
/* Processor cost table for the znver3 tuning.  It has no string operation
   tables of its own and reuses znver2_memcpy and znver2_memset.  The first
   brace group holds the register allocator move costs (relative to a
   reg-reg move of 2).  All entries are positional, so their order must not
   change; the comment on each line names the field it initializes.  */
1691 struct processor_costs znver3_cost = {
1692   {
1693   /* Start of register allocator costs.  integer->integer move cost is 2. */
1694 
1695   /* reg-reg moves are done by renaming and thus they are even cheaper than
1696      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1697      to doubles of latencies, we do not model this correctly.  It does not
1698      seem to make practical difference to bump prices up even more.  */
1699   6,					/* cost for loading QImode using
1700 					   movzbl.  */
1701   {6, 6, 6},				/* cost of loading integer registers
1702 					   in QImode, HImode and SImode.
1703 					   Relative to reg-reg move (2).  */
1704   {8, 8, 8},				/* cost of storing integer
1705 					   registers.  */
1706   2,					/* cost of reg,reg fld/fst.  */
1707   {6, 6, 16},				/* cost of loading fp registers
1708 					   in SFmode, DFmode and XFmode.  */
1709   {8, 8, 16},				/* cost of storing fp registers
1710 					   in SFmode, DFmode and XFmode.  */
1711   2,					/* cost of moving MMX register.  */
1712   {6, 6},				/* cost of loading MMX registers
1713 					   in SImode and DImode.  */
1714   {8, 8},				/* cost of storing MMX registers
1715 					   in SImode and DImode.  */
1716   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1717 					   register.  */
1718   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1719 					   in 32,64,128,256 and 512-bit.  */
1720   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1721 					   in 32,64,128,256 and 512-bit.  */
1722   6, 6,					/* SSE->integer and integer->SSE
1723 					   moves.  */
1724   8, 8,				/* mask->integer and integer->mask moves */
1725   {6, 6, 6},				/* cost of loading mask register
1726 					   in QImode, HImode, SImode.  */
1727   {8, 8, 8},				/* cost of storing mask register
1728 					   in QImode, HImode, SImode.  */
1729   2,					/* cost of moving mask register.  */
1730   /* End of register allocator costs.  */
1731   },
1732 
1733   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1734   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1735   COSTS_N_INSNS (1),			/* variable shift costs.  */
1736   COSTS_N_INSNS (1),			/* constant shift costs.  */
1737   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1738    COSTS_N_INSNS (3),			/* 				 HI.  */
1739    COSTS_N_INSNS (3),			/*				 SI.  */
1740    COSTS_N_INSNS (3),			/*				 DI.  */
1741    COSTS_N_INSNS (3)},			/*			other.  */
1742   0,					/* cost of multiply per each bit
1743 					   set.  */
1744   {COSTS_N_INSNS (9),			/* cost of a divide/mod for QI.  */
1745    COSTS_N_INSNS (10),			/* 			    HI.  */
1746    COSTS_N_INSNS (12),			/*			    SI.  */
1747    COSTS_N_INSNS (17),			/*			    DI.  */
1748    COSTS_N_INSNS (17)},			/*			    other.  */
1749   COSTS_N_INSNS (1),			/* cost of movsx.  */
1750   COSTS_N_INSNS (1),			/* cost of movzx.  */
1751   8,					/* "large" insn.  */
1752   9,					/* MOVE_RATIO.  */
1753   6,					/* CLEAR_RATIO */
1754   {6, 6, 6},				/* cost of loading integer registers
1755 					   in QImode, HImode and SImode.
1756 					   Relative to reg-reg move (2).  */
1757   {8, 8, 8},				/* cost of storing integer
1758 					   registers.  */
1759   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1760 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1761   {8, 8, 8, 8, 16},			/* cost of storing SSE register
1762 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1763   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
1764   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1765   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1766 					   register.  */
1767   6,					/* cost of moving SSE register to integer.  */
1768   /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1769      throughput 9.  Approx 7 uops do not depend on vector size and every load
1770      is 4 uops.  */
1771   14, 8,				/* Gather load static, per_elt.  */
1772   14, 10,				/* Gather store static, per_elt.  */
1773   32,					/* size of l1 cache.  */
1774   512,					/* size of l2 cache.  */
1775   64,					/* size of prefetch block.  */
1776   /* New AMD processors never drop prefetches; if they cannot be performed
1777      immediately, they are queued.  We set number of simultaneous prefetches
1778      to a large constant to reflect this (it probably is not a good idea not
1779      to limit number of prefetches at all, as their execution also takes some
1780      time).  */
1781   100,					/* number of parallel prefetches.  */
1782   3,					/* Branch cost.  */
1783   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1784   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1785   /* Latency of fdiv is 8-15.  */
1786   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1787   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1788   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1789   /* Latency of fsqrt is 4-10.  */
1790   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1791 
1792   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1793   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1794   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1795   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1796   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1797   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1798   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1799   /* 9-13.  */
1800   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1801   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1802   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1803   /* Zen can execute 4 integer operations per cycle.  FP operations
1804      take 3 cycles and it can execute 2 integer additions and 2
1805      multiplications thus reassociation may make sense up to a width of 6.
1806      SPEC2k6 benchmarks suggest
1807      that 4 works better than 6 probably due to register pressure.
1808 
1809      Integer vector operations are taken by FP unit and execute 3 vector
1810      plus/minus operations per cycle but only one multiply.  This is adjusted
1811      in ix86_reassociation_width.  */
1812   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1813   znver2_memcpy,
1814   znver2_memset,
1815   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1816   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1817   "16",					/* Loop alignment.  */
1818   "16",					/* Jump alignment.  */
1819   "0:0:8",				/* Label alignment.  */
1820   "16",					/* Func alignment.  */
1821 };
1822 
1823 /* Processor cost table for the znver4 tuning.  It was introduced as a
   replica of the znver3_cost table, but several entries now differ from it
   (for example the fp register load/store costs, the divide costs and the
   l2 cache size), so treat it as an independent table.  It still reuses
   znver2_memcpy and znver2_memset.  All entries are positional, so their
   order must not change.  */
1824 struct processor_costs znver4_cost = {
1825   {
1826   /* Start of register allocator costs.  integer->integer move cost is 2. */
1827 
1828   /* reg-reg moves are done by renaming and thus they are even cheaper than
1829      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1830      to doubles of latencies, we do not model this correctly.  It does not
1831      seem to make practical difference to bump prices up even more.  */
1832   6,					/* cost for loading QImode using
1833 					   movzbl.  */
1834   {6, 6, 6},				/* cost of loading integer registers
1835 					   in QImode, HImode and SImode.
1836 					   Relative to reg-reg move (2).  */
1837   {8, 8, 8},				/* cost of storing integer
1838 					   registers.  */
1839   2,					/* cost of reg,reg fld/fst.  */
1840   {14, 14, 17},				/* cost of loading fp registers
1841 					   in SFmode, DFmode and XFmode.  */
1842   {12, 12, 16},				/* cost of storing fp registers
1843 					   in SFmode, DFmode and XFmode.  */
1844   2,					/* cost of moving MMX register.  */
1845   {6, 6},				/* cost of loading MMX registers
1846 					   in SImode and DImode.  */
1847   {8, 8},				/* cost of storing MMX registers
1848 					   in SImode and DImode.  */
1849   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1850 					   register.  */
1851   {6, 6, 10, 10, 12},			/* cost of loading SSE registers
1852 					   in 32,64,128,256 and 512-bit.  */
1853   {8, 8, 8, 12, 12},			/* cost of storing SSE registers
1854 					   in 32,64,128,256 and 512-bit.  */
1855   6, 8,					/* SSE->integer and integer->SSE
1856 					   moves.  */
1857   8, 8,					/* mask->integer and integer->mask moves */
1858   {6, 6, 6},				/* cost of loading mask register
1859 					   in QImode, HImode, SImode.  */
1860   {8, 8, 8},				/* cost of storing mask register
1861 					   in QImode, HImode, SImode.  */
1862   2,					/* cost of moving mask register.  */
1863   /* End of register allocator costs.  */
1864   },
1865 
1866   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1867   /* TODO: Lea with 3 components has cost 2.  */
1868   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1869   COSTS_N_INSNS (1),			/* variable shift costs.  */
1870   COSTS_N_INSNS (1),			/* constant shift costs.  */
1871   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1872    COSTS_N_INSNS (3),			/* 				 HI.  */
1873    COSTS_N_INSNS (3),			/*				 SI.  */
1874    COSTS_N_INSNS (3),			/*				 DI.  */
1875    COSTS_N_INSNS (3)},			/*			other.  */
1876   0,					/* cost of multiply per each bit
1877 					   set.  */
1878   {COSTS_N_INSNS (12),			/* cost of a divide/mod for QI.  */
1879    COSTS_N_INSNS (13),			/* 			    HI.  */
1880    COSTS_N_INSNS (13),			/*			    SI.  */
1881    COSTS_N_INSNS (18),			/*			    DI.  */
1882    COSTS_N_INSNS (18)},			/*			    other.  */
1883   COSTS_N_INSNS (1),			/* cost of movsx.  */
1884   COSTS_N_INSNS (1),			/* cost of movzx.  */
1885   8,					/* "large" insn.  */
1886   9,					/* MOVE_RATIO.  */
1887   6,					/* CLEAR_RATIO */
1888   {6, 6, 6},				/* cost of loading integer registers
1889 					   in QImode, HImode and SImode.
1890 					   Relative to reg-reg move (2).  */
1891   {8, 8, 8},				/* cost of storing integer
1892 					   registers.  */
1893   {6, 6, 10, 10, 12},			/* cost of loading SSE registers
1894 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1895   {8, 8, 8, 12, 12},			/* cost of storing SSE register
1896 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1897   {6, 6, 6, 6, 6},			/* cost of unaligned loads.  */
1898   {8, 8, 8, 8, 8},			/* cost of unaligned stores.  */
1899   2, 2, 2,				/* cost of moving XMM,YMM,ZMM
1900 					   register.  NOTE(review): the register
					   allocator section of this table uses
					   2, 2, 3 for the same moves -- confirm
					   the difference is intended.  */
1901   6,					/* cost of moving SSE register to integer.  */
1902   /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1903      throughput 5.  Approx 7 uops do not depend on vector size and every load
1904      is 5 uops.  */
1905   14, 10,				/* Gather load static, per_elt.  */
1906   14, 20,				/* Gather store static, per_elt.  */
1907   32,					/* size of l1 cache.  */
1908   1024,					/* size of l2 cache.  */
1909   64,					/* size of prefetch block.  */
1910   /* New AMD processors never drop prefetches; if they cannot be performed
1911      immediately, they are queued.  We set number of simultaneous prefetches
1912      to a large constant to reflect this (it probably is not a good idea not
1913      to limit number of prefetches at all, as their execution also takes some
1914      time).  */
1915   100,					/* number of parallel prefetches.  */
1916   3,					/* Branch cost.  */
1917   COSTS_N_INSNS (7),			/* cost of FADD and FSUB insns.  */
1918   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
1919   /* Latency of fdiv is 8-15.  */
1920   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1921   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1922   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1923   /* Latency of fsqrt is 4-10.  NOTE(review): the cost used below is 25,
     which does not match this range -- confirm which one is intended.  */
1924   COSTS_N_INSNS (25),			/* cost of FSQRT instruction.  */
1925 
1926   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1927   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1928   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1929   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1930   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
1931   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
1932   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1933   /* 9-13.  */
1934   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1935   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1936   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
1937   /* Zen can execute 4 integer operations per cycle.  FP operations
1938      take 3 cycles and it can execute 2 integer additions and 2
1939      multiplications thus reassociation may make sense up to a width of 6.
1940      SPEC2k6 benchmarks suggest
1941      that 4 works better than 6 probably due to register pressure.
1942 
1943      Integer vector operations are taken by FP unit and execute 3 vector
1944      plus/minus operations per cycle but only one multiply.  This is adjusted
1945      in ix86_reassociation_width.  */
1946   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1947   znver2_memcpy,
1948   znver2_memset,
1949   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1950   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1951   "16",					/* Loop alignment.  */
1952   "16",					/* Jump alignment.  */
1953   "0:0:8",				/* Label alignment.  */
1954   "16",					/* Func alignment.  */
1955 };
1956 
1957 /* skylake_cost should produce code tuned for Skylake family of CPUs.  */
1958 static stringop_algs skylake_memcpy[2] =   {	/* memcpy strategy table referenced by skylake_cost; two variants (presumably 32-bit and 64-bit -- confirm) */
1959   {libcall,	/* leading algorithm; presumably the choice when the size is unknown -- confirm */
1960    {{256, rep_prefix_1_byte, true},	/* entries look like {size bound, algorithm, flag}; verify against the stringop_algs declaration */
1961     {256, loop, false},
1962     {-1, libcall, false}}},	/* -1 presumably means "no upper bound" -- confirm */
1963   {libcall,	/* second variant; identical contents to the first */
1964    {{256, rep_prefix_1_byte, true},
1965     {256, loop, false},
1966     {-1, libcall, false}}}};
1967 
1968 static stringop_algs skylake_memset[2] = {	/* memset strategy table referenced by skylake_cost; same contents as skylake_memcpy */
1969   {libcall,	/* leading algorithm; presumably the choice when the size is unknown -- confirm */
1970    {{256, rep_prefix_1_byte, true},	/* entries look like {size bound, algorithm, flag}; verify against the stringop_algs declaration */
1971     {256, loop, false},
1972     {-1, libcall, false}}},	/* -1 presumably means "no upper bound" -- confirm */
1973   {libcall,	/* second variant; identical contents to the first */
1974    {{256, rep_prefix_1_byte, true},
1975     {256, loop, false},
1976     {-1, libcall, false}}}};
1977 
1978 static const
1979 struct processor_costs skylake_cost = {	/* costs for tuning for the Skylake family of CPUs */
1980   {
1981   /* Start of register allocator costs.  integer->integer move cost is 2. */
1982   6,				     /* cost for loading QImode using movzbl */
1983   {4, 4, 4},				/* cost of loading integer registers
1984 					   in QImode, HImode and SImode.
1985 					   Relative to reg-reg move (2).  */
1986   {6, 6, 6},				/* cost of storing integer registers */
1987   2,					/* cost of reg,reg fld/fst */
1988   {6, 6, 8},				/* cost of loading fp registers
1989 					   in SFmode, DFmode and XFmode */
1990   {6, 6, 10},				/* cost of storing fp registers
1991 					   in SFmode, DFmode and XFmode */
1992   2,					/* cost of moving MMX register */
1993   {6, 6},				/* cost of loading MMX registers
1994 					   in SImode and DImode */
1995   {6, 6},				/* cost of storing MMX registers
1996 					   in SImode and DImode */
1997   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1998   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1999 					   in 32,64,128,256 and 512-bit */
2000   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
2001 					   in 32,64,128,256 and 512-bit */
2002   6, 6,				/* SSE->integer and integer->SSE moves */
2003   5, 5,				/* mask->integer and integer->mask moves */
2004   {8, 8, 8},				/* cost of loading mask register
2005 					   in QImode, HImode, SImode.  */
2006   {6, 6, 6},				/* cost of storing mask register
2007 					   in QImode, HImode, SImode.  */
2008   3,					/* cost of moving mask register.  */
2009   /* End of register allocator costs.  */
2010   },
2011 
2012   COSTS_N_INSNS (1),			/* cost of an add instruction */
2013   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction (one unit above an add) */
2014   COSTS_N_INSNS (1),			/* variable shift costs */
2015   COSTS_N_INSNS (1),			/* constant shift costs */
2016   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2017    COSTS_N_INSNS (4),			/*				 HI */
2018    COSTS_N_INSNS (3),			/*				 SI */
2019    COSTS_N_INSNS (3),			/*				 DI */
2020    COSTS_N_INSNS (3)},			/*			      other */
2021   0,					/* cost of multiply per each bit set */
2022   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2023      model is not realistic. We compensate by increasing the latencies a bit.  */
2024   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
2025    COSTS_N_INSNS (11),			/*			    HI */
2026    COSTS_N_INSNS (14),			/*			    SI */
2027    COSTS_N_INSNS (76),			/*			    DI */
2028    COSTS_N_INSNS (76)},			/*			    other */
2029   COSTS_N_INSNS (1),			/* cost of movsx */
2030   COSTS_N_INSNS (0),			/* cost of movzx (modelled as free) */
2031   8,					/* "large" insn */
2032   17,					/* MOVE_RATIO */
2033   17,					/* CLEAR_RATIO */
2034   {4, 4, 4},				/* cost of loading integer registers
2035 					   in QImode, HImode and SImode.
2036 					   Relative to reg-reg move (2).  */
2037   {6, 6, 6},				/* cost of storing integer registers */
2038   {6, 6, 6, 10, 20},			/* cost of loading SSE register
2039 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2040   {8, 8, 8, 8, 16},			/* cost of storing SSE register
2041 					   in 32bit, 64bit, 128bit, 256bit and 512bit; NOTE(review): 256/512-bit values differ from the register allocator store costs above -- confirm intended */
2042   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
2043   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
2044   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2045   6,					/* cost of moving SSE register to integer.  */
2046   20, 8,				/* Gather load static, per_elt.  */
2047   22, 10,				/* Gather store static, per_elt.  */
2048   64,					/* size of l1 cache.  */
2049   512,					/* size of l2 cache.  */
2050   64,					/* size of prefetch block */
2051   6,					/* number of parallel prefetches */
2052   3,					/* Branch cost */
2053   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2054   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2055   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2056   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2057   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2058   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
2059 
2060   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2061   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2062   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2063   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2064   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
2065   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
2066   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
2067   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
2068   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
2069   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2070   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2071   skylake_memcpy,			/* memcpy strategy table defined above */
2072   skylake_memset,			/* memset strategy table defined above */
2073   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2074   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2075   "16:11:8",				/* Loop alignment.  */
2076   "16:11:8",				/* Jump alignment.  */
2077   "0:0:8",				/* Label alignment.  */
2078   "16",					/* Func alignment.  */
2079 };
2080 
2081 /* icelake_cost should produce code tuned for Icelake family of CPUs.
2082    NB: rep_prefix_1_byte is used only for known size. */
2083 
2084 static stringop_algs icelake_memcpy[2] =   {	/* memcpy strategy table referenced by icelake_cost; two variants (presumably 32-bit and 64-bit -- confirm) */
2085   {libcall,	/* leading algorithm; presumably the choice when the size is unknown -- confirm */
2086    {{256, rep_prefix_1_byte, true},	/* entries look like {size bound, algorithm, flag}; verify against the stringop_algs declaration */
2087     {256, loop, false},
2088     {-1, libcall, false}}},	/* -1 presumably means "no upper bound" -- confirm */
2089   {libcall,	/* second variant; identical contents to the first */
2090    {{256, rep_prefix_1_byte, true},
2091     {256, loop, false},
2092     {-1, libcall, false}}}};
2093 
2094 static stringop_algs icelake_memset[2] = {	/* memset strategy table referenced by icelake_cost; same contents as icelake_memcpy */
2095   {libcall,	/* leading algorithm; presumably the choice when the size is unknown -- confirm */
2096    {{256, rep_prefix_1_byte, true},	/* entries look like {size bound, algorithm, flag}; verify against the stringop_algs declaration */
2097     {256, loop, false},
2098     {-1, libcall, false}}},	/* -1 presumably means "no upper bound" -- confirm */
2099   {libcall,	/* second variant; identical contents to the first */
2100    {{256, rep_prefix_1_byte, true},
2101     {256, loop, false},
2102     {-1, libcall, false}}}};
2103 
2104 static const
2105 struct processor_costs icelake_cost = {	/* costs for tuning for the Icelake family of CPUs */
2106   {
2107   /* Start of register allocator costs.  integer->integer move cost is 2. */
2108   6,				     /* cost for loading QImode using movzbl */
2109   {4, 4, 4},				/* cost of loading integer registers
2110 					   in QImode, HImode and SImode.
2111 					   Relative to reg-reg move (2).  */
2112   {6, 6, 6},				/* cost of storing integer registers */
2113   2,					/* cost of reg,reg fld/fst */
2114   {6, 6, 8},				/* cost of loading fp registers
2115 					   in SFmode, DFmode and XFmode */
2116   {6, 6, 10},				/* cost of storing fp registers
2117 					   in SFmode, DFmode and XFmode */
2118   2,					/* cost of moving MMX register */
2119   {6, 6},				/* cost of loading MMX registers
2120 					   in SImode and DImode */
2121   {6, 6},				/* cost of storing MMX registers
2122 					   in SImode and DImode */
2123   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2124   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
2125 					   in 32,64,128,256 and 512-bit */
2126   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
2127 					   in 32,64,128,256 and 512-bit */
2128   6, 6,				/* SSE->integer and integer->SSE moves */
2129   5, 5,				/* mask->integer and integer->mask moves */
2130   {8, 8, 8},				/* cost of loading mask register
2131 					   in QImode, HImode, SImode.  */
2132   {6, 6, 6},				/* cost of storing mask register
2133 					   in QImode, HImode, SImode.  */
2134   3,					/* cost of moving mask register.  */
2135   /* End of register allocator costs.  */
2136   },
2137 
2138   COSTS_N_INSNS (1),			/* cost of an add instruction */
2139   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction (one unit above an add) */
2140   COSTS_N_INSNS (1),			/* variable shift costs */
2141   COSTS_N_INSNS (1),			/* constant shift costs */
2142   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2143    COSTS_N_INSNS (4),			/*				 HI */
2144    COSTS_N_INSNS (3),			/*				 SI */
2145    COSTS_N_INSNS (3),			/*				 DI */
2146    COSTS_N_INSNS (3)},			/*			      other */
2147   0,					/* cost of multiply per each bit set */
2148   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2149      model is not realistic. We compensate by increasing the latencies a bit.  */
2150   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
2151    COSTS_N_INSNS (11),			/*			    HI */
2152    COSTS_N_INSNS (14),			/*			    SI */
2153    COSTS_N_INSNS (76),			/*			    DI */
2154    COSTS_N_INSNS (76)},			/*			    other */
2155   COSTS_N_INSNS (1),			/* cost of movsx */
2156   COSTS_N_INSNS (0),			/* cost of movzx (modelled as free) */
2157   8,					/* "large" insn */
2158   17,					/* MOVE_RATIO */
2159   17,					/* CLEAR_RATIO */
2160   {4, 4, 4},				/* cost of loading integer registers
2161 					   in QImode, HImode and SImode.
2162 					   Relative to reg-reg move (2).  */
2163   {6, 6, 6},				/* cost of storing integer registers */
2164   {6, 6, 6, 10, 20},			/* cost of loading SSE register
2165 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2166   {8, 8, 8, 8, 16},			/* cost of storing SSE register
2167 					   in 32bit, 64bit, 128bit, 256bit and 512bit; NOTE(review): 256/512-bit values differ from the register allocator store costs above -- confirm intended */
2168   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
2169   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
2170   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2171   6,					/* cost of moving SSE register to integer.  */
2172   20, 8,				/* Gather load static, per_elt.  */
2173   22, 10,				/* Gather store static, per_elt.  */
2174   64,					/* size of l1 cache.  */
2175   512,					/* size of l2 cache.  */
2176   64,					/* size of prefetch block */
2177   6,					/* number of parallel prefetches */
2178   3,					/* Branch cost */
2179   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2180   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2181   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2182   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2183   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2184   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
2185 
2186   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2187   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2188   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2189   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2190   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
2191   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
2192   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
2193   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
2194   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
2195   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2196   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2197   icelake_memcpy,			/* memcpy strategy table defined above */
2198   icelake_memset,			/* memset strategy table defined above */
2199   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2200   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2201   "16:11:8",				/* Loop alignment.  */
2202   "16:11:8",				/* Jump alignment.  */
2203   "0:0:8",				/* Label alignment.  */
2204   "16",					/* Func alignment.  */
2205 };
2206 
2207 /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
2208 static stringop_algs alderlake_memcpy[2] = {	/* memcpy strategy table referenced by alderlake_cost; two variants (presumably 32-bit and 64-bit -- confirm) */
2209   {libcall,	/* leading algorithm; presumably the choice when the size is unknown -- confirm */
2210    {{256, rep_prefix_1_byte, true},	/* entries look like {size bound, algorithm, flag}; verify against the stringop_algs declaration */
2211     {256, loop, false},
2212     {-1, libcall, false}}},	/* -1 presumably means "no upper bound" -- confirm */
2213   {libcall,	/* second variant; identical contents to the first */
2214    {{256, rep_prefix_1_byte, true},
2215     {256, loop, false},
2216     {-1, libcall, false}}}};
2217 static stringop_algs alderlake_memset[2] = {	/* memset strategy table referenced by alderlake_cost; same contents as alderlake_memcpy */
2218   {libcall,	/* leading algorithm; presumably the choice when the size is unknown -- confirm */
2219    {{256, rep_prefix_1_byte, true},	/* entries look like {size bound, algorithm, flag}; verify against the stringop_algs declaration */
2220     {256, loop, false},
2221     {-1, libcall, false}}},	/* -1 presumably means "no upper bound" -- confirm */
2222   {libcall,	/* second variant; identical contents to the first */
2223    {{256, rep_prefix_1_byte, true},
2224     {256, loop, false},
2225     {-1, libcall, false}}}};
2226 static const
2227 struct processor_costs alderlake_cost = {	/* costs for tuning for the alderlake family of CPUs */
2228   {
2229   /* Start of register allocator costs.  integer->integer move cost is 2.  */
2230   6,				     /* cost for loading QImode using movzbl */
2231   {6, 6, 6},				/* cost of loading integer registers
2232 					   in QImode, HImode and SImode.
2233 					   Relative to reg-reg move (2).  */
2234   {6, 6, 6},				/* cost of storing integer registers */
2235   4,					/* cost of reg,reg fld/fst */
2236   {6, 6, 12},				/* cost of loading fp registers
2237 					   in SFmode, DFmode and XFmode */
2238   {6, 6, 12},				/* cost of storing fp registers
2239 					   in SFmode, DFmode and XFmode */
2240   2,					/* cost of moving MMX register */
2241   {6, 6},				/* cost of loading MMX registers
2242 					   in SImode and DImode */
2243   {6, 6},				/* cost of storing MMX registers
2244 					   in SImode and DImode */
2245   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2246   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
2247 					   in 32,64,128,256 and 512-bit */
2248   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
2249 					   in 32,64,128,256 and 512-bit */
2250   6, 6,				/* SSE->integer and integer->SSE moves */
2251   6, 6,				/* mask->integer and integer->mask moves */
2252   {6, 6, 6},				/* cost of loading mask register
2253 					   in QImode, HImode, SImode.  */
2254   {6, 6, 6},			/* cost of storing mask register
2255 					   in QImode, HImode, SImode.  */
2256   2,					/* cost of moving mask register.  */
2257   /* End of register allocator costs.  */
2258   },
2259 
2260   COSTS_N_INSNS (1),			/* cost of an add instruction */
2261   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction (one unit above an add) */
2262   COSTS_N_INSNS (1),			/* variable shift costs */
2263   COSTS_N_INSNS (1),			/* constant shift costs */
2264   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2265    COSTS_N_INSNS (4),			/*				 HI */
2266    COSTS_N_INSNS (3),			/*				 SI */
2267    COSTS_N_INSNS (4),			/*				 DI */
2268    COSTS_N_INSNS (4)},			/*			      other */
2269   0,					/* cost of multiply per each bit set */
2270   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
2271    COSTS_N_INSNS (22),			/*			    HI */
2272    COSTS_N_INSNS (30),			/*			    SI */
2273    COSTS_N_INSNS (74),			/*			    DI */
2274    COSTS_N_INSNS (74)},			/*			    other */
2275   COSTS_N_INSNS (1),			/* cost of movsx */
2276   COSTS_N_INSNS (1),			/* cost of movzx */
2277   8,					/* "large" insn */
2278   17,					/* MOVE_RATIO */
2279   17,					/* CLEAR_RATIO */
2280   {6, 6, 6},				/* cost of loading integer registers
2281 					   in QImode, HImode and SImode.
2282 					   Relative to reg-reg move (2).  */
2283   {6, 6, 6},				/* cost of storing integer registers */
2284   {6, 6, 6, 10, 15},			/* cost of loading SSE register
2285 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2286   {6, 6, 6, 10, 15},			/* cost of storing SSE register
2287 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2288   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
2289   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
2290   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2291   6,					/* cost of moving SSE register to integer.  */
2292   18, 6,				/* Gather load static, per_elt.  */
2293   18, 6,				/* Gather store static, per_elt.  */
2294   32,					/* size of l1 cache.  */
2295   512,					/* size of l2 cache.  */
2296   64,					/* size of prefetch block */
2297   6,					/* number of parallel prefetches */
2298   3,					/* Branch cost */
2299   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2300   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2301   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
2302   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2303   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2304   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
2305 
2306   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2307   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2308   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2309   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2310   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2311   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2312   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2313   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2314   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2315   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2316   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
2317   alderlake_memcpy,			/* memcpy strategy table defined above */
2318   alderlake_memset,			/* memset strategy table defined above */
2319   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
2320   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
2321   "16:11:8",				/* Loop alignment.  */
2322   "16:11:8",				/* Jump alignment.  */
2323   "0:0:8",				/* Label alignment.  */
2324   "16",					/* Func alignment.  */
2325 };
2326 
2327   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
2328      very small blocks it is better to use loop. For large blocks, libcall can
2329      do non-temporal accesses and beat inline considerably.  */
2330 static stringop_algs btver1_memcpy[2] = {	/* memcpy strategy table referenced by btver1_cost; two variants (presumably 32-bit and 64-bit -- confirm) */
2331   {libcall, {{6, loop, false}, {14, unrolled_loop, false},	/* first variant: loop, then unrolled loop, as the size bound grows */
2332              {-1, rep_prefix_4_byte, false}}},	/* -1 presumably means "no upper bound" -- confirm */
2333   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},	/* second variant: loop, then 8-byte rep prefix up to 8192 */
2334              {-1, libcall, false}}}};	/* everything larger goes to the library call */
2335 static stringop_algs btver1_memset[2] = {	/* memset strategy table referenced by btver1_cost; two variants (presumably 32-bit and 64-bit -- confirm) */
2336   {libcall, {{8, loop, false}, {24, unrolled_loop, false},	/* first variant: loop, then unrolled loop, as the size bound grows */
2337              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* 4-byte rep prefix up to 2048, library call beyond (-1 presumably unbounded -- confirm) */
2338   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},	/* second variant: unrolled loop, then 8-byte rep prefix up to 8192 */
2339              {-1, libcall, false}}}};	/* everything larger goes to the library call */
2340 const struct processor_costs btver1_cost = {	/* cost table for btver1 tuning; uses btver1_memcpy and btver1_memset */
2341   {
2342   /* Start of register allocator costs.  integer->integer move cost is 2. */
2343   8,				     /* cost for loading QImode using movzbl */
2344   {6, 8, 6},				/* cost of loading integer registers
2345 					   in QImode, HImode and SImode.
2346 					   Relative to reg-reg move (2).  */
2347   {6, 8, 6},				/* cost of storing integer registers */
2348   4,					/* cost of reg,reg fld/fst */
2349   {12, 12, 28},				/* cost of loading fp registers
2350 					   in SFmode, DFmode and XFmode */
2351   {12, 12, 38},				/* cost of storing fp registers
2352 					   in SFmode, DFmode and XFmode */
2353   4,					/* cost of moving MMX register */
2354   {10, 10},				/* cost of loading MMX registers
2355 					   in SImode and DImode */
2356   {12, 12},				/* cost of storing MMX registers
2357 					   in SImode and DImode */
2358   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2359   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
2360 					   in 32,64,128,256 and 512-bit */
2361   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
2362 					   in 32,64,128,256 and 512-bit */
2363   14, 14,				/* SSE->integer and integer->SSE moves */
2364   14, 14,				/* mask->integer and integer->mask moves */
2365   {6, 8, 6},				/* cost of loading mask register
2366 					   in QImode, HImode, SImode.  */
2367   {6, 8, 6},				/* cost of storing mask register
2368 					   in QImode, HImode, SImode.  */
2369   2,					/* cost of moving mask register.  */
2370   /* End of register allocator costs.  */
2371   },
2372 
2373   COSTS_N_INSNS (1),			/* cost of an add instruction */
2374   COSTS_N_INSNS (2),			/* cost of a lea instruction */
2375   COSTS_N_INSNS (1),			/* variable shift costs */
2376   COSTS_N_INSNS (1),			/* constant shift costs */
2377   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2378    COSTS_N_INSNS (4),			/*				 HI */
2379    COSTS_N_INSNS (3),			/*				 SI */
2380    COSTS_N_INSNS (4),			/*				 DI */
2381    COSTS_N_INSNS (5)},			/*			      other */
2382   0,					/* cost of multiply per each bit set */
2383   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
2384    COSTS_N_INSNS (35),			/*			    HI */
2385    COSTS_N_INSNS (51),			/*			    SI */
2386    COSTS_N_INSNS (83),			/*			    DI */
2387    COSTS_N_INSNS (83)},			/*			    other */
2388   COSTS_N_INSNS (1),			/* cost of movsx */
2389   COSTS_N_INSNS (1),			/* cost of movzx */
2390   8,					/* "large" insn */
2391   9,					/* MOVE_RATIO */
2392   6,					/* CLEAR_RATIO */
2393   {6, 8, 6},				/* cost of loading integer registers
2394 					   in QImode, HImode and SImode.
2395 					   Relative to reg-reg move (2).  */
2396   {6, 8, 6},				/* cost of storing integer registers */
2397   {10, 10, 12, 48, 96},			/* cost of loading SSE register
2398 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2399   {10, 10, 12, 48, 96},			/* cost of storing SSE register
2400 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2401   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
2402   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
2403   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2404   14,					/* cost of moving SSE register to integer.  */
2405   10, 10,				/* Gather load static, per_elt.  */
2406   10, 10,				/* Gather store static, per_elt.  */
2407   32,					/* size of l1 cache.  */
2408   512,					/* size of l2 cache.  */
2409   64,					/* size of prefetch block */
2410   100,					/* number of parallel prefetches */
2411   2,					/* Branch cost */
2412   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
2413   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2414   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
2415   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2416   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2417   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
2418 
2419   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2420   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2421   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
2422   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2423   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2424   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2425   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2426   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2427   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2428   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
2429   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2430   btver1_memcpy,			/* memcpy strategy table defined above */
2431   btver1_memset,			/* memset strategy table defined above */
2432   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
2433   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2434   "16:11:8",				/* Loop alignment.  */
2435   "16:8:8",				/* Jump alignment.  */
2436   "0:0:8",				/* Label alignment.  */
2437   "11",					/* Func alignment.  */
2438 };
2439 
2440 static stringop_algs btver2_memcpy[2] = {	/* memcpy strategy table referenced by btver2_cost; same contents as btver1_memcpy */
2441   {libcall, {{6, loop, false}, {14, unrolled_loop, false},	/* first variant: loop, then unrolled loop, as the size bound grows */
2442              {-1, rep_prefix_4_byte, false}}},	/* -1 presumably means "no upper bound" -- confirm */
2443   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},	/* second variant: loop, then 8-byte rep prefix up to 8192 */
2444              {-1, libcall, false}}}};	/* everything larger goes to the library call */
2445 static stringop_algs btver2_memset[2] = {	/* memset strategy table referenced by btver2_cost; same contents as btver1_memset */
2446   {libcall, {{8, loop, false}, {24, unrolled_loop, false},	/* first variant: loop, then unrolled loop, as the size bound grows */
2447              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* 4-byte rep prefix up to 2048, library call beyond (-1 presumably unbounded -- confirm) */
2448   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},	/* second variant: unrolled loop, then 8-byte rep prefix up to 8192 */
2449              {-1, libcall, false}}}};	/* everything larger goes to the library call */
2450 const struct processor_costs btver2_cost = {	/* cost table for btver2 tuning; uses btver2_memcpy and btver2_memset */
2451   {
2452   /* Start of register allocator costs.  integer->integer move cost is 2. */
2453   8,				     /* cost for loading QImode using movzbl */
2454   {8, 8, 6},				/* cost of loading integer registers
2455 					   in QImode, HImode and SImode.
2456 					   Relative to reg-reg move (2).  */
2457   {8, 8, 6},				/* cost of storing integer registers */
2458   4,					/* cost of reg,reg fld/fst */
2459   {12, 12, 28},				/* cost of loading fp registers
2460 					   in SFmode, DFmode and XFmode */
2461   {12, 12, 38},				/* cost of storing fp registers
2462 					   in SFmode, DFmode and XFmode */
2463   4,					/* cost of moving MMX register */
2464   {10, 10},				/* cost of loading MMX registers
2465 					   in SImode and DImode */
2466   {12, 12},				/* cost of storing MMX registers
2467 					   in SImode and DImode */
2468   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2469   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
2470 					   in 32,64,128,256 and 512-bit */
2471   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
2472 					   in 32,64,128,256 and 512-bit */
2473   14, 14,				/* SSE->integer and integer->SSE moves */
2474   14, 14,				/* mask->integer and integer->mask moves */
2475   {8, 8, 6},				/* cost of loading mask register
2476 					   in QImode, HImode, SImode.  */
2477   {8, 8, 6},				/* cost of storing mask register
2478 					   in QImode, HImode, SImode.  */
2479   2,					/* cost of moving mask register.  */
2480   /* End of register allocator costs.  */
2481   },
2482 
2483   COSTS_N_INSNS (1),			/* cost of an add instruction */
2484   COSTS_N_INSNS (2),			/* cost of a lea instruction */
2485   COSTS_N_INSNS (1),			/* variable shift costs */
2486   COSTS_N_INSNS (1),			/* constant shift costs */
2487   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2488    COSTS_N_INSNS (4),			/*				 HI */
2489    COSTS_N_INSNS (3),			/*				 SI */
2490    COSTS_N_INSNS (4),			/*				 DI */
2491    COSTS_N_INSNS (5)},			/*			      other */
2492   0,					/* cost of multiply per each bit set */
2493   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
2494    COSTS_N_INSNS (35),			/*			    HI */
2495    COSTS_N_INSNS (51),			/*			    SI */
2496    COSTS_N_INSNS (83),			/*			    DI */
2497    COSTS_N_INSNS (83)},			/*			    other */
2498   COSTS_N_INSNS (1),			/* cost of movsx */
2499   COSTS_N_INSNS (1),			/* cost of movzx */
2500   8,					/* "large" insn */
2501   9,					/* MOVE_RATIO */
2502   6,					/* CLEAR_RATIO */
2503   {8, 8, 6},				/* cost of loading integer registers
2504 					   in QImode, HImode and SImode.
2505 					   Relative to reg-reg move (2).  */
2506   {8, 8, 6},				/* cost of storing integer registers */
2507   {10, 10, 12, 48, 96},			/* cost of loading SSE register
2508 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2509   {10, 10, 12, 48, 96},			/* cost of storing SSE register
2510 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2511   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
2512   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
2513   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2514   14,					/* cost of moving SSE register to integer.  */
2515   10, 10,				/* Gather load static, per_elt.  */
2516   10, 10,				/* Gather store static, per_elt.  */
2517   32,					/* size of l1 cache.  */
2518   2048,					/* size of l2 cache.  */
2519   64,					/* size of prefetch block */
2520   100,					/* number of parallel prefetches */
2521   2,					/* Branch cost */
2522   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
2523   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2524   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
2525   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2526   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2527   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
2528 
2529   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2530   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2531   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
2532   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2533   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2534   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2535   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2536   COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
2537   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
2538   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
2539   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2540   btver2_memcpy,			/* memcpy strategy table defined above */
2541   btver2_memset,			/* memset strategy table defined above */
2542   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
2543   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2544   "16:11:8",				/* Loop alignment.  */
2545   "16:8:8",				/* Jump alignment.  */
2546   "0:0:8",				/* Label alignment.  */
2547   "11",					/* Func alignment.  */
2548 };
2549 
2550 static stringop_algs pentium4_memcpy[2] = {	/* memcpy strategy table for Pentium 4 tuning; presumably referenced by pentium4_cost -- confirm */
2551   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},	/* byte loop up to bound 12, 4-byte rep prefix beyond (-1 presumably unbounded -- confirm) */
2552   DUMMY_STRINGOP_ALGS};	/* second variant is the libcall-only placeholder macro */
2553 static stringop_algs pentium4_memset[2] = {	/* memset strategy table for Pentium 4 tuning; presumably referenced by pentium4_cost -- confirm */
2554   {libcall, {{6, loop_1_byte, false}, {48, loop, false},	/* byte loop, then loop, as the size bound grows */
2555              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* 4-byte rep prefix up to 20480, library call beyond (-1 presumably unbounded -- confirm) */
2556   DUMMY_STRINGOP_ALGS};	/* second variant is the libcall-only placeholder macro */
2557 
2558 static const
2559 struct processor_costs pentium4_cost = {
2560   {
2561   /* Start of register allocator costs.  integer->integer move cost is 2. */
2562   5,				     /* cost for loading QImode using movzbl */
2563   {4, 5, 4},				/* cost of loading integer registers
2564 					   in QImode, HImode and SImode.
2565 					   Relative to reg-reg move (2).  */
2566   {2, 3, 2},				/* cost of storing integer registers */
2567   12,					/* cost of reg,reg fld/fst */
2568   {14, 14, 14},				/* cost of loading fp registers
2569 					   in SFmode, DFmode and XFmode */
2570   {14, 14, 14},				/* cost of storing fp registers
2571 					   in SFmode, DFmode and XFmode */
2572   12,					/* cost of moving MMX register */
2573   {16, 16},				/* cost of loading MMX registers
2574 					   in SImode and DImode */
2575   {16, 16},				/* cost of storing MMX registers
2576 					   in SImode and DImode */
2577   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
2578   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
2579 					   in 32,64,128,256 and 512-bit */
2580   {16, 16, 16, 32, 64},			/* cost of storing SSE registers
2581 					   in 32,64,128,256 and 512-bit */
2582   20, 12,				/* SSE->integer and integer->SSE moves */
2583   20, 12,				/* mask->integer and integer->mask moves */
2584   {4, 5, 4},				/* cost of loading mask register
2585 					   in QImode, HImode, SImode.  */
2586   {2, 3, 2},				/* cost if storing mask register
2587 					   in QImode, HImode, SImode.  */
2588   2,					/* cost of moving mask register.  */
2589   /* End of register allocator costs.  */
2590   },
2591 
2592   COSTS_N_INSNS (1),			/* cost of an add instruction */
2593   COSTS_N_INSNS (3),			/* cost of a lea instruction */
2594   COSTS_N_INSNS (4),			/* variable shift costs */
2595   COSTS_N_INSNS (4),			/* constant shift costs */
2596   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
2597    COSTS_N_INSNS (15),			/*				 HI */
2598    COSTS_N_INSNS (15),			/*				 SI */
2599    COSTS_N_INSNS (15),			/*				 DI */
2600    COSTS_N_INSNS (15)},			/*			      other */
2601   0,					/* cost of multiply per each bit set */
2602   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
2603    COSTS_N_INSNS (56),			/*			    HI */
2604    COSTS_N_INSNS (56),			/*			    SI */
2605    COSTS_N_INSNS (56),			/*			    DI */
2606    COSTS_N_INSNS (56)},			/*			    other */
2607   COSTS_N_INSNS (1),			/* cost of movsx */
2608   COSTS_N_INSNS (1),			/* cost of movzx */
2609   16,					/* "large" insn */
2610   6,					/* MOVE_RATIO */
2611   6,					/* CLEAR_RATIO */
2612   {4, 5, 4},				/* cost of loading integer registers
2613 					   in QImode, HImode and SImode.
2614 					   Relative to reg-reg move (2).  */
2615   {2, 3, 2},				/* cost of storing integer registers */
2616   {16, 16, 16, 32, 64},			/* cost of loading SSE register
2617 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2618   {16, 16, 16, 32, 64},			/* cost of storing SSE register
2619 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2620   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
2621   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
2622   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
2623   20,					/* cost of moving SSE register to integer.  */
2624   16, 16,				/* Gather load static, per_elt.  */
2625   16, 16,				/* Gather store static, per_elt.  */
2626   8,					/* size of l1 cache.  */
2627   256,					/* size of l2 cache.  */
2628   64,					/* size of prefetch block */
2629   6,					/* number of parallel prefetches */
2630   2,					/* Branch cost */
2631   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
2632   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
2633   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
2634   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2635   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2636   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
2637 
2638   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
2639   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2640   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
2641   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
2642   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2643   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2644   COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
2645   COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
2646   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
2647   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
2648   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2649   pentium4_memcpy,
2650   pentium4_memset,
2651   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2652   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2653   NULL,					/* Loop alignment.  */
2654   NULL,					/* Jump alignment.  */
2655   NULL,					/* Label alignment.  */
2656   NULL,					/* Func alignment.  */
2657 };
2658 
/* memcpy strategy table referenced by nocona_cost.  Each of the two
   elements holds a leading algorithm followed by {N, algorithm, flag}
   triples, the last of which has N == -1.
   NOTE(review): presumably element 0 is used for 32-bit and element 1 for
   64-bit code, and N is an upper size bound -- confirm against the
   stringop_algs declaration.  */
2659 static stringop_algs nocona_memcpy[2] = {
2660   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2661   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2662              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2663 
/* memset strategy table referenced by nocona_cost.  Same layout as the
   memcpy table: two elements, each a leading algorithm plus
   {N, algorithm, flag} triples ending with N == -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2664 static stringop_algs nocona_memset[2] = {
2665   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2666              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2667   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2668              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2669 
/* Cost table for the nocona tuning target.  The inner braced group holds
   the register allocator costs; all remaining fields follow positionally,
   so the entries must stay in the declaration order of
   struct processor_costs.  */
2670 static const
2671 struct processor_costs nocona_cost = {
2672   {
2673   /* Start of register allocator costs.  integer->integer move cost is 2. */
2674   4,				     /* cost for loading QImode using movzbl */
2675   {4, 4, 4},				/* cost of loading integer registers
2676 					   in QImode, HImode and SImode.
2677 					   Relative to reg-reg move (2).  */
2678   {4, 4, 4},				/* cost of storing integer registers */
2679   12,					/* cost of reg,reg fld/fst */
2680   {14, 14, 14},				/* cost of loading fp registers
2681 					   in SFmode, DFmode and XFmode */
2682   {14, 14, 14},				/* cost of storing fp registers
2683 					   in SFmode, DFmode and XFmode */
2684   14,					/* cost of moving MMX register */
2685   {12, 12},				/* cost of loading MMX registers
2686 					   in SImode and DImode */
2687   {12, 12},				/* cost of storing MMX registers
2688 					   in SImode and DImode */
2689   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
2690   {12, 12, 12, 24, 48},			/* cost of loading SSE registers
2691 					   in 32,64,128,256 and 512-bit */
2692   {12, 12, 12, 24, 48},			/* cost of storing SSE registers
2693 					   in 32,64,128,256 and 512-bit */
2694   20, 12,				/* SSE->integer and integer->SSE moves */
2695   20, 12,				/* mask->integer and integer->mask moves */
2696   {4, 4, 4},				/* cost of loading mask register
2697 					   in QImode, HImode, SImode.  */
2698   {4, 4, 4},				/* cost of storing mask register
2699 					   in QImode, HImode, SImode.  */
2700   2,					/* cost of moving mask register.  */
2701   /* End of register allocator costs.  */
2702   },
2703 
2704   COSTS_N_INSNS (1),			/* cost of an add instruction */
2705   COSTS_N_INSNS (1),			/* cost of a lea instruction */
2706   COSTS_N_INSNS (1),			/* variable shift costs */
2707   COSTS_N_INSNS (1),			/* constant shift costs */
2708   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
2709    COSTS_N_INSNS (10),			/*				 HI */
2710    COSTS_N_INSNS (10),			/*				 SI */
2711    COSTS_N_INSNS (10),			/*				 DI */
2712    COSTS_N_INSNS (10)},			/*			      other */
2713   0,					/* cost of multiply per each bit set */
2714   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
2715    COSTS_N_INSNS (66),			/*			    HI */
2716    COSTS_N_INSNS (66),			/*			    SI */
2717    COSTS_N_INSNS (66),			/*			    DI */
2718    COSTS_N_INSNS (66)},			/*			    other */
2719   COSTS_N_INSNS (1),			/* cost of movsx */
2720   COSTS_N_INSNS (1),			/* cost of movzx */
2721   16,					/* "large" insn */
2722   17,					/* MOVE_RATIO */
2723   6,					/* CLEAR_RATIO */
2724   {4, 4, 4},				/* cost of loading integer registers
2725 					   in QImode, HImode and SImode.
2726 					   Relative to reg-reg move (2).  */
2727   {4, 4, 4},				/* cost of storing integer registers */
2728   {12, 12, 12, 24, 48},			/* cost of loading SSE register
2729 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2730   {12, 12, 12, 24, 48},			/* cost of storing SSE register
2731 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2732   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
2733   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
2734   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
2735   20,					/* cost of moving SSE register to integer.  */
2736   12, 12,				/* Gather load static, per_elt.  */
2737   12, 12,				/* Gather store static, per_elt.  */
2738   8,					/* size of l1 cache.  */
2739   1024,					/* size of l2 cache.  */
2740   64,					/* size of prefetch block */
2741   8,					/* number of parallel prefetches */
2742   1,					/* Branch cost */
2743   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
2744   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2745   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
2746   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
2747   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
2748   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
2749 
2750   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
2751   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2752   COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
2753   COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
2754   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
2755   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
2756   COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
2757   COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
2758   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
2759   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
2760   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2761   nocona_memcpy,
2762   nocona_memset,
2763   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2764   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2765   NULL,					/* Loop alignment.  */
2766   NULL,					/* Jump alignment.  */
2767   NULL,					/* Label alignment.  */
2768   NULL,					/* Func alignment.  */
2769 };
2770 
/* memcpy strategy table referenced by atom_cost.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2771 static stringop_algs atom_memcpy[2] = {
2772   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2773   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2774              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by atom_cost.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2775 static stringop_algs atom_memset[2] = {
2776   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2777              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2778   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2779              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for the atom tuning target.  The inner braced group holds the
   register allocator costs; all remaining fields follow positionally, so
   the entries must stay in the declaration order of
   struct processor_costs.  */
2780 static const
2781 struct processor_costs atom_cost = {
2782   {
2783   /* Start of register allocator costs.  integer->integer move cost is 2. */
2784   6,					/* cost for loading QImode using movzbl */
2785   {6, 6, 6},				/* cost of loading integer registers
2786 					   in QImode, HImode and SImode.
2787 					   Relative to reg-reg move (2).  */
2788   {6, 6, 6},				/* cost of storing integer registers */
2789   4,					/* cost of reg,reg fld/fst */
2790   {6, 6, 18},				/* cost of loading fp registers
2791 					   in SFmode, DFmode and XFmode */
2792   {14, 14, 24},				/* cost of storing fp registers
2793 					   in SFmode, DFmode and XFmode */
2794   2,					/* cost of moving MMX register */
2795   {8, 8},				/* cost of loading MMX registers
2796 					   in SImode and DImode */
2797   {10, 10},				/* cost of storing MMX registers
2798 					   in SImode and DImode */
2799   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2800   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2801 					   in 32,64,128,256 and 512-bit */
2802   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2803 					   in 32,64,128,256 and 512-bit */
2804   8, 6,				/* SSE->integer and integer->SSE moves */
2805   8, 6,				/* mask->integer and integer->mask moves */
2806   {6, 6, 6},				/* cost of loading mask register
2807 					   in QImode, HImode, SImode.  */
2808   {6, 6, 6},			/* cost of storing mask register
2809 					   in QImode, HImode, SImode.  */
2810   2,					/* cost of moving mask register.  */
2811   /* End of register allocator costs.  */
2812   },
2813 
2814   COSTS_N_INSNS (1),			/* cost of an add instruction */
2815   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2816   COSTS_N_INSNS (1),			/* variable shift costs */
2817   COSTS_N_INSNS (1),			/* constant shift costs */
2818   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2819    COSTS_N_INSNS (4),			/*				 HI */
2820    COSTS_N_INSNS (3),			/*				 SI */
2821    COSTS_N_INSNS (4),			/*				 DI */
2822    COSTS_N_INSNS (2)},			/*			      other */
2823   0,					/* cost of multiply per each bit set */
2824   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2825    COSTS_N_INSNS (26),			/*			    HI */
2826    COSTS_N_INSNS (42),			/*			    SI */
2827    COSTS_N_INSNS (74),			/*			    DI */
2828    COSTS_N_INSNS (74)},			/*			    other */
2829   COSTS_N_INSNS (1),			/* cost of movsx */
2830   COSTS_N_INSNS (1),			/* cost of movzx */
2831   8,					/* "large" insn */
2832   17,					/* MOVE_RATIO */
2833   6,					/* CLEAR_RATIO */
2834   {6, 6, 6},				/* cost of loading integer registers
2835 					   in QImode, HImode and SImode.
2836 					   Relative to reg-reg move (2).  */
2837   {6, 6, 6},				/* cost of storing integer registers */
2838   {8, 8, 8, 16, 32},			/* cost of loading SSE register
2839 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2840   {8, 8, 8, 16, 32},			/* cost of storing SSE register
2841 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2842   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2843   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2844   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2845   8,					/* cost of moving SSE register to integer.  */
2846   8, 8,					/* Gather load static, per_elt.  */
2847   8, 8,					/* Gather store static, per_elt.  */
2848   32,					/* size of l1 cache.  */
2849   256,					/* size of l2 cache.  */
2850   64,					/* size of prefetch block */
2851   6,					/* number of parallel prefetches */
2852   3,					/* Branch cost */
2853   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2854   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2855   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2856   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2857   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2858   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2859 
2860   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2861   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2862   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2863   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2864   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2865   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2866   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
2867   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
2868   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
2869   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
2870   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2871   atom_memcpy,
2872   atom_memset,
2873   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2874   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2875   "16",					/* Loop alignment.  */
2876   "16:8:8",				/* Jump alignment.  */
2877   "0:0:8",				/* Label alignment.  */
2878   "16",					/* Func alignment.  */
2879 };
2880 
/* memcpy strategy table referenced by slm_cost.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2881 static stringop_algs slm_memcpy[2] = {
2882   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2883   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2884              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by slm_cost.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2885 static stringop_algs slm_memset[2] = {
2886   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2887              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2888   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2889              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for the slm tuning target.  The inner braced group holds the
   register allocator costs; all remaining fields follow positionally, so
   the entries must stay in the declaration order of
   struct processor_costs.  */
2890 static const
2891 struct processor_costs slm_cost = {
2892   {
2893   /* Start of register allocator costs.  integer->integer move cost is 2. */
2894   8,					/* cost for loading QImode using movzbl */
2895   {8, 8, 8},				/* cost of loading integer registers
2896 					   in QImode, HImode and SImode.
2897 					   Relative to reg-reg move (2).  */
2898   {6, 6, 6},				/* cost of storing integer registers */
2899   2,					/* cost of reg,reg fld/fst */
2900   {8, 8, 18},				/* cost of loading fp registers
2901 					   in SFmode, DFmode and XFmode */
2902   {6, 6, 18},				/* cost of storing fp registers
2903 					   in SFmode, DFmode and XFmode */
2904   2,					/* cost of moving MMX register */
2905   {8, 8},				/* cost of loading MMX registers
2906 					   in SImode and DImode */
2907   {6, 6},				/* cost of storing MMX registers
2908 					   in SImode and DImode */
2909   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2910   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2911 					   in 32,64,128,256 and 512-bit */
2912   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2913 					   in 32,64,128,256 and 512-bit */
2914   8, 6,				/* SSE->integer and integer->SSE moves */
2915   8, 6,				/* mask->integer and integer->mask moves */
2916   {8, 8, 8},			/* cost of loading mask register
2917 					   in QImode, HImode, SImode.  */
2918   {6, 6, 6},			/* cost of storing mask register
2919 					   in QImode, HImode, SImode.  */
2920   2,					/* cost of moving mask register.  */
2921   /* End of register allocator costs.  */
2922   },
2923 
2924   COSTS_N_INSNS (1),			/* cost of an add instruction */
2925   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2926   COSTS_N_INSNS (1),			/* variable shift costs */
2927   COSTS_N_INSNS (1),			/* constant shift costs */
2928   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2929    COSTS_N_INSNS (3),			/*				 HI */
2930    COSTS_N_INSNS (3),			/*				 SI */
2931    COSTS_N_INSNS (4),			/*				 DI */
2932    COSTS_N_INSNS (2)},			/*			      other */
2933   0,					/* cost of multiply per each bit set */
2934   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2935    COSTS_N_INSNS (26),			/*			    HI */
2936    COSTS_N_INSNS (42),			/*			    SI */
2937    COSTS_N_INSNS (74),			/*			    DI */
2938    COSTS_N_INSNS (74)},			/*			    other */
2939   COSTS_N_INSNS (1),			/* cost of movsx */
2940   COSTS_N_INSNS (1),			/* cost of movzx */
2941   8,					/* "large" insn */
2942   17,					/* MOVE_RATIO */
2943   6,					/* CLEAR_RATIO */
2944   {8, 8, 8},				/* cost of loading integer registers
2945 					   in QImode, HImode and SImode.
2946 					   Relative to reg-reg move (2).  */
2947   {6, 6, 6},				/* cost of storing integer registers */
2948   {8, 8, 8, 16, 32},			/* cost of loading SSE register
2949 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2950   {8, 8, 8, 16, 32},			/* cost of storing SSE register
2951 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2952   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2953   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2954   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2955   8,					/* cost of moving SSE register to integer.  */
2956   8, 8,					/* Gather load static, per_elt.  */
2957   8, 8,					/* Gather store static, per_elt.  */
2958   32,					/* size of l1 cache.  */
2959   256,					/* size of l2 cache.  */
2960   64,					/* size of prefetch block */
2961   6,					/* number of parallel prefetches */
2962   3,					/* Branch cost */
2963   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2964   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2965   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2966   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2967   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2968   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2969 
2970   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2971   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2972   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2973   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2974   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2975   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2976   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
2977   COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
2978   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
2979   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
2980   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2981   slm_memcpy,
2982   slm_memset,
2983   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2984   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2985   "16",					/* Loop alignment.  */
2986   "16:8:8",				/* Jump alignment.  */
2987   "0:0:8",				/* Label alignment.  */
2988   "16",					/* Func alignment.  */
2989 };
2990 
/* memcpy strategy table referenced by tremont_cost.  Both elements are
   identical: a leading algorithm plus three {N, algorithm, flag} triples,
   the last with N == -1.
   NOTE(review): the first two triples share N == 256; presumably the second
   is a fallback for when the first algorithm cannot be used -- confirm
   against the code that walks stringop_algs.  */
2991 static stringop_algs tremont_memcpy[2] = {
2992   {libcall,
2993    {{256, rep_prefix_1_byte, true},
2994     {256, loop, false},
2995     {-1, libcall, false}}},
2996   {libcall,
2997    {{256, rep_prefix_1_byte, true},
2998     {256, loop, false},
2999     {-1, libcall, false}}}};
/* memset strategy table referenced by tremont_cost.  Both elements are
   identical: a leading algorithm plus three {N, algorithm, flag} triples,
   the last with N == -1.
   NOTE(review): the first two triples share N == 256; presumably the second
   is a fallback for when the first algorithm cannot be used -- confirm
   against the code that walks stringop_algs.  */
3000 static stringop_algs tremont_memset[2] = {
3001   {libcall,
3002    {{256, rep_prefix_1_byte, true},
3003     {256, loop, false},
3004     {-1, libcall, false}}},
3005   {libcall,
3006    {{256, rep_prefix_1_byte, true},
3007     {256, loop, false},
3008     {-1, libcall, false}}}};
/* Cost table for the tremont tuning target.  The inner braced group holds
   the register allocator costs; all remaining fields follow positionally,
   so the entries must stay in the declaration order of
   struct processor_costs.  */
3009 static const
3010 struct processor_costs tremont_cost = {
3011   {
3012   /* Start of register allocator costs.  integer->integer move cost is 2. */
3013   6,				     /* cost for loading QImode using movzbl */
3014   {6, 6, 6},				/* cost of loading integer registers
3015 					   in QImode, HImode and SImode.
3016 					   Relative to reg-reg move (2).  */
3017   {6, 6, 6},				/* cost of storing integer registers */
3018   4,					/* cost of reg,reg fld/fst */
3019   {6, 6, 12},				/* cost of loading fp registers
3020 					   in SFmode, DFmode and XFmode */
3021   {6, 6, 12},				/* cost of storing fp registers
3022 					   in SFmode, DFmode and XFmode */
3023   2,					/* cost of moving MMX register */
3024   {6, 6},				/* cost of loading MMX registers
3025 					   in SImode and DImode */
3026   {6, 6},				/* cost of storing MMX registers
3027 					   in SImode and DImode */
3028   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
3029   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
3030 					   in 32,64,128,256 and 512-bit */
3031   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
3032 					   in 32,64,128,256 and 512-bit */
3033   6, 6,				/* SSE->integer and integer->SSE moves */
3034   6, 6,				/* mask->integer and integer->mask moves */
3035   {6, 6, 6},				/* cost of loading mask register
3036 					   in QImode, HImode, SImode.  */
3037   {6, 6, 6},			/* cost of storing mask register
3038 					   in QImode, HImode, SImode.  */
3039   2,					/* cost of moving mask register.  */
3040   /* End of register allocator costs.  */
3041   },
3042 
3043   COSTS_N_INSNS (1),			/* cost of an add instruction */
3044   /* Setting cost to 2 makes our current implementation of synth_mult result in
3045      use of unnecessary temporary registers causing regression on several
3046      SPECfp benchmarks.  */
3047   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
3048   COSTS_N_INSNS (1),			/* variable shift costs */
3049   COSTS_N_INSNS (1),			/* constant shift costs */
3050   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
3051    COSTS_N_INSNS (4),			/*				 HI */
3052    COSTS_N_INSNS (3),			/*				 SI */
3053    COSTS_N_INSNS (4),			/*				 DI */
3054    COSTS_N_INSNS (4)},			/*			      other */
3055   0,					/* cost of multiply per each bit set */
3056   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
3057    COSTS_N_INSNS (22),			/*			    HI */
3058    COSTS_N_INSNS (30),			/*			    SI */
3059    COSTS_N_INSNS (74),			/*			    DI */
3060    COSTS_N_INSNS (74)},			/*			    other */
3061   COSTS_N_INSNS (1),			/* cost of movsx */
3062   COSTS_N_INSNS (1),			/* cost of movzx */
3063   8,					/* "large" insn */
3064   17,					/* MOVE_RATIO */
3065   17,					/* CLEAR_RATIO */
3066   {6, 6, 6},				/* cost of loading integer registers
3067 					   in QImode, HImode and SImode.
3068 					   Relative to reg-reg move (2).  */
3069   {6, 6, 6},				/* cost of storing integer registers */
3070   {6, 6, 6, 10, 15},			/* cost of loading SSE register
3071 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3072   {6, 6, 6, 10, 15},			/* cost of storing SSE register
3073 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3074   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
3075   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
3076   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
3077   6,					/* cost of moving SSE register to integer.  */
3078   18, 6,				/* Gather load static, per_elt.  */
3079   18, 6,				/* Gather store static, per_elt.  */
3080   32,					/* size of l1 cache.  */
3081   512,					/* size of l2 cache.  */
3082   64,					/* size of prefetch block */
3083   6,					/* number of parallel prefetches */
3084   /* Benchmarks show large regressions on the K8 sixtrack benchmark when
3085      this value is increased to the perhaps more appropriate value of 5.
     NOTE(review): the K8 reference looks inherited from another table --
     confirm it applies to this target.  */
3086   3,					/* Branch cost */
3087   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
3088   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
3089   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
3090   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
3091   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
3092   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
3093 
3094   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
3095   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
3096   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
3097   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
3098   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
3099   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
3100   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
3101   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
3102   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
3103   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
3104   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
3105   tremont_memcpy,
3106   tremont_memset,
3107   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
3108   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
3109   "16:11:8",				/* Loop alignment.  */
3110   "16:11:8",				/* Jump alignment.  */
3111   "0:0:8",				/* Label alignment.  */
3112   "16",					/* Func alignment.  */
3113 };
3114 
/* memcpy strategy table referenced by intel_cost.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
3115 static stringop_algs intel_memcpy[2] = {
3116   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3117   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3118              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by intel_cost.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
3119 static stringop_algs intel_memset[2] = {
3120   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3121              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3122   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3123              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for the intel tuning target.  The inner braced group holds the
   register allocator costs; all remaining fields follow positionally, so
   the entries must stay in the declaration order of
   struct processor_costs.  */
3124 static const
3125 struct processor_costs intel_cost = {
3126   {
3127   /* Start of register allocator costs.  integer->integer move cost is 2. */
3128   6,				     /* cost for loading QImode using movzbl */
3129   {4, 4, 4},				/* cost of loading integer registers
3130 					   in QImode, HImode and SImode.
3131 					   Relative to reg-reg move (2).  */
3132   {6, 6, 6},				/* cost of storing integer registers */
3133   2,					/* cost of reg,reg fld/fst */
3134   {6, 6, 8},				/* cost of loading fp registers
3135 					   in SFmode, DFmode and XFmode */
3136   {6, 6, 10},				/* cost of storing fp registers
3137 					   in SFmode, DFmode and XFmode */
3138   2,					/* cost of moving MMX register */
3139   {6, 6},				/* cost of loading MMX registers
3140 					   in SImode and DImode */
3141   {6, 6},				/* cost of storing MMX registers
3142 					   in SImode and DImode */
3143   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
3144   {6, 6, 6, 6, 6},			/* cost of loading SSE registers
3145 					   in 32,64,128,256 and 512-bit */
3146   {6, 6, 6, 6, 6},			/* cost of storing SSE registers
3147 					   in 32,64,128,256 and 512-bit */
3148   4, 4,				/* SSE->integer and integer->SSE moves */
3149   4, 4,				/* mask->integer and integer->mask moves */
3150   {4, 4, 4},				/* cost of loading mask register
3151 					   in QImode, HImode, SImode.  */
3152   {6, 6, 6},				/* cost of storing mask register
3153 					   in QImode, HImode, SImode.  */
3154   2,					/* cost of moving mask register.  */
3155   /* End of register allocator costs.  */
3156   },
3157 
3158   COSTS_N_INSNS (1),			/* cost of an add instruction */
3159   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
3160   COSTS_N_INSNS (1),			/* variable shift costs */
3161   COSTS_N_INSNS (1),			/* constant shift costs */
3162   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
3163    COSTS_N_INSNS (3),			/*				 HI */
3164    COSTS_N_INSNS (3),			/*				 SI */
3165    COSTS_N_INSNS (4),			/*				 DI */
3166    COSTS_N_INSNS (2)},			/*			      other */
3167   0,					/* cost of multiply per each bit set */
3168   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
3169    COSTS_N_INSNS (26),			/*			    HI */
3170    COSTS_N_INSNS (42),			/*			    SI */
3171    COSTS_N_INSNS (74),			/*			    DI */
3172    COSTS_N_INSNS (74)},			/*			    other */
3173   COSTS_N_INSNS (1),			/* cost of movsx */
3174   COSTS_N_INSNS (1),			/* cost of movzx */
3175   8,					/* "large" insn */
3176   17,					/* MOVE_RATIO */
3177   6,					/* CLEAR_RATIO */
3178   {4, 4, 4},				/* cost of loading integer registers
3179 					   in QImode, HImode and SImode.
3180 					   Relative to reg-reg move (2).  */
3181   {6, 6, 6},				/* cost of storing integer registers */
3182   {6, 6, 6, 6, 6},			/* cost of loading SSE register
3183 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3184   {6, 6, 6, 6, 6},			/* cost of storing SSE register
3185 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3186   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
3187   {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
3188   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
3189   4,					/* cost of moving SSE register to integer.  */
3190   6, 6,					/* Gather load static, per_elt.  */
3191   6, 6,					/* Gather store static, per_elt.  */
3192   32,					/* size of l1 cache.  */
3193   256,					/* size of l2 cache.  */
3194   64,					/* size of prefetch block */
3195   6,					/* number of parallel prefetches */
3196   3,					/* Branch cost */
3197   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
3198   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
3199   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
3200   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
3201   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
3202   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
3203 
3204   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
3205   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
3206   COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
3207   COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
3208   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
3209   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
3210   COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
3211   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
3212   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
3213   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
3214   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
3215   intel_memcpy,
3216   intel_memset,
3217   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
3218   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
3219   "16",					/* Loop alignment.  */
3220   "16:8:8",				/* Jump alignment.  */
3221   "0:0:8",				/* Label alignment.  */
3222   "16",					/* Func alignment.  */
3223 };
3224 
3225 /* Generic should produce code tuned for Core-i7 (and newer chips)
3226    and btver1 (and newer chips).  */
3227 
/* memcpy strategy table for the generic tuning.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1;
   the elements differ only in rep_prefix_4_byte versus rep_prefix_8_byte.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
3228 static stringop_algs generic_memcpy[2] = {
3229   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3230              {-1, libcall, false}}},
3231   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3232              {-1, libcall, false}}}};
/* memset strategy table for the generic tuning.  Two elements, each a
   leading algorithm plus {N, algorithm, flag} triples ending with N == -1;
   the elements differ only in rep_prefix_4_byte versus rep_prefix_8_byte.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
3233 static stringop_algs generic_memset[2] = {
3234   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3235              {-1, libcall, false}}},
3236   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3237              {-1, libcall, false}}}};
3238 static const
3239 struct processor_costs generic_cost = {	/* Positional initializer: the entries must stay in the field order of struct processor_costs.  */
3240   {
3241   /* Start of register allocator costs.  integer->integer move cost is 2. */
3242   6,				     /* cost for loading QImode using movzbl */
3243   {6, 6, 6},				/* cost of loading integer registers
3244 					   in QImode, HImode and SImode.
3245 					   Relative to reg-reg move (2).  */
3246   {6, 6, 6},				/* cost of storing integer registers */
3247   4,					/* cost of reg,reg fld/fst */
3248   {6, 6, 12},				/* cost of loading fp registers
3249 					   in SFmode, DFmode and XFmode */
3250   {6, 6, 12},				/* cost of storing fp registers
3251 					   in SFmode, DFmode and XFmode */
3252   2,					/* cost of moving MMX register */
3253   {6, 6},				/* cost of loading MMX registers
3254 					   in SImode and DImode */
3255   {6, 6},				/* cost of storing MMX registers
3256 					   in SImode and DImode */
3257   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
3258   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
3259 					   in 32,64,128,256 and 512-bit */
3260   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
3261 					   in 32,64,128,256 and 512-bit */
3262   6, 6,				/* SSE->integer and integer->SSE moves */
3263   6, 6,				/* mask->integer and integer->mask moves */
3264   {6, 6, 6},				/* cost of loading mask register
3265 					   in QImode, HImode, SImode.  */
3266   {6, 6, 6},			/* cost of storing mask register
3267 					   in QImode, HImode, SImode.  */
3268   2,					/* cost of moving mask register.  */
3269   /* End of register allocator costs.  */
3270   },
3271 
3272   COSTS_N_INSNS (1),			/* cost of an add instruction */
3273   /* Setting cost to 2 makes our current implementation of synth_mult result in
3274      use of unnecessary temporary registers causing regression on several
3275      SPECfp benchmarks.  */
3276   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
3277   COSTS_N_INSNS (1),			/* variable shift costs */
3278   COSTS_N_INSNS (1),			/* constant shift costs */
3279   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
3280    COSTS_N_INSNS (4),			/*				 HI */
3281    COSTS_N_INSNS (3),			/*				 SI */
3282    COSTS_N_INSNS (4),			/*				 DI */
3283    COSTS_N_INSNS (4)},			/*			      other */
3284   0,					/* cost of multiply per each bit set */
3285   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
3286    COSTS_N_INSNS (22),			/*			    HI */
3287    COSTS_N_INSNS (30),			/*			    SI */
3288    COSTS_N_INSNS (74),			/*			    DI */
3289    COSTS_N_INSNS (74)},			/*			    other */
3290   COSTS_N_INSNS (1),			/* cost of movsx */
3291   COSTS_N_INSNS (1),			/* cost of movzx */
3292   8,					/* "large" insn */
3293   17,					/* MOVE_RATIO */
3294   6,					/* CLEAR_RATIO */
3295   {6, 6, 6},				/* cost of loading integer registers
3296 					   in QImode, HImode and SImode.
3297 					   Relative to reg-reg move (2).  */
3298   {6, 6, 6},				/* cost of storing integer registers */
3299   {6, 6, 6, 10, 15},			/* cost of loading SSE register
3300 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3301   {6, 6, 6, 10, 15},			/* cost of storing SSE register
3302 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3303   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
3304   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
3305   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
3306   6,					/* cost of moving SSE register to integer.  */
3307   18, 6,				/* Gather load static, per_elt.  */
3308   18, 6,				/* Gather store static, per_elt.  */
3309   32,					/* size of l1 cache.  */
3310   512,					/* size of l2 cache.  */
3311   64,					/* size of prefetch block */
3312   6,					/* number of parallel prefetches */
3313   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3314      value is increased to the perhaps more appropriate value of 5.  */
3315   3,					/* Branch cost */
3316   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
3317   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
3318   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
3319   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
3320   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
3321   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
3322 
3323   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
3324   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
3325   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
3326   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
3327   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
3328   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
3329   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
3330   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
3331   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
3332   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
3333   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
3334   generic_memcpy,
3335   generic_memset,
3336   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
3337   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
3338   "16:11:8",				/* Loop alignment.  */
3339   "16:11:8",				/* Jump alignment.  */
3340   "0:0:8",				/* Label alignment.  */
3341   "16",					/* Func alignment.  */
3342 };
3343 
3344 /* core_cost should produce code tuned for the Core family of CPUs.  */
3345 static stringop_algs core_memcpy[2] = {	/* Two-entry table named in core_cost below.  NOTE(review): presumably entry 0 (rep_prefix_4_byte) is for 32-bit and entry 1 (rep_prefix_8_byte) for 64-bit code -- confirm against the stringop_algs declaration.  */
3346   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
3347   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
3348              {-1, libcall, false}}}};
3349 static stringop_algs core_memset[2] = {	/* Two-entry table named in core_cost below.  NOTE(review): presumably entry 0 (rep_prefix_4_byte) is for 32-bit and entry 1 (rep_prefix_8_byte) for 64-bit code -- confirm against the stringop_algs declaration.  */
3350   {libcall, {{6, loop_1_byte, true},
3351              {24, loop, true},
3352              {8192, rep_prefix_4_byte, true},
3353              {-1, libcall, false}}},
3354   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
3355              {-1, libcall, false}}}};
3356 
3356 
3357 static const
3358 struct processor_costs core_cost = {	/* Positional initializer: the entries must stay in the field order of struct processor_costs.  */
3359   {
3360   /* Start of register allocator costs.  integer->integer move cost is 2. */
3361   6,				     /* cost for loading QImode using movzbl */
3362   {4, 4, 4},				/* cost of loading integer registers
3363 					   in QImode, HImode and SImode.
3364 					   Relative to reg-reg move (2).  */
3365   {6, 6, 6},				/* cost of storing integer registers */
3366   2,					/* cost of reg,reg fld/fst */
3367   {6, 6, 8},				/* cost of loading fp registers
3368 					   in SFmode, DFmode and XFmode */
3369   {6, 6, 10},				/* cost of storing fp registers
3370 					   in SFmode, DFmode and XFmode */
3371   2,					/* cost of moving MMX register */
3372   {6, 6},				/* cost of loading MMX registers
3373 					   in SImode and DImode */
3374   {6, 6},				/* cost of storing MMX registers
3375 					   in SImode and DImode */
3376   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
3377   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
3378 					   in 32,64,128,256 and 512-bit */
3379   {6, 6, 6, 6, 12},			/* cost of storing SSE registers
3380 					   in 32,64,128,256 and 512-bit */
3381   6, 6,				/* SSE->integer and integer->SSE moves */
3382   6, 6,				/* mask->integer and integer->mask moves */
3383   {4, 4, 4},				/* cost of loading mask register
3384 					   in QImode, HImode, SImode.  */
3385   {6, 6, 6},				/* cost of storing mask register
3386 					   in QImode, HImode, SImode.  */
3387   2,					/* cost of moving mask register.  */
3388   /* End of register allocator costs.  */
3389   },
3390 
3391   COSTS_N_INSNS (1),			/* cost of an add instruction */
3392   /* On all chips taken into consideration lea is 2 cycles and more.  With
3393      this cost however our current implementation of synth_mult results in
3394      use of unnecessary temporary registers causing regression on several
3395      SPECfp benchmarks.  */
3396   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
3397   COSTS_N_INSNS (1),			/* variable shift costs */
3398   COSTS_N_INSNS (1),			/* constant shift costs */
3399   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
3400    COSTS_N_INSNS (4),			/*				 HI */
3401    COSTS_N_INSNS (3),			/*				 SI */
3402    /* Here we tune for Sandybridge or newer.  */
3403    COSTS_N_INSNS (3),			/*				 DI */
3404    COSTS_N_INSNS (3)},			/*			      other */
3405   0,					/* cost of multiply per each bit set */
3406   /* Expanding div/mod currently doesn't consider parallelism. So the cost
3407      model is not realistic. We compensate by increasing the latencies a bit.  */
3408   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
3409    COSTS_N_INSNS (11),			/*			    HI */
3410    COSTS_N_INSNS (14),			/*			    SI */
3411    COSTS_N_INSNS (81),			/*			    DI */
3412    COSTS_N_INSNS (81)},			/*			    other */
3413   COSTS_N_INSNS (1),			/* cost of movsx */
3414   COSTS_N_INSNS (1),			/* cost of movzx */
3415   8,					/* "large" insn */
3416   17,					/* MOVE_RATIO */
3417   6,					/* CLEAR_RATIO */
3418   {4, 4, 4},				/* cost of loading integer registers
3419 					   in QImode, HImode and SImode.
3420 					   Relative to reg-reg move (2).  */
3421   {6, 6, 6},				/* cost of storing integer registers */
3422   {6, 6, 6, 6, 12},			/* cost of loading SSE register
3423 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3424   {6, 6, 6, 6, 12},			/* cost of storing SSE register
3425 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3426   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
3427   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
3428   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
3429   2,					/* cost of moving SSE register to integer.  */
3430   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3431      rec. throughput 6.  So 5 uops statically and one uop per load.
3432      NOTE(review): the same mnemonic appears twice above; confirm which gather variant was meant.  */
3433   10, 6,				/* Gather load static, per_elt.  */
3434   10, 6,				/* Gather store static, per_elt.  */
3435   64,					/* size of l1 cache.  */
3436   512,					/* size of l2 cache.  */
3437   64,					/* size of prefetch block */
3438   6,					/* number of parallel prefetches */
3439   /* FIXME perhaps more appropriate value is 5.  */
3440   3,					/* Branch cost */
3441   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
3442   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
3443   /* 10-24 (presumably the observed range; the upper bound is the value used below -- confirm).  */
3444   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
3445   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
3446   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
3447   COSTS_N_INSNS (23),			/* cost of FSQRT instruction.  */
3448 
3449   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
3450   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
3451   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
3452   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
3453   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
3454   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
3455   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
3456   COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
3457   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
3458   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
3459   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
3460   core_memcpy,
3461   core_memset,
3462   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
3463   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
3464   "16:11:8",				/* Loop alignment.  */
3465   "16:11:8",				/* Jump alignment.  */
3466   "0:0:8",				/* Label alignment.  */
3467   "16",					/* Func alignment.  */
3468 };
3469 
3470