xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/i386/x86-tune-costs.h (revision f0fde9902fd4d72ded2807793acc7bfaa1ebf243)
1 /* Costs of operations of individual x86 CPUs.
2    Copyright (C) 1988-2019 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10 
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19 
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 <http://www.gnu.org/licenses/>.  */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
/* COSTS_N_BYTES (N) turns an instruction size of N bytes into a cost value
   by multiplying by 2.  Under the assumption stated above, a 2-byte add is
   COSTS_N_BYTES (2) == 4, the same number COSTS_N_INSNS (1) would give, so
   size-based and latency-based tables use a comparable scale.  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27 
/* Placeholder stringop_algs initializer whose every entry is libcall.  The
   per-CPU tables below use it as their second array element.
   NOTE(review): -1 presumably means "no upper size bound" and the leading
   libcall presumably covers blocks of unknown size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29 
/* memcpy expansion strategy referenced by ix86_size_cost: both array
   elements select rep_prefix_1_byte for every entry.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants -- confirm against the code that indexes these arrays.  */
30 static stringop_algs ix86_size_memcpy[2] = {
31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset expansion strategy referenced by ix86_size_cost: both array
   elements select rep_prefix_1_byte for every entry, mirroring
   ix86_size_memcpy.  */
33 static stringop_algs ix86_size_memset[2] = {
34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36 
36 
/* Cost table used when tuning for code size.  Instruction costs are written
   with COSTS_N_BYTES, so they model encoded size rather than latency.
   The initializer is positional: each value must stay in the order of the
   fields of struct processor_costs (declaration not visible in this
   excerpt).  Unlike the per-CPU tables that follow, this object is not
   declared static.  */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39   COSTS_N_BYTES (2),			/* cost of an add instruction */
40   COSTS_N_BYTES (3),			/* cost of a lea instruction */
41   COSTS_N_BYTES (2),			/* variable shift costs */
42   COSTS_N_BYTES (3),			/* constant shift costs */
43   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
44    COSTS_N_BYTES (3),			/*				 HI */
45    COSTS_N_BYTES (3),			/*				 SI */
46    COSTS_N_BYTES (3),			/*				 DI */
47    COSTS_N_BYTES (5)},			/*			      other */
48   0,					/* cost of multiply per each bit set */
49   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
50    COSTS_N_BYTES (3),			/*			    HI */
51    COSTS_N_BYTES (3),			/*			    SI */
52    COSTS_N_BYTES (3),			/*			    DI */
53    COSTS_N_BYTES (5)},			/*			    other */
54   COSTS_N_BYTES (3),			/* cost of movsx */
55   COSTS_N_BYTES (3),			/* cost of movzx */
56   0,					/* "large" insn */
57   2,					/* MOVE_RATIO */
58 
59   /* All move costs are relative to integer->integer move times 2. */
60   2,				     /* cost for loading QImode using movzbl */
61   {2, 2, 2},				/* cost of loading integer registers
62 					   in QImode, HImode and SImode.
63 					   Relative to reg-reg move (2).  */
64   {2, 2, 2},				/* cost of storing integer registers */
65   2,					/* cost of reg,reg fld/fst */
66   {2, 2, 2},				/* cost of loading fp registers
67 					   in SFmode, DFmode and XFmode */
68   {2, 2, 2},				/* cost of storing fp registers
69 					   in SFmode, DFmode and XFmode */
70   3,					/* cost of moving MMX register */
71   {3, 3},				/* cost of loading MMX registers
72 					   in SImode and DImode */
73   {3, 3},				/* cost of storing MMX registers
74 					   in SImode and DImode */
75   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
76   {3, 3, 3, 3, 3},			/* cost of loading SSE registers
77 					   in 32,64,128,256 and 512-bit */
78   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load (five
79 					   entries: presumably 32..512-bit) */
80   {3, 3, 3, 3, 3},			/* cost of storing SSE registers
81 					   in 32,64,128,256 and 512-bit */
82   {3, 3, 3, 3, 3},				/* cost of unaligned SSE store (five
83 					   entries: presumably 32..512-bit) */
84   3, 3,					/* SSE->integer and integer->SSE moves */
85   5, 0,					/* Gather load static, per_elt.  */
86   5, 0,					/* Gather store static, per_elt.  */
87   0,					/* size of l1 cache  */
88   0,					/* size of l2 cache  */
89   0,					/* size of prefetch block */
90   0,					/* number of parallel prefetches */
91   2,					/* Branch cost */
92   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
93   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
94   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
95   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
96   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
97   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
98 
99   COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
100   COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
101   COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
102   COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
103   COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
104   COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
105   COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
106   COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
107   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
108   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
109   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
110   ix86_size_memcpy,			/* memcpy expansion strategy table.  */
111   ix86_size_memset,			/* memset expansion strategy table.  */
112   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
113   COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
  /* No alignment strings are supplied for the size-tuning table.  */
114   NULL,					/* Loop alignment.  */
115   NULL,					/* Jump alignment.  */
116   NULL,					/* Label alignment.  */
117   NULL,					/* Func alignment.  */
118 };
119 
120 /* Processor costs (relative to an add) */
/* memcpy expansion strategy referenced by i386_cost.  The first element
   always selects rep_prefix_1_byte; the second is the all-libcall
   DUMMY_STRINGOP_ALGS placeholder.  */
121 static stringop_algs i386_memcpy[2] = {
122   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
123   DUMMY_STRINGOP_ALGS};
/* memset expansion strategy referenced by i386_cost; same shape as
   i386_memcpy: rep_prefix_1_byte in the first element, the all-libcall
   placeholder in the second.  */
124 static stringop_algs i386_memset[2] = {
125   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
126   DUMMY_STRINGOP_ALGS};
127 
/* Tuning cost table for the 386.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializer is positional and must follow the field
   order of struct processor_costs (declaration not visible in this
   excerpt).
   NOTE(review): MMX/SSE/gather entries are filled in even for this CPU;
   presumably they only exist so that every field has a value -- confirm.  */
128 static const
129 struct processor_costs i386_cost = {	/* 386 specific costs */
130   COSTS_N_INSNS (1),			/* cost of an add instruction */
131   COSTS_N_INSNS (1),			/* cost of a lea instruction */
132   COSTS_N_INSNS (3),			/* variable shift costs */
133   COSTS_N_INSNS (2),			/* constant shift costs */
134   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
135    COSTS_N_INSNS (6),			/*				 HI */
136    COSTS_N_INSNS (6),			/*				 SI */
137    COSTS_N_INSNS (6),			/*				 DI */
138    COSTS_N_INSNS (6)},			/*			      other */
139   COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
140   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
141    COSTS_N_INSNS (23),			/*			    HI */
142    COSTS_N_INSNS (23),			/*			    SI */
143    COSTS_N_INSNS (23),			/*			    DI */
144    COSTS_N_INSNS (23)},			/*			    other */
145   COSTS_N_INSNS (3),			/* cost of movsx */
146   COSTS_N_INSNS (2),			/* cost of movzx */
147   15,					/* "large" insn */
148   3,					/* MOVE_RATIO */
149 
150   /* All move costs are relative to integer->integer move times 2 and thus
151      they are latency*2. */
152   4,				     /* cost for loading QImode using movzbl */
153   {2, 4, 2},				/* cost of loading integer registers
154 					   in QImode, HImode and SImode.
155 					   Relative to reg-reg move (2).  */
156   {2, 4, 2},				/* cost of storing integer registers */
157   2,					/* cost of reg,reg fld/fst */
158   {8, 8, 8},				/* cost of loading fp registers
159 					   in SFmode, DFmode and XFmode */
160   {8, 8, 8},				/* cost of storing fp registers
161 					   in SFmode, DFmode and XFmode */
162   2,					/* cost of moving MMX register */
163   {4, 8},				/* cost of loading MMX registers
164 					   in SImode and DImode */
165   {4, 8},				/* cost of storing MMX registers
166 					   in SImode and DImode */
167   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
168   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
169 					   in 32,64,128,256 and 512-bit */
170   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
171   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
172 					   in 32,64,128,256 and 512-bit */
173   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
174   3, 3,					/* SSE->integer and integer->SSE moves */
175   4, 4,					/* Gather load static, per_elt.  */
176   4, 4,					/* Gather store static, per_elt.  */
177   0,					/* size of l1 cache  */
178   0,					/* size of l2 cache  */
179   0,					/* size of prefetch block */
180   0,					/* number of parallel prefetches */
181   1,					/* Branch cost */
182   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
183   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
184   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
185   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
186   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
187   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
188 
  /* The SSE arithmetic entries below repeat the x87 values above
     (ADD 23, MUL 27, DIV 88, SQRT 122).  */
189   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
190   COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
191   COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
192   COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
193   COSTS_N_INSNS (27),			/* cost of FMA SS instruction.  */
194   COSTS_N_INSNS (27),			/* cost of FMA SD instruction.  */
195   COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
196   COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
197   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
198   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
199   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
200   i386_memcpy,				/* memcpy expansion strategy table.  */
201   i386_memset,				/* memset expansion strategy table.  */
202   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
203   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
204   "4",					/* Loop alignment.  */
205   "4",					/* Jump alignment.  */
206   NULL,					/* Label alignment.  */
207   "4",					/* Func alignment.  */
208 };
209 
/* memcpy expansion strategy referenced by i486_cost.  The first element
   always selects rep_prefix_4_byte; the second is the all-libcall
   DUMMY_STRINGOP_ALGS placeholder.  */
210 static stringop_algs i486_memcpy[2] = {
211   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
212   DUMMY_STRINGOP_ALGS};
/* memset expansion strategy referenced by i486_cost; same shape as
   i486_memcpy: rep_prefix_4_byte in the first element, the all-libcall
   placeholder in the second.  */
213 static stringop_algs i486_memset[2] = {
214   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
215   DUMMY_STRINGOP_ALGS};
216 
/* Tuning cost table for the 486.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializer is positional and must follow the field
   order of struct processor_costs (declaration not visible in this
   excerpt).  */
217 static const
218 struct processor_costs i486_cost = {	/* 486 specific costs */
219   COSTS_N_INSNS (1),			/* cost of an add instruction */
220   COSTS_N_INSNS (1),			/* cost of a lea instruction */
221   COSTS_N_INSNS (3),			/* variable shift costs */
222   COSTS_N_INSNS (2),			/* constant shift costs */
223   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
224    COSTS_N_INSNS (12),			/*				 HI */
225    COSTS_N_INSNS (12),			/*				 SI */
226    COSTS_N_INSNS (12),			/*				 DI */
227    COSTS_N_INSNS (12)},			/*			      other */
228   1,					/* cost of multiply per each bit set */
  /* NOTE(review): the entry above is a plain 1, whereas i386_cost writes
     COSTS_N_INSNS (1) for the same field -- confirm this is intended.  */
229   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
230    COSTS_N_INSNS (40),			/*			    HI */
231    COSTS_N_INSNS (40),			/*			    SI */
232    COSTS_N_INSNS (40),			/*			    DI */
233    COSTS_N_INSNS (40)},			/*			    other */
234   COSTS_N_INSNS (3),			/* cost of movsx */
235   COSTS_N_INSNS (2),			/* cost of movzx */
236   15,					/* "large" insn */
237   3,					/* MOVE_RATIO */
238 
239   /* All move costs are relative to integer->integer move times 2 and thus
240      they are latency*2. */
241   4,				     /* cost for loading QImode using movzbl */
242   {2, 4, 2},				/* cost of loading integer registers
243 					   in QImode, HImode and SImode.
244 					   Relative to reg-reg move (2).  */
245   {2, 4, 2},				/* cost of storing integer registers */
246   2,					/* cost of reg,reg fld/fst */
247   {8, 8, 8},				/* cost of loading fp registers
248 					   in SFmode, DFmode and XFmode */
249   {8, 8, 8},				/* cost of storing fp registers
250 					   in SFmode, DFmode and XFmode */
251   2,					/* cost of moving MMX register */
252   {4, 8},				/* cost of loading MMX registers
253 					   in SImode and DImode */
254   {4, 8},				/* cost of storing MMX registers
255 					   in SImode and DImode */
256   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
257   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
258 					   in 32,64,128,256 and 512-bit */
259   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
260   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
261 					   in 32,64,128,256 and 512-bit */
262   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
263   3, 3,					/* SSE->integer and integer->SSE moves */
264   4, 4,					/* Gather load static, per_elt.  */
265   4, 4,					/* Gather store static, per_elt.  */
266   4,					/* size of l1 cache.  486 has 8kB cache
267 					   shared for code and data, so 4kB is
268 					   not really precise.  */
269   4,					/* size of l2 cache  */
  /* NOTE(review): the l2 entry simply repeats the l1 value (4, i.e. 4kB per
     the comment above) -- presumably because there is no separate l2 to
     describe; confirm.  */
270   0,					/* size of prefetch block */
271   0,					/* number of parallel prefetches */
272   1,					/* Branch cost */
273   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
274   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
275   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
276   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
277   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
278   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
279 
280   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
281   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
282   COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
283   COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
284   COSTS_N_INSNS (16),			/* cost of FMA SS instruction.  */
285   COSTS_N_INSNS (16),			/* cost of FMA SD instruction.  */
286   COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
287   COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  */
  /* NOTE(review): DIVSD is 74 while DIVSS and the x87 FDIV entry are both
     73; the other SSE entries here repeat their x87 counterparts exactly --
     confirm the off-by-one is intended.  */
288   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
289   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
290   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
291   i486_memcpy,				/* memcpy expansion strategy table.  */
292   i486_memset,				/* memset expansion strategy table.  */
293   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
294   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
295   "16",					/* Loop alignment.  */
296   "16",					/* Jump alignment.  */
297   "0:0:8",				/* Label alignment.  */
298   "16",					/* Func alignment.  */
299 };
300 
/* memcpy expansion strategy referenced by pentium_cost and lakemont_cost.
   First element: rep_prefix_4_byte for the entry bounded by 256, libcall
   for the unbounded (-1) entry, with libcall as the leading algorithm.
   Second element: the all-libcall DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the leading algorithm presumably applies when the block
   size is unknown -- confirm against the stringop_algs declaration.  */
301 static stringop_algs pentium_memcpy[2] = {
302   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
303   DUMMY_STRINGOP_ALGS};
/* memset expansion strategy referenced by pentium_cost and lakemont_cost.
   First element: a single unbounded (-1) entry selecting rep_prefix_4_byte,
   with libcall as the leading algorithm.  Second element: the all-libcall
   DUMMY_STRINGOP_ALGS placeholder.  */
304 static stringop_algs pentium_memset[2] = {
305   {libcall, {{-1, rep_prefix_4_byte, false}}},
306   DUMMY_STRINGOP_ALGS};
307 
/* Tuning cost table named for the Pentium.  Instruction costs are expressed
   with COSTS_N_INSNS.  The initializer is positional and must follow the
   field order of struct processor_costs (declaration not visible in this
   excerpt).  */
308 static const
309 struct processor_costs pentium_cost = {
310   COSTS_N_INSNS (1),			/* cost of an add instruction */
311   COSTS_N_INSNS (1),			/* cost of a lea instruction */
312   COSTS_N_INSNS (4),			/* variable shift costs */
313   COSTS_N_INSNS (1),			/* constant shift costs */
314   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
315    COSTS_N_INSNS (11),			/*				 HI */
316    COSTS_N_INSNS (11),			/*				 SI */
317    COSTS_N_INSNS (11),			/*				 DI */
318    COSTS_N_INSNS (11)},			/*			      other */
319   0,					/* cost of multiply per each bit set */
320   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
321    COSTS_N_INSNS (25),			/*			    HI */
322    COSTS_N_INSNS (25),			/*			    SI */
323    COSTS_N_INSNS (25),			/*			    DI */
324    COSTS_N_INSNS (25)},			/*			    other */
325   COSTS_N_INSNS (3),			/* cost of movsx */
326   COSTS_N_INSNS (2),			/* cost of movzx */
327   8,					/* "large" insn */
328   6,					/* MOVE_RATIO */
329 
330   /* All move costs are relative to integer->integer move times 2 and thus
331      they are latency*2. */
332   6,				     /* cost for loading QImode using movzbl */
333   {2, 4, 2},				/* cost of loading integer registers
334 					   in QImode, HImode and SImode.
335 					   Relative to reg-reg move (2).  */
336   {2, 4, 2},				/* cost of storing integer registers */
337   2,					/* cost of reg,reg fld/fst */
338   {2, 2, 6},				/* cost of loading fp registers
339 					   in SFmode, DFmode and XFmode */
340   {4, 4, 6},				/* cost of storing fp registers
341 					   in SFmode, DFmode and XFmode */
342   8,					/* cost of moving MMX register */
343   {8, 8},				/* cost of loading MMX registers
344 					   in SImode and DImode */
345   {8, 8},				/* cost of storing MMX registers
346 					   in SImode and DImode */
347   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
348   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
349 					   in 32,64,128,256 and 512-bit */
350   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
351   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
352 					   in 32,64,128,256 and 512-bit */
353   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
354   3, 3,					/* SSE->integer and integer->SSE moves */
355   4, 4,					/* Gather load static, per_elt.  */
356   4, 4,					/* Gather store static, per_elt.  */
357   8,					/* size of l1 cache.  */
358   8,					/* size of l2 cache  */
359   0,					/* size of prefetch block */
360   0,					/* number of parallel prefetches */
361   2,					/* Branch cost */
362   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
363   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
364   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
365   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
366   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
367   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
368 
  /* ADD/MUL/DIV/SQRT entries below repeat the x87 values above; the FMA
     entries (6) equal the ADD and MUL costs added together (3 + 3).  */
369   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
370   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
371   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
372   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
373   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
374   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
375   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
376   COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
377   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
378   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
379   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
380   pentium_memcpy,			/* memcpy expansion strategy table.  */
381   pentium_memset,			/* memset expansion strategy table.  */
382   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
383   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
384   "16:8:8",				/* Loop alignment.  */
385   "16:8:8",				/* Jump alignment.  */
386   "0:0:8",				/* Label alignment.  */
387   "16",					/* Func alignment.  */
388 };
389 
/* Tuning cost table named for Lakemont.  It has no stringop tables of its
   own and reuses pentium_memcpy and pentium_memset.  Instruction costs are
   expressed with COSTS_N_INSNS.  The initializer is positional and must
   follow the field order of struct processor_costs (declaration not visible
   in this excerpt).  */
390 static const
391 struct processor_costs lakemont_cost = {
392   COSTS_N_INSNS (1),			/* cost of an add instruction */
  /* The "+ 1" makes lea one cost unit more expensive than the add above.  */
393   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
394   COSTS_N_INSNS (1),			/* variable shift costs */
395   COSTS_N_INSNS (1),			/* constant shift costs */
396   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
397    COSTS_N_INSNS (11),			/*				 HI */
398    COSTS_N_INSNS (11),			/*				 SI */
399    COSTS_N_INSNS (11),			/*				 DI */
400    COSTS_N_INSNS (11)},			/*			      other */
401   0,					/* cost of multiply per each bit set */
402   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
403    COSTS_N_INSNS (25),			/*			    HI */
404    COSTS_N_INSNS (25),			/*			    SI */
405    COSTS_N_INSNS (25),			/*			    DI */
406    COSTS_N_INSNS (25)},			/*			    other */
407   COSTS_N_INSNS (3),			/* cost of movsx */
408   COSTS_N_INSNS (2),			/* cost of movzx */
409   8,					/* "large" insn */
410   17,					/* MOVE_RATIO */
411 
412   /* All move costs are relative to integer->integer move times 2 and thus
413      they are latency*2. */
414   6,				     /* cost for loading QImode using movzbl */
415   {2, 4, 2},				/* cost of loading integer registers
416 					   in QImode, HImode and SImode.
417 					   Relative to reg-reg move (2).  */
418   {2, 4, 2},				/* cost of storing integer registers */
419   2,					/* cost of reg,reg fld/fst */
420   {2, 2, 6},				/* cost of loading fp registers
421 					   in SFmode, DFmode and XFmode */
422   {4, 4, 6},				/* cost of storing fp registers
423 					   in SFmode, DFmode and XFmode */
424   8,					/* cost of moving MMX register */
425   {8, 8},				/* cost of loading MMX registers
426 					   in SImode and DImode */
427   {8, 8},				/* cost of storing MMX registers
428 					   in SImode and DImode */
429   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
430   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
431 					   in 32,64,128,256 and 512-bit */
432   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
433   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
434 					   in 32,64,128,256 and 512-bit */
435   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
436   3, 3,					/* SSE->integer and integer->SSE moves */
437   4, 4,					/* Gather load static, per_elt.  */
438   4, 4,					/* Gather store static, per_elt.  */
439   8,					/* size of l1 cache.  */
440   8,					/* size of l2 cache  */
441   0,					/* size of prefetch block */
442   0,					/* number of parallel prefetches */
443   2,					/* Branch cost */
444   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
445   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
446   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
447   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
448   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
449   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
450 
  /* Unlike the x87 entries above, the SSE entries below distinguish single
     from double precision for DIV (31 vs 60) and SQRT (31 vs 63).  */
451   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
452   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
453   COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
454   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
455   COSTS_N_INSNS (10),			/* cost of FMA SS instruction.  */
456   COSTS_N_INSNS (10),			/* cost of FMA SD instruction.  */
457   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
458   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
459   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
460   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
461   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
462   pentium_memcpy,			/* memcpy strategy shared with pentium_cost.  */
463   pentium_memset,			/* memset strategy shared with pentium_cost.  */
464   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
465   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
466   "16:8:8",				/* Loop alignment.  */
467   "16:8:8",				/* Jump alignment.  */
468   "0:0:8",				/* Label alignment.  */
469   "16",					/* Func alignment.  */
470 };
471 
472 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
473    (we ensure the alignment).  For small blocks an inline loop is still a
474    noticeable win; for bigger blocks either rep movsl or rep movsb is the
475    way to go.  Rep movsb apparently has a more expensive startup time in the
476    CPU, but after 4K the difference is down in the noise.  */
/* memcpy expansion strategy referenced by pentiumpro_cost.  First element,
   in table order: loop for the entry bounded by 128, unrolled_loop for
   1024, rep_prefix_4_byte for 8192, and rep_prefix_1_byte for the unbounded
   (-1) entry; rep_prefix_4_byte is the leading algorithm.  Second element:
   the all-libcall DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the comment above mentions 4K, but the table only switches
   to rep_prefix_1_byte after the 8192 entry -- confirm which is intended.  */
477 static stringop_algs pentiumpro_memcpy[2] = {
478   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
479                        {8192, rep_prefix_4_byte, false},
480                        {-1, rep_prefix_1_byte, false}}},
481   DUMMY_STRINGOP_ALGS};
/* memset expansion strategy referenced by pentiumpro_cost.  First element,
   in table order: unrolled_loop for the entry bounded by 1024,
   rep_prefix_4_byte for 8192, and libcall for the unbounded (-1) entry;
   rep_prefix_4_byte is the leading algorithm.  Second element: the
   all-libcall DUMMY_STRINGOP_ALGS placeholder.  */
482 static stringop_algs pentiumpro_memset[2] = {
483   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
484                        {8192, rep_prefix_4_byte, false},
485                        {-1, libcall, false}}},
486   DUMMY_STRINGOP_ALGS};
/* Tuning cost table named for the PentiumPro.  Instruction costs are
   expressed with COSTS_N_INSNS.  The initializer is positional and must
   follow the field order of struct processor_costs (declaration not visible
   in this excerpt).  */
487 static const
488 struct processor_costs pentiumpro_cost = {
489   COSTS_N_INSNS (1),			/* cost of an add instruction */
490   COSTS_N_INSNS (1),			/* cost of a lea instruction */
491   COSTS_N_INSNS (1),			/* variable shift costs */
492   COSTS_N_INSNS (1),			/* constant shift costs */
493   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
494    COSTS_N_INSNS (4),			/*				 HI */
495    COSTS_N_INSNS (4),			/*				 SI */
496    COSTS_N_INSNS (4),			/*				 DI */
497    COSTS_N_INSNS (4)},			/*			      other */
498   0,					/* cost of multiply per each bit set */
499   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
500    COSTS_N_INSNS (17),			/*			    HI */
501    COSTS_N_INSNS (17),			/*			    SI */
502    COSTS_N_INSNS (17),			/*			    DI */
503    COSTS_N_INSNS (17)},			/*			    other */
504   COSTS_N_INSNS (1),			/* cost of movsx */
505   COSTS_N_INSNS (1),			/* cost of movzx */
506   8,					/* "large" insn */
507   6,					/* MOVE_RATIO */
508 
509   /* All move costs are relative to integer->integer move times 2 and thus
510      they are latency*2. */
511   2,				     /* cost for loading QImode using movzbl */
512   {4, 4, 4},				/* cost of loading integer registers
513 					   in QImode, HImode and SImode.
514 					   Relative to reg-reg move (2).  */
515   {2, 2, 2},				/* cost of storing integer registers */
516   2,					/* cost of reg,reg fld/fst */
517   {2, 2, 6},				/* cost of loading fp registers
518 					   in SFmode, DFmode and XFmode */
519   {4, 4, 6},				/* cost of storing fp registers
520 					   in SFmode, DFmode and XFmode */
521   2,					/* cost of moving MMX register */
522   {2, 2},				/* cost of loading MMX registers
523 					   in SImode and DImode */
524   {2, 2},				/* cost of storing MMX registers
525 					   in SImode and DImode */
526   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
527   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
528 					   in 32,64,128,256 and 512-bit */
529   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
530   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
531 					   in 32,64,128,256 and 512-bit */
532   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
533   3, 3,					/* SSE->integer and integer->SSE moves */
534   4, 4,					/* Gather load static, per_elt.  */
535   4, 4,					/* Gather store static, per_elt.  */
536   8,					/* size of l1 cache.  */
537   256,					/* size of l2 cache  */
538   32,					/* size of prefetch block */
539   6,					/* number of parallel prefetches */
540   2,					/* Branch cost */
541   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
542   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
543   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
544   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
545   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
546   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
547 
  /* Here the SSE MUL/DIV/SQRT entries are given their own values rather
     than repeating the x87 ones above (e.g. DIVSS 18 vs FDIV 56).  */
548   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
549   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
550   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
551   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
552   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
553   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
554   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
555   COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
556   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
557   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
558   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
559   pentiumpro_memcpy,			/* memcpy expansion strategy table.  */
560   pentiumpro_memset,			/* memset expansion strategy table.  */
561   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
562   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
563   "16",					/* Loop alignment.  */
564   "16:11:8",				/* Jump alignment.  */
565   "0:0:8",				/* Label alignment.  */
566   "16",					/* Func alignment.  */
567 };
568 
/* memcpy expansion strategy referenced by geode_cost.  First element:
   rep_prefix_4_byte for the entry bounded by 256, libcall for the unbounded
   (-1) entry, with libcall as the leading algorithm.  Second element: the
   all-libcall DUMMY_STRINGOP_ALGS placeholder.  */
569 static stringop_algs geode_memcpy[2] = {
570   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
571   DUMMY_STRINGOP_ALGS};
/* memset expansion strategy referenced by geode_cost; its contents are
   identical to geode_memcpy (rep_prefix_4_byte for the 256 entry, libcall
   otherwise).  */
572 static stringop_algs geode_memset[2] = {
573   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
574   DUMMY_STRINGOP_ALGS};
/* Tuning cost table named for the Geode.  Instruction costs are expressed
   with COSTS_N_INSNS; multiply and divide costs grow with operand mode
   (QI < HI < SI).  The initializer is positional and must follow the field
   order of struct processor_costs (declaration not visible in this
   excerpt).  */
575 static const
576 struct processor_costs geode_cost = {
577   COSTS_N_INSNS (1),			/* cost of an add instruction */
578   COSTS_N_INSNS (1),			/* cost of a lea instruction */
579   COSTS_N_INSNS (2),			/* variable shift costs */
580   COSTS_N_INSNS (1),			/* constant shift costs */
581   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
582    COSTS_N_INSNS (4),			/*				 HI */
583    COSTS_N_INSNS (7),			/*				 SI */
584    COSTS_N_INSNS (7),			/*				 DI */
585    COSTS_N_INSNS (7)},			/*			      other */
586   0,					/* cost of multiply per each bit set */
587   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
588    COSTS_N_INSNS (23),			/*			    HI */
589    COSTS_N_INSNS (39),			/*			    SI */
590    COSTS_N_INSNS (39),			/*			    DI */
591    COSTS_N_INSNS (39)},			/*			    other */
592   COSTS_N_INSNS (1),			/* cost of movsx */
593   COSTS_N_INSNS (1),			/* cost of movzx */
594   8,					/* "large" insn */
595   4,					/* MOVE_RATIO */
596 
597   /* All move costs are relative to integer->integer move times 2 and thus
598      they are latency*2. */
599   2,				     /* cost for loading QImode using movzbl */
600   {2, 2, 2},				/* cost of loading integer registers
601 					   in QImode, HImode and SImode.
602 					   Relative to reg-reg move (2).  */
603   {2, 2, 2},				/* cost of storing integer registers */
604   2,					/* cost of reg,reg fld/fst */
605   {2, 2, 2},				/* cost of loading fp registers
606 					   in SFmode, DFmode and XFmode */
607   {4, 6, 6},				/* cost of storing fp registers
608 					   in SFmode, DFmode and XFmode */
609 
610   2,					/* cost of moving MMX register */
611   {2, 2},				/* cost of loading MMX registers
612 					   in SImode and DImode */
613   {2, 2},				/* cost of storing MMX registers
614 					   in SImode and DImode */
615   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
616   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
617 					   in 32,64,128,256 and 512-bit */
618   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
619   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
620 					   in 32,64,128,256 and 512-bit */
621   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
622   6, 6,					/* SSE->integer and integer->SSE moves */
623   2, 2,					/* Gather load static, per_elt.  */
624   2, 2,					/* Gather store static, per_elt.  */
625   64,					/* size of l1 cache.  */
626   128,					/* size of l2 cache.  */
627   32,					/* size of prefetch block */
628   1,					/* number of parallel prefetches */
629   1,					/* Branch cost */
630   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
631   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
632   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
633   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
634   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
635   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
636 
  /* ADD/MUL/DIV/SQRT entries below repeat the x87 values above; the FMA
     entries (17) equal the ADD and MUL costs added together (6 + 11).  */
637   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
638   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
639   COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
640   COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
641   COSTS_N_INSNS (17),			/* cost of FMA SS instruction.  */
642   COSTS_N_INSNS (17),			/* cost of FMA SD instruction.  */
643   COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
644   COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
645   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
646   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
647   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
648   geode_memcpy,				/* memcpy expansion strategy table.  */
649   geode_memset,				/* memset expansion strategy table.  */
650   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
651   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
  /* No alignment strings are supplied for this table.  */
652   NULL,					/* Loop alignment.  */
653   NULL,					/* Jump alignment.  */
654   NULL,					/* Label alignment.  */
655   NULL,					/* Func alignment.  */
656 };
657 
/* memcpy expansion strategy for the K6 tuning table.  First element:
   rep_prefix_4_byte for the entry bounded by 256, libcall for the unbounded
   (-1) entry, with libcall as the leading algorithm.  Second element: the
   all-libcall DUMMY_STRINGOP_ALGS placeholder.  */
658 static stringop_algs k6_memcpy[2] = {
659   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
660   DUMMY_STRINGOP_ALGS};
/* memset expansion strategy for the K6 tuning table; its contents are
   identical to k6_memcpy (rep_prefix_4_byte for the 256 entry, libcall
   otherwise).  */
661 static stringop_algs k6_memset[2] = {
662   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
663   DUMMY_STRINGOP_ALGS};
/* Cost table for K6 tuning.  This is a positional initializer of
   struct processor_costs: entries must stay in the field order of that
   struct, and the trailing comment on each entry names the field it
   fills.  COSTS_N_INSNS values are costs relative to an add instruction.  */
664 static const
665 struct processor_costs k6_cost = {
666   COSTS_N_INSNS (1),			/* cost of an add instruction */
667   COSTS_N_INSNS (2),			/* cost of a lea instruction */
668   COSTS_N_INSNS (1),			/* variable shift costs */
669   COSTS_N_INSNS (1),			/* constant shift costs */
670   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
671    COSTS_N_INSNS (3),			/*				 HI */
672    COSTS_N_INSNS (3),			/*				 SI */
673    COSTS_N_INSNS (3),			/*				 DI */
674    COSTS_N_INSNS (3)},			/*			      other */
675   0,					/* cost of multiply per each bit set */
676   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
677    COSTS_N_INSNS (18),			/*			    HI */
678    COSTS_N_INSNS (18),			/*			    SI */
679    COSTS_N_INSNS (18),			/*			    DI */
680    COSTS_N_INSNS (18)},			/*			    other */
681   COSTS_N_INSNS (2),			/* cost of movsx */
682   COSTS_N_INSNS (2),			/* cost of movzx */
683   8,					/* "large" insn */
684   4,					/* MOVE_RATIO */
685 
686   /* All move costs are relative to integer->integer move times 2 and thus
687      they are latency*2.  */
688   3,				     /* cost for loading QImode using movzbl */
689   {4, 5, 4},				/* cost of loading integer registers
690 					   in QImode, HImode and SImode.
691 					   Relative to reg-reg move (2).  */
692   {2, 3, 2},				/* cost of storing integer registers */
693   4,					/* cost of reg,reg fld/fst */
694   {6, 6, 6},				/* cost of loading fp registers
695 					   in SFmode, DFmode and XFmode */
696   {4, 4, 4},				/* cost of storing fp registers
697 					   in SFmode, DFmode and XFmode */
698   2,					/* cost of moving MMX register */
699   {2, 2},				/* cost of loading MMX registers
700 					   in SImode and DImode */
701   {2, 2},				/* cost of storing MMX registers
702 					   in SImode and DImode */
703   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
704   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
705 					   in 32,64,128,256 and 512-bit */
706   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
707   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
708 					   in 32,64,128,256 and 512-bit */
709   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
710   6, 6,					/* SSE->integer and integer->SSE moves */
711   2, 2,					/* Gather load static, per_elt.  */
712   2, 2,					/* Gather store static, per_elt.  */
713   32,					/* size of l1 cache.  */
714   32,					/* size of l2 cache.  Some models
715 					   have integrated l2 cache, but
716 					   optimizing for k6 is not important
717 					   enough to worry about that.  */
718   32,					/* size of prefetch block */
719   1,					/* number of parallel prefetches */
720   1,					/* Branch cost */
721   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
722   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
723   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
724   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
725   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
726   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
727 
  /* The SSE entries below repeat the x87 figures given above.  */
728   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
729   COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
730   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
731   COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
732   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
733   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
734   COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
735   COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
736   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
737   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
738   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
739   k6_memcpy,
740   k6_memset,
741   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
742   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
743   "32:8:8",				/* Loop alignment.  */
744   "32:8:8",				/* Jump alignment.  */
745   "0:0:8",				/* Label alignment.  */
746   "32",					/* Func alignment.  */
747 };
748 
749 /* For some reason, Athlon deals better with REP prefix (relative to loops)
750    compared to K8.  Alignment becomes important after 8 bytes for memcpy and
751    128 bytes for memset.
   The memcpy table lists rep_prefix_4_byte with bound 2048 followed by
   libcall with bound -1; its second entry is the DUMMY_STRINGOP_ALGS
   placeholder (libcall only).
   NOTE(review): presumably the bounds are block sizes in bytes and -1 is
   the catch-all -- confirm against the declaration of stringop_algs.  */
752 static stringop_algs athlon_memcpy[2] = {
753   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
754   DUMMY_STRINGOP_ALGS};
/* Inline memset strategy table used by athlon_cost.  The first entry lists
   rep_prefix_4_byte with bound 2048 followed by libcall with bound -1; the
   second entry is the DUMMY_STRINGOP_ALGS placeholder (libcall only).
   NOTE(review): presumably the bounds are block sizes in bytes and -1 is
   the catch-all -- confirm against the declaration of stringop_algs.  */
755 static stringop_algs athlon_memset[2] = {
756   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
757   DUMMY_STRINGOP_ALGS};
/* Cost table for Athlon tuning.  This is a positional initializer of
   struct processor_costs: entries must stay in the field order of that
   struct, and the trailing comment on each entry names the field it
   fills.  COSTS_N_INSNS values are costs relative to an add instruction.  */
758 static const
759 struct processor_costs athlon_cost = {
760   COSTS_N_INSNS (1),			/* cost of an add instruction */
761   COSTS_N_INSNS (2),			/* cost of a lea instruction */
762   COSTS_N_INSNS (1),			/* variable shift costs */
763   COSTS_N_INSNS (1),			/* constant shift costs */
764   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
765    COSTS_N_INSNS (5),			/*				 HI */
766    COSTS_N_INSNS (5),			/*				 SI */
767    COSTS_N_INSNS (5),			/*				 DI */
768    COSTS_N_INSNS (5)},			/*			      other */
769   0,					/* cost of multiply per each bit set */
770   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
771    COSTS_N_INSNS (26),			/*			    HI */
772    COSTS_N_INSNS (42),			/*			    SI */
773    COSTS_N_INSNS (74),			/*			    DI */
774    COSTS_N_INSNS (74)},			/*			    other */
775   COSTS_N_INSNS (1),			/* cost of movsx */
776   COSTS_N_INSNS (1),			/* cost of movzx */
777   8,					/* "large" insn */
778   9,					/* MOVE_RATIO */
779 
780   /* All move costs are relative to integer->integer move times 2 and thus
781      they are latency*2.  */
782   4,				     /* cost for loading QImode using movzbl */
783   {3, 4, 3},				/* cost of loading integer registers
784 					   in QImode, HImode and SImode.
785 					   Relative to reg-reg move (2).  */
786   {3, 4, 3},				/* cost of storing integer registers */
787   4,					/* cost of reg,reg fld/fst */
788   {4, 4, 12},				/* cost of loading fp registers
789 					   in SFmode, DFmode and XFmode */
790   {6, 6, 8},				/* cost of storing fp registers
791 					   in SFmode, DFmode and XFmode */
792   2,					/* cost of moving MMX register */
793   {4, 4},				/* cost of loading MMX registers
794 					   in SImode and DImode */
795   {4, 4},				/* cost of storing MMX registers
796 					   in SImode and DImode */
797   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
798   {4, 4, 12, 12, 24},			/* cost of loading SSE registers
799 					   in 32,64,128,256 and 512-bit */
800   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
801   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
802 					   in 32,64,128,256 and 512-bit */
803   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
804   5, 5,					/* SSE->integer and integer->SSE moves */
805   4, 4,					/* Gather load static, per_elt.  */
806   4, 4,					/* Gather store static, per_elt.  */
807   64,					/* size of l1 cache.  */
808   256,					/* size of l2 cache.  */
809   64,					/* size of prefetch block */
810   6,					/* number of parallel prefetches */
811   5,					/* Branch cost */
812   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
813   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
814   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
815   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
816   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
817   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
818 
819   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
820   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
821   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
822   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
823   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
824   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
825   /* 11-16 (presumably the DIVSS latency range in cycles; the entry below
     uses the upper bound -- TODO confirm).  */
826   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
827   COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
828   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
829   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
830   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
831   athlon_memcpy,
832   athlon_memset,
833   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
834   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
835   "16:8:8",				/* Loop alignment.  */
836   "16:8:8",				/* Jump alignment.  */
837   "0:0:8",				/* Label alignment.  */
838   "16",					/* Func alignment.  */
839 };
840 
841 /* K8 has an optimized REP instruction for medium sized blocks, but for very
842    small blocks it is better to use a loop.  For large blocks, libcall can
843    do non-temporal accesses and beat inline considerably.
   The first memcpy entry goes loop (bound 6), unrolled_loop (bound 14),
   then rep_prefix_4_byte for the -1 catch-all; the second goes loop
   (bound 16), rep_prefix_8_byte (bound 8192), then libcall.
   NOTE(review): presumably the entries are the 32-bit and 64-bit variants
   and the bounds are block sizes in bytes -- confirm against the
   declaration of stringop_algs.  */
844 static stringop_algs k8_memcpy[2] = {
845   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
846              {-1, rep_prefix_4_byte, false}}},
847   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
848              {-1, libcall, false}}}};
/* Inline memset strategy table used by k8_cost.  The first entry goes loop
   (bound 8), unrolled_loop (bound 24), rep_prefix_4_byte (bound 2048), then
   libcall for the -1 catch-all; the second goes unrolled_loop (bound 48),
   rep_prefix_8_byte (bound 8192), then libcall.
   NOTE(review): presumably the entries are the 32-bit and 64-bit variants
   and the bounds are block sizes in bytes -- confirm against the
   declaration of stringop_algs.  */
849 static stringop_algs k8_memset[2] = {
850   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
851              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
852   {libcall, {{48, unrolled_loop, false},
853              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for K8 tuning.  This is a positional initializer of
   struct processor_costs: entries must stay in the field order of that
   struct, and the trailing comment on each entry names the field it
   fills.  COSTS_N_INSNS values are costs relative to an add instruction.  */
854 static const
855 struct processor_costs k8_cost = {
856   COSTS_N_INSNS (1),			/* cost of an add instruction */
857   COSTS_N_INSNS (2),			/* cost of a lea instruction */
858   COSTS_N_INSNS (1),			/* variable shift costs */
859   COSTS_N_INSNS (1),			/* constant shift costs */
860   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
861    COSTS_N_INSNS (4),			/*				 HI */
862    COSTS_N_INSNS (3),			/*				 SI */
863    COSTS_N_INSNS (4),			/*				 DI */
864    COSTS_N_INSNS (5)},			/*			      other */
865   0,					/* cost of multiply per each bit set */
866   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
867    COSTS_N_INSNS (26),			/*			    HI */
868    COSTS_N_INSNS (42),			/*			    SI */
869    COSTS_N_INSNS (74),			/*			    DI */
870    COSTS_N_INSNS (74)},			/*			    other */
871   COSTS_N_INSNS (1),			/* cost of movsx */
872   COSTS_N_INSNS (1),			/* cost of movzx */
873   8,					/* "large" insn */
874   9,					/* MOVE_RATIO */
875 
876   /* All move costs are relative to integer->integer move times 2 and thus
877      they are latency*2.  */
878   4,				     /* cost for loading QImode using movzbl */
879   {3, 4, 3},				/* cost of loading integer registers
880 					   in QImode, HImode and SImode.
881 					   Relative to reg-reg move (2).  */
882   {3, 4, 3},				/* cost of storing integer registers */
883   4,					/* cost of reg,reg fld/fst */
884   {4, 4, 12},				/* cost of loading fp registers
885 					   in SFmode, DFmode and XFmode */
886   {6, 6, 8},				/* cost of storing fp registers
887 					   in SFmode, DFmode and XFmode */
888   2,					/* cost of moving MMX register */
889   {3, 3},				/* cost of loading MMX registers
890 					   in SImode and DImode */
891   {4, 4},				/* cost of storing MMX registers
892 					   in SImode and DImode */
893   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
894   {4, 3, 12, 12, 24},			/* cost of loading SSE registers
895 					   in 32,64,128,256 and 512-bit */
896   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
897   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
898 					   in 32,64,128,256 and 512-bit */
899   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
900   5, 5,					/* SSE->integer and integer->SSE moves */
901   4, 4,					/* Gather load static, per_elt.  */
902   4, 4,					/* Gather store static, per_elt.  */
903   64,					/* size of l1 cache.  */
904   512,					/* size of l2 cache.  */
905   64,					/* size of prefetch block */
906   /* New AMD processors never drop prefetches; if they cannot be performed
907      immediately, they are queued.  We set number of simultaneous prefetches
908      to a large constant to reflect this (it probably is not a good idea not
909      to limit number of prefetches at all, as their execution also takes some
910      time).  */
911   100,					/* number of parallel prefetches */
912   3,					/* Branch cost */
913   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
914   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
915   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
916   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
917   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
918   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
919 
920   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
921   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
922   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
923   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
924   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
925   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
926   /* 11-16 (presumably the DIVSS latency range in cycles; the entry below
     uses the upper bound -- TODO confirm).  */
927   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
928   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
929   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
930   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
931   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
932   k8_memcpy,
933   k8_memset,
934   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
935   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
936   "16:8:8",				/* Loop alignment.  */
937   "16:8:8",				/* Jump alignment.  */
938   "0:0:8",				/* Label alignment.  */
939   "16",					/* Func alignment.  */
940 };
941 
942 /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
943    very small blocks it is better to use a loop.  For large blocks, libcall
944    can do non-temporal accesses and beat inline considerably.
   The first memcpy entry goes loop (bound 6), unrolled_loop (bound 14),
   then rep_prefix_4_byte for the -1 catch-all; the second goes loop
   (bound 16), rep_prefix_8_byte (bound 8192), then libcall.
   NOTE(review): presumably the entries are the 32-bit and 64-bit variants
   and the bounds are block sizes in bytes -- confirm against the
   declaration of stringop_algs.  */
945 static stringop_algs amdfam10_memcpy[2] = {
946   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
947              {-1, rep_prefix_4_byte, false}}},
948   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
949              {-1, libcall, false}}}};
/* Inline memset strategy table used by amdfam10_cost.  The first entry goes
   loop (bound 8), unrolled_loop (bound 24), rep_prefix_4_byte (bound 2048),
   then libcall for the -1 catch-all; the second goes unrolled_loop
   (bound 48), rep_prefix_8_byte (bound 8192), then libcall.
   NOTE(review): presumably the entries are the 32-bit and 64-bit variants
   and the bounds are block sizes in bytes -- confirm against the
   declaration of stringop_algs.  */
950 static stringop_algs amdfam10_memset[2] = {
951   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
952              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
953   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
954              {-1, libcall, false}}}};
/* Cost table for AMDFAM10 tuning.  This is a positional initializer of
   struct processor_costs: entries must stay in the field order of that
   struct, and the trailing comment on each entry names the field it
   fills.  COSTS_N_INSNS values are costs relative to an add instruction.
   NOTE(review): unlike k6_cost, athlon_cost and k8_cost, this table is
   declared without `static const'; presumably something outside this file
   depends on that -- confirm before changing the qualifiers.  */
955 struct processor_costs amdfam10_cost = {
956   COSTS_N_INSNS (1),			/* cost of an add instruction */
957   COSTS_N_INSNS (2),			/* cost of a lea instruction */
958   COSTS_N_INSNS (1),			/* variable shift costs */
959   COSTS_N_INSNS (1),			/* constant shift costs */
960   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
961    COSTS_N_INSNS (4),			/*				 HI */
962    COSTS_N_INSNS (3),			/*				 SI */
963    COSTS_N_INSNS (4),			/*				 DI */
964    COSTS_N_INSNS (5)},			/*			      other */
965   0,					/* cost of multiply per each bit set */
966   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
967    COSTS_N_INSNS (35),			/*			    HI */
968    COSTS_N_INSNS (51),			/*			    SI */
969    COSTS_N_INSNS (83),			/*			    DI */
970    COSTS_N_INSNS (83)},			/*			    other */
971   COSTS_N_INSNS (1),			/* cost of movsx */
972   COSTS_N_INSNS (1),			/* cost of movzx */
973   8,					/* "large" insn */
974   9,					/* MOVE_RATIO */
975 
976   /* All move costs are relative to integer->integer move times 2 and thus
977      they are latency*2.  */
978   4,				     /* cost for loading QImode using movzbl */
979   {3, 4, 3},				/* cost of loading integer registers
980 					   in QImode, HImode and SImode.
981 					   Relative to reg-reg move (2).  */
982   {3, 4, 3},				/* cost of storing integer registers */
983   4,					/* cost of reg,reg fld/fst */
984   {4, 4, 12},				/* cost of loading fp registers
985 					   in SFmode, DFmode and XFmode */
986   {6, 6, 8},				/* cost of storing fp registers
987 					   in SFmode, DFmode and XFmode */
988   2,					/* cost of moving MMX register */
989   {3, 3},				/* cost of loading MMX registers
990 					   in SImode and DImode */
991   {4, 4},				/* cost of storing MMX registers
992 					   in SImode and DImode */
993   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
994   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
995 					   in 32,64,128,256 and 512-bit */
996   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  Note the
					   256-bit entry (7) differs from the
					   aligned-load entry (6) above.  */
997   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
998 					   in 32,64,128,256 and 512-bit */
999   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
1000   3, 3,					/* SSE->integer and integer->SSE moves */
1001   					/* On K8:
1002   					    MOVD reg64, xmmreg Double FSTORE 4
1003 					    MOVD reg32, xmmreg Double FSTORE 4
1004 					   On AMDFAM10:
1005 					    MOVD reg64, xmmreg Double FADD 3
1006 							       1/1  1/1
1007 					    MOVD reg32, xmmreg Double FADD 3
1008 							       1/1  1/1 */
1009   4, 4,					/* Gather load static, per_elt.  */
1010   4, 4,					/* Gather store static, per_elt.  */
1011   64,					/* size of l1 cache.  */
1012   512,					/* size of l2 cache.  */
1013   64,					/* size of prefetch block */
1014   /* New AMD processors never drop prefetches; if they cannot be performed
1015      immediately, they are queued.  We set number of simultaneous prefetches
1016      to a large constant to reflect this (it probably is not a good idea not
1017      to limit number of prefetches at all, as their execution also takes some
1018      time).  */
1019   100,					/* number of parallel prefetches */
1020   2,					/* Branch cost */
1021   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1022   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1023   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1024   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1025   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1026   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1027 
1028   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1029   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1030   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1031   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1032   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
1033   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
1034   /* 11-16 (presumably the DIVSS latency range in cycles; the entry below
     uses the upper bound -- TODO confirm).  */
1035   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
1036   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
1037   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1038   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
1039   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1040   amdfam10_memcpy,
1041   amdfam10_memset,
1042   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1043   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1044   "32:25:8",				/* Loop alignment.  */
1045   "32:8:8",				/* Jump alignment.  */
1046   "0:0:8",				/* Label alignment.  */
1047   "32",					/* Func alignment.  */
1048 };
1049 
1050 /*  BDVER has an optimized REP instruction for medium sized blocks, but for
1051     very small blocks it is better to use a loop.  For large blocks, libcall
1052     can do non-temporal accesses and beat inline considerably.
    The first memcpy entry goes loop (bound 6), unrolled_loop (bound 14),
    then rep_prefix_4_byte for the -1 catch-all; the second goes loop
    (bound 16), rep_prefix_8_byte (bound 8192), then libcall.
    NOTE(review): presumably the entries are the 32-bit and 64-bit variants
    and the bounds are block sizes in bytes -- confirm against the
    declaration of stringop_algs.  */
1053 static stringop_algs bdver_memcpy[2] = {
1054   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1055              {-1, rep_prefix_4_byte, false}}},
1056   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1057              {-1, libcall, false}}}};
/* Inline memset strategy table used by bdver_cost.  The first entry goes
   loop (bound 8), unrolled_loop (bound 24), rep_prefix_4_byte (bound 2048),
   then libcall for the -1 catch-all; the second goes unrolled_loop
   (bound 48), rep_prefix_8_byte (bound 8192), then libcall.
   NOTE(review): presumably the entries are the 32-bit and 64-bit variants
   and the bounds are block sizes in bytes -- confirm against the
   declaration of stringop_algs.  */
1058 static stringop_algs bdver_memset[2] = {
1059   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1060              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1061   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1062              {-1, libcall, false}}}};
1063 
/* Cost table for BDVER tuning.  This is a positional initializer of
   struct processor_costs: entries must stay in the field order of that
   struct, and the trailing comment on each entry names the field it
   fills.  COSTS_N_INSNS values are costs relative to an add instruction.
   NOTE(review): declared `const' but not `static', unlike k6_cost,
   athlon_cost and k8_cost -- confirm whether that is intentional.  */
1064 const struct processor_costs bdver_cost = {
1065   COSTS_N_INSNS (1),			/* cost of an add instruction */
1066   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1067   COSTS_N_INSNS (1),			/* variable shift costs */
1068   COSTS_N_INSNS (1),			/* constant shift costs */
1069   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1070    COSTS_N_INSNS (4),			/*				 HI */
1071    COSTS_N_INSNS (4),			/*				 SI */
1072    COSTS_N_INSNS (6),			/*				 DI */
1073    COSTS_N_INSNS (6)},			/*			      other */
1074   0,					/* cost of multiply per each bit set */
1075   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1076    COSTS_N_INSNS (35),			/*			    HI */
1077    COSTS_N_INSNS (51),			/*			    SI */
1078    COSTS_N_INSNS (83),			/*			    DI */
1079    COSTS_N_INSNS (83)},			/*			    other */
1080   COSTS_N_INSNS (1),			/* cost of movsx */
1081   COSTS_N_INSNS (1),			/* cost of movzx */
1082   8,					/* "large" insn */
1083   9,					/* MOVE_RATIO */
1084 
1085   /* All move costs are relative to integer->integer move times 2 and thus
1086      they are latency*2.  */
1087   8,				     /* cost for loading QImode using movzbl */
1088   {8, 8, 8},				/* cost of loading integer registers
1089 					   in QImode, HImode and SImode.
1090 					   Relative to reg-reg move (2).  */
1091   {8, 8, 8},				/* cost of storing integer registers */
1092   4,					/* cost of reg,reg fld/fst */
1093   {12, 12, 28},				/* cost of loading fp registers
1094 					   in SFmode, DFmode and XFmode */
1095   {10, 10, 18},				/* cost of storing fp registers
1096 					   in SFmode, DFmode and XFmode */
1097   4,					/* cost of moving MMX register */
1098   {12, 12},				/* cost of loading MMX registers
1099 					   in SImode and DImode */
1100   {10, 10},				/* cost of storing MMX registers
1101 					   in SImode and DImode */
1102   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1103   {12, 12, 10, 40, 60},			/* cost of loading SSE registers
1104 					   in 32,64,128,256 and 512-bit */
1105   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
1106   {10, 10, 10, 40, 60},			/* cost of storing SSE registers
1107 					   in 32,64,128,256 and 512-bit */
1108   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
1109   16, 20,				/* SSE->integer and integer->SSE moves */
1110   12, 12,				/* Gather load static, per_elt.  */
1111   10, 10,				/* Gather store static, per_elt.  */
1112   16,					/* size of l1 cache.  */
1113   2048,					/* size of l2 cache.  */
1114   64,					/* size of prefetch block */
1115   /* New AMD processors never drop prefetches; if they cannot be performed
1116      immediately, they are queued.  We set number of simultaneous prefetches
1117      to a large constant to reflect this (it probably is not a good idea not
1118      to limit number of prefetches at all, as their execution also takes some
1119      time).  */
1120   100,					/* number of parallel prefetches */
1121   2,					/* Branch cost */
1122   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1123   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1124   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1125   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1126   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1127   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1128 
1129   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1130   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1131   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1132   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1133   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1134   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1135   /* 9-24 (presumably the DIVSS latency range in cycles; the entry below
     uses the upper bound -- TODO confirm).  */
1136   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1137   /* 9-27 (presumably the DIVSD latency range in cycles; the entry below
     uses the upper bound -- TODO confirm).  */
1138   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1139   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1140   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1141   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1142   bdver_memcpy,
1143   bdver_memset,
1144   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1145   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1146   "16:11:8",				/* Loop alignment.  */
1147   "16:8:8",				/* Jump alignment.  */
1148   "0:0:8",				/* Label alignment.  */
1149   "11",					/* Func alignment.  */
1150 };
1151 
1152 
1153 /*  ZNVER1 has an optimized REP instruction for medium sized blocks, but for
1154     very small blocks it is better to use a loop.  For large blocks, libcall
1155     can do non-temporal accesses and beat inline considerably.
    The first memcpy entry goes loop (bound 6), unrolled_loop (bound 14),
    then rep_prefix_4_byte for the -1 catch-all; the second goes loop
    (bound 16), rep_prefix_8_byte (bound 8192), then libcall.
    NOTE(review): presumably the entries are the 32-bit and 64-bit variants
    and the bounds are block sizes in bytes -- confirm against the
    declaration of stringop_algs.  */
1156 static stringop_algs znver1_memcpy[2] = {
1157   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1158 	     {-1, rep_prefix_4_byte, false}}},
1159   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1160 	     {-1, libcall, false}}}};
/* Inline memset strategy table used by znver1_cost.  The first entry goes
   loop (bound 8), unrolled_loop (bound 24), rep_prefix_4_byte (bound 2048),
   then libcall for the -1 catch-all; the second goes unrolled_loop
   (bound 48), rep_prefix_8_byte (bound 8192), then libcall.
   NOTE(review): presumably the entries are the 32-bit and 64-bit variants
   and the bounds are block sizes in bytes -- confirm against the
   declaration of stringop_algs.  */
1161 static stringop_algs znver1_memset[2] = {
1162   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1163 	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1164   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1165 	     {-1, libcall, false}}}};
/* Cost table for ZNVER1 tuning.  This is a positional initializer of
   struct processor_costs: entries must stay in the field order of that
   struct, and the trailing comment on each entry names the field it
   fills.  COSTS_N_INSNS values are costs relative to an add instruction.
   NOTE(review): declared without `static const', unlike k6_cost,
   athlon_cost and k8_cost -- confirm whether that is intentional.  */
1166 struct processor_costs znver1_cost = {
1167   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1168   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1169   COSTS_N_INSNS (1),			/* variable shift costs.  */
1170   COSTS_N_INSNS (1),			/* constant shift costs.  */
1171   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1172    COSTS_N_INSNS (3),			/*				 HI.  */
1173    COSTS_N_INSNS (3),			/*				 SI.  */
1174    COSTS_N_INSNS (3),			/*				 DI.  */
1175    COSTS_N_INSNS (3)},			/*			      other.  */
1176   0,					/* cost of multiply per each bit
1177 					    set.  */
1178    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1179       bound.  */
1180   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1181    COSTS_N_INSNS (22),			/*			    HI.  */
1182    COSTS_N_INSNS (30),			/*			    SI.  */
1183    COSTS_N_INSNS (45),			/*			    DI.  */
1184    COSTS_N_INSNS (45)},			/*			    other.  */
1185   COSTS_N_INSNS (1),			/* cost of movsx.  */
1186   COSTS_N_INSNS (1),			/* cost of movzx.  */
1187   8,					/* "large" insn.  */
1188   9,					/* MOVE_RATIO.  */
1189 
1190   /* All move costs are relative to integer->integer move times 2 and thus
1191      they are latency*2.  */
1192 
1193   /* reg-reg moves are done by renaming and thus they are even cheaper than
1194      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1195      to doubles of latencies, we do not model this correctly.  It does not
1196      seem to make practical difference to bump prices up even more.  */
1197   6,					/* cost for loading QImode using
1198 					   movzbl.  */
1199   {6, 6, 6},				/* cost of loading integer registers
1200 					   in QImode, HImode and SImode.
1201 					   Relative to reg-reg move (2).  */
1202   {8, 8, 8},				/* cost of storing integer
1203 					   registers.  */
1204   2,					/* cost of reg,reg fld/fst.  */
1205   {6, 6, 16},				/* cost of loading fp registers
1206 					   in SFmode, DFmode and XFmode.  */
1207   {8, 8, 16},				/* cost of storing fp registers
1208 					   in SFmode, DFmode and XFmode.  */
1209   2,					/* cost of moving MMX register.  */
1210   {6, 6},				/* cost of loading MMX registers
1211 					   in SImode and DImode.  */
1212   {8, 8},				/* cost of storing MMX registers
1213 					   in SImode and DImode.  */
1214   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1215   {6, 6, 6, 12, 24},			/* cost of loading SSE registers
1216 					   in 32,64,128,256 and 512-bit.  */
1217   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
1218   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
1219 					   in 32,64,128,256 and 512-bit.  */
1220   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
1221   6, 6,					/* SSE->integer and integer->SSE moves.  */
1222   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1223      throughput 12.  Approx 9 uops do not depend on vector size and every load
1224      is 7 uops.
     NOTE(review): the same mnemonic is listed twice with different figures;
     presumably the second refers to a different vector width or to
     VGATHERDPS -- confirm against the vendor documentation.  */
1225   18, 8,				/* Gather load static, per_elt.  */
1226   18, 10,				/* Gather store static, per_elt.  */
1227   32,					/* size of l1 cache.  */
1228   512,					/* size of l2 cache.  */
1229   64,					/* size of prefetch block.  */
1230   /* New AMD processors never drop prefetches; if they cannot be performed
1231      immediately, they are queued.  We set number of simultaneous prefetches
1232      to a large constant to reflect this (it probably is not a good idea not
1233      to limit number of prefetches at all, as their execution also takes some
1234      time).  */
1235   100,					/* number of parallel prefetches.  */
1236   3,					/* Branch cost.  */
1237   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1238   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1239   /* Latency of fdiv is 8-15.  */
1240   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1241   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1242   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1243   /* Latency of fsqrt is 4-10.  */
1244   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1245 
1246   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1247   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1248   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1249   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1250   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1251   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1252   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1253   /* 9-13 (presumably the DIVSD latency range in cycles; the entry below
     uses the upper bound -- TODO confirm).  */
1254   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1255   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1256   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1257   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1258      and it can execute 2 integer additions and 2 multiplications thus
1259      reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks suggest
1260      that 4 works better than 6 probably due to register pressure.
1261 
1262      Integer vector operations are taken by FP unit and execute 3 vector
1263      plus/minus operations per cycle but only one multiply.  This is adjusted
1264      in ix86_reassociation_width.  */
1265   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1266   znver1_memcpy,
1267   znver1_memset,
1268   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1269   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1270   "16",					/* Loop alignment.  */
1271   "16",					/* Jump alignment.  */
1272   "0:0:8",				/* Label alignment.  */
1273   "16",					/* Func alignment.  */
1274 };
1275 
1276 /*  ZNVER2 has an optimized REP instruction for medium sized blocks, but for
1277     very small blocks it is better to use a loop.  For large blocks, libcall
1278     can do non-temporal accesses and beat inline considerably.
    The first memcpy entry goes loop (bound 6), unrolled_loop (bound 14),
    then rep_prefix_4_byte for the -1 catch-all; the second goes loop
    (bound 16), rep_prefix_4_byte (bound 64), then libcall.
    NOTE(review): presumably the entries are the 32-bit and 64-bit variants
    and the bounds are block sizes in bytes -- confirm against the
    declaration of stringop_algs.  */
1279 static stringop_algs znver2_memcpy[2] = {
1280   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1281 	     {-1, rep_prefix_4_byte, false}}},
1282   {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
1283 	     {-1, libcall, false}}}};
/* Inline memset strategy table for ZNVER2.  The first entry goes loop
   (bound 8), unrolled_loop (bound 24), rep_prefix_4_byte (bound 2048), then
   libcall for the -1 catch-all; the second goes rep_prefix_4_byte
   (bound 24), rep_prefix_8_byte (bound 128), then libcall.
   NOTE(review): presumably the entries are the 32-bit and 64-bit variants
   and the bounds are block sizes in bytes -- confirm against the
   declaration of stringop_algs.  */
1284 static stringop_algs znver2_memset[2] = {
1285   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1286 	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1287   {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
1288 	     {-1, libcall, false}}}};
1289 
1290 struct processor_costs znver2_cost = {	/* Operation costs for ZNVER2 tuning; entries are positional, keep their order.  */
1291   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1292   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1293   COSTS_N_INSNS (1),			/* variable shift costs.  */
1294   COSTS_N_INSNS (1),			/* constant shift costs.  */
1295   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1296    COSTS_N_INSNS (3),			/* 				 HI.  */
1297    COSTS_N_INSNS (3),			/*				 SI.  */
1298    COSTS_N_INSNS (3),			/*				 DI.  */
1299    COSTS_N_INSNS (3)},			/*			other.  */
1300   0,					/* cost of multiply per each bit
1301 					   set.  */
1302    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1303       bound.  */
1304   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1305    COSTS_N_INSNS (22),			/* 			    HI.  */
1306    COSTS_N_INSNS (30),			/*			    SI.  */
1307    COSTS_N_INSNS (45),			/*			    DI.  */
1308    COSTS_N_INSNS (45)},			/*			    other.  */
1309   COSTS_N_INSNS (1),			/* cost of movsx.  */
1310   COSTS_N_INSNS (1),			/* cost of movzx.  */
1311   8,					/* "large" insn.  */
1312   9,					/* MOVE_RATIO.  */
1313 
1314   /* All move costs are relative to integer->integer move times 2 and thus
1315      they are latency*2.  */
1316 
1317   /* reg-reg moves are done by renaming and thus they are even cheaper than
1318      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1319      to doubles of latencies, we do not model this correctly.  It does not
1320      seem to make practical difference to bump prices up even more.  */
1321   6,					/* cost for loading QImode using
1322 					   movzbl.  */
1323   {6, 6, 6},				/* cost of loading integer registers
1324 					   in QImode, HImode and SImode.
1325 					   Relative to reg-reg move (2).  */
1326   {8, 8, 8},				/* cost of storing integer
1327 					   registers.  */
1328   2,					/* cost of reg,reg fld/fst.  */
1329   {6, 6, 16},				/* cost of loading fp registers
1330 					   in SFmode, DFmode and XFmode.  */
1331   {8, 8, 16},				/* cost of storing fp registers
1332 					   in SFmode, DFmode and XFmode.  */
1333   2,					/* cost of moving MMX register.  */
1334   {6, 6},				/* cost of loading MMX registers
1335 					   in SImode and DImode.  */
1336   {8, 8},				/* cost of storing MMX registers
1337 					   in SImode and DImode.  */
1338   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1339 					   register.  */
1340   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1341 					   in 32,64,128,256 and 512-bit.  */
1342   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
1343   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1344 					   in 32,64,128,256 and 512-bit.  */
1345   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1346   6, 6,					/* SSE->integer and integer->SSE
1347 					   moves.  */
1348   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1349      throughput 12.  Approx 9 uops do not depend on vector size and every load
1350      is 7 uops.  NOTE(review): VGATHERDPD is named twice above; the second one is presumably VGATHERDPS -- confirm.  */
1351   18, 8,				/* Gather load static, per_elt.  */
1352   18, 10,				/* Gather store static, per_elt.  */
1353   32,					/* size of l1 cache.  */
1354   512,					/* size of l2 cache.  */
1355   64,					/* size of prefetch block.  */
1356   /* New AMD processors never drop prefetches; if they cannot be performed
1357      immediately, they are queued.  We set number of simultaneous prefetches
1358      to a large constant to reflect this (it probably is not a good idea not
1359      to limit number of prefetches at all, as their execution also takes some
1360      time).  */
1361   100,					/* number of parallel prefetches.  */
1362   3,					/* Branch cost.  */
1363   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1364   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1365   /* Latency of fdiv is 8-15.  */
1366   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1367   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1368   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1369   /* Latency of fsqrt is 4-10.  */
1370   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1371 
1372   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1373   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1374   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1375   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1376   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1377   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1378   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1379   /* 9-13.  */
1380   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1381   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1382   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1383   /* Zen can execute 4 integer operations per cycle.  FP operations
1384      take 3 cycles and it can execute 2 integer additions and 2
1385      multiplications thus reassociation may make sense up to width of 6.
1386      SPEC2k6 benchmarks suggest
1387      that 4 works better than 6 probably due to register pressure.
1388 
1389      Integer vector operations are taken by FP unit and execute 3 vector
1390      plus/minus operations per cycle but only one multiply.  This is adjusted
1391      in ix86_reassociation_width.  */
1392   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1393   znver2_memcpy,
1394   znver2_memset,
1395   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1396   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1397   "16",					/* Loop alignment.  */
1398   "16",					/* Jump alignment.  */
1399   "0:0:8",				/* Label alignment.  */
1400   "16",					/* Func alignment.  */
1401 };
1402 
1403 /* skylake_cost should produce code tuned for the Skylake family of CPUs.  */
1404 static stringop_algs skylake_memcpy[2] =   {	/* Skylake memcpy strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1405   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1406   {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1407              {-1, libcall, false}}}};
1408 
1409 static stringop_algs skylake_memset[2] = {	/* Skylake memset strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1410   {libcall, {{6, loop_1_byte, true},
1411              {24, loop, true},
1412              {8192, rep_prefix_4_byte, true},
1413              {-1, libcall, false}}},
1414   {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1415              {-1, libcall, false}}}};
1416 
1417 static const
1418 struct processor_costs skylake_cost = {	/* Operation costs for Skylake tuning; entries are positional, keep their order.  */
1419   COSTS_N_INSNS (1),			/* cost of an add instruction */
1420   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
1421   COSTS_N_INSNS (1),			/* variable shift costs */
1422   COSTS_N_INSNS (1),			/* constant shift costs */
1423   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1424    COSTS_N_INSNS (4),			/*				 HI */
1425    COSTS_N_INSNS (3),			/*				 SI */
1426    COSTS_N_INSNS (3),			/*				 DI */
1427    COSTS_N_INSNS (3)},			/*			      other */
1428   0,					/* cost of multiply per each bit set */
1429   /* Expanding div/mod currently doesn't consider parallelism. So the cost
1430      model is not realistic. We compensate by increasing the latencies a bit.  */
1431   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
1432    COSTS_N_INSNS (11),			/*			    HI */
1433    COSTS_N_INSNS (14),			/*			    SI */
1434    COSTS_N_INSNS (76),			/*			    DI */
1435    COSTS_N_INSNS (76)},			/*			    other */
1436   COSTS_N_INSNS (1),			/* cost of movsx */
1437   COSTS_N_INSNS (0),			/* cost of movzx */
1438   8,					/* "large" insn */
1439   17,					/* MOVE_RATIO */
1440 
1441   6,				     /* cost for loading QImode using movzbl */
1442   {4, 4, 4},				/* cost of loading integer registers
1443 					   in QImode, HImode and SImode.
1444 					   Relative to reg-reg move (2).  */
1445   {6, 6, 3},				/* cost of storing integer registers */
1446   2,					/* cost of reg,reg fld/fst */
1447   {6, 6, 8},				/* cost of loading fp registers
1448 					   in SFmode, DFmode and XFmode */
1449   {6, 6, 10},				/* cost of storing fp registers
1450 					   in SFmode, DFmode and XFmode */
1451   2,					/* cost of moving MMX register */
1452   {6, 6},				/* cost of loading MMX registers
1453 					   in SImode and DImode */
1454   {6, 6},				/* cost of storing MMX registers
1455 					   in SImode and DImode */
1456   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1457   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1458 					   in 32,64,128,256 and 512-bit */
1459   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
1460   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
1461 					   in 32,64,128,256 and 512-bit */
1462   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1463   2, 2,					/* SSE->integer and integer->SSE moves */
1464   20, 8,				/* Gather load static, per_elt.  */
1465   22, 10,				/* Gather store static, per_elt.  */
1466   64,					/* size of l1 cache.  */
1467   512,					/* size of l2 cache.  */
1468   64,					/* size of prefetch block */
1469   6,					/* number of parallel prefetches */
1470   3,					/* Branch cost */
1471   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
1472   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1473   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1474   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1475   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1476   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
1477 
1478   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1479   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1480   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1481   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1482   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
1483   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
1484   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
1485   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
1486   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
1487   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
1488   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
1489   skylake_memcpy,
1490   skylake_memset,
1491   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1492   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1493   "16:11:8",				/* Loop alignment.  */
1494   "16:11:8",				/* Jump alignment.  */
1495   "0:0:8",				/* Label alignment.  */
1496   "16",					/* Func alignment.  */
1497 };
1498   /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
1499      very small blocks it is better to use a loop.  For large blocks, libcall can
1500      do non-temporal accesses and beat inline considerably.  */
1501 static stringop_algs btver1_memcpy[2] = {	/* BTVER1 memcpy strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1502   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1503              {-1, rep_prefix_4_byte, false}}},
1504   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1505              {-1, libcall, false}}}};
1506 static stringop_algs btver1_memset[2] = {	/* BTVER1 memset strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1507   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1508              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1509   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1510              {-1, libcall, false}}}};
1511 const struct processor_costs btver1_cost = {	/* Operation costs for BTVER1 tuning; entries are positional, keep their order.  */
1512   COSTS_N_INSNS (1),			/* cost of an add instruction */
1513   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1514   COSTS_N_INSNS (1),			/* variable shift costs */
1515   COSTS_N_INSNS (1),			/* constant shift costs */
1516   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1517    COSTS_N_INSNS (4),			/*				 HI */
1518    COSTS_N_INSNS (3),			/*				 SI */
1519    COSTS_N_INSNS (4),			/*				 DI */
1520    COSTS_N_INSNS (5)},			/*			      other */
1521   0,					/* cost of multiply per each bit set */
1522   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1523    COSTS_N_INSNS (35),			/*			    HI */
1524    COSTS_N_INSNS (51),			/*			    SI */
1525    COSTS_N_INSNS (83),			/*			    DI */
1526    COSTS_N_INSNS (83)},			/*			    other */
1527   COSTS_N_INSNS (1),			/* cost of movsx */
1528   COSTS_N_INSNS (1),			/* cost of movzx */
1529   8,					/* "large" insn */
1530   9,					/* MOVE_RATIO */
1531 
1532   /* All move costs are relative to integer->integer move times 2 and thus
1533      they are latency*2. */
1534   8,				     /* cost for loading QImode using movzbl */
1535   {6, 8, 6},				/* cost of loading integer registers
1536 					   in QImode, HImode and SImode.
1537 					   Relative to reg-reg move (2).  */
1538   {6, 8, 6},				/* cost of storing integer registers */
1539   4,					/* cost of reg,reg fld/fst */
1540   {12, 12, 28},				/* cost of loading fp registers
1541 					   in SFmode, DFmode and XFmode */
1542   {12, 12, 38},				/* cost of storing fp registers
1543 					   in SFmode, DFmode and XFmode */
1544   4,					/* cost of moving MMX register */
1545   {10, 10},				/* cost of loading MMX registers
1546 					   in SImode and DImode */
1547   {12, 12},				/* cost of storing MMX registers
1548 					   in SImode and DImode */
1549   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1550   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
1551 					   in 32,64,128,256 and 512-bit */
1552   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
1553   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
1554 					   in 32,64,128,256 and 512-bit */
1555   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
1556   14, 14,				/* SSE->integer and integer->SSE moves */
1557   10, 10,				/* Gather load static, per_elt.  */
1558   10, 10,				/* Gather store static, per_elt.  */
1559   32,					/* size of l1 cache.  */
1560   512,					/* size of l2 cache.  */
1561   64,					/* size of prefetch block */
1562   100,					/* number of parallel prefetches */
1563   2,					/* Branch cost */
1564   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1565   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1566   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1567   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1568   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1569   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1570 
1571   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1572   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1573   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
1574   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1575   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1576   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1577   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1578   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
1579   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
1580   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
1581   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1582   btver1_memcpy,
1583   btver1_memset,
1584   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1585   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1586   "16:11:8",				/* Loop alignment.  */
1587   "16:8:8",				/* Jump alignment.  */
1588   "0:0:8",				/* Label alignment.  */
1589   "11",					/* Func alignment.  */
1590 };
1591 
1592 static stringop_algs btver2_memcpy[2] = {	/* BTVER2 memcpy strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1593   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1594              {-1, rep_prefix_4_byte, false}}},
1595   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1596              {-1, libcall, false}}}};
1597 static stringop_algs btver2_memset[2] = {	/* BTVER2 memset strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1598   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1599              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1600   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1601              {-1, libcall, false}}}};
1602 const struct processor_costs btver2_cost = {	/* Operation costs for BTVER2 tuning; entries are positional, keep their order.  */
1603   COSTS_N_INSNS (1),			/* cost of an add instruction */
1604   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1605   COSTS_N_INSNS (1),			/* variable shift costs */
1606   COSTS_N_INSNS (1),			/* constant shift costs */
1607   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1608    COSTS_N_INSNS (4),			/*				 HI */
1609    COSTS_N_INSNS (3),			/*				 SI */
1610    COSTS_N_INSNS (4),			/*				 DI */
1611    COSTS_N_INSNS (5)},			/*			      other */
1612   0,					/* cost of multiply per each bit set */
1613   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1614    COSTS_N_INSNS (35),			/*			    HI */
1615    COSTS_N_INSNS (51),			/*			    SI */
1616    COSTS_N_INSNS (83),			/*			    DI */
1617    COSTS_N_INSNS (83)},			/*			    other */
1618   COSTS_N_INSNS (1),			/* cost of movsx */
1619   COSTS_N_INSNS (1),			/* cost of movzx */
1620   8,					/* "large" insn */
1621   9,					/* MOVE_RATIO */
1622 
1623   /* All move costs are relative to integer->integer move times 2 and thus
1624      they are latency*2. */
1625   8,				     /* cost for loading QImode using movzbl */
1626   {8, 8, 6},				/* cost of loading integer registers
1627 					   in QImode, HImode and SImode.
1628 					   Relative to reg-reg move (2).  */
1629   {8, 8, 6},				/* cost of storing integer registers */
1630   4,					/* cost of reg,reg fld/fst */
1631   {12, 12, 28},				/* cost of loading fp registers
1632 					   in SFmode, DFmode and XFmode */
1633   {12, 12, 38},				/* cost of storing fp registers
1634 					   in SFmode, DFmode and XFmode */
1635   4,					/* cost of moving MMX register */
1636   {10, 10},				/* cost of loading MMX registers
1637 					   in SImode and DImode */
1638   {12, 12},				/* cost of storing MMX registers
1639 					   in SImode and DImode */
1640   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1641   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
1642 					   in 32,64,128,256 and 512-bit */
1643   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
1644   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
1645 					   in 32,64,128,256 and 512-bit */
1646   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
1647   14, 14,				/* SSE->integer and integer->SSE moves */
1648   10, 10,				/* Gather load static, per_elt.  */
1649   10, 10,				/* Gather store static, per_elt.  */
1650   32,					/* size of l1 cache.  */
1651   2048,					/* size of l2 cache.  */
1652   64,					/* size of prefetch block */
1653   100,					/* number of parallel prefetches */
1654   2,					/* Branch cost */
1655   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1656   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1657   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1658   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1659   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1660   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1661 
1662   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1663   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1664   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
1665   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1666   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1667   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1668   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1669   COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
1670   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
1671   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
1672   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1673   btver2_memcpy,
1674   btver2_memset,
1675   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1676   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1677   "16:11:8",				/* Loop alignment.  */
1678   "16:8:8",				/* Jump alignment.  */
1679   "0:0:8",				/* Label alignment.  */
1680   "11",					/* Func alignment.  */
1681 };
1682 
1683 static stringop_algs pentium4_memcpy[2] = {	/* Pentium 4 memcpy strategy table; second row is the libcall-only DUMMY_STRINGOP_ALGS placeholder.  */
1684   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1685   DUMMY_STRINGOP_ALGS};
1686 static stringop_algs pentium4_memset[2] = {	/* Pentium 4 memset strategy table; second row is the libcall-only DUMMY_STRINGOP_ALGS placeholder.  */
1687   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1688              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1689   DUMMY_STRINGOP_ALGS};
1690 
1691 static const
1692 struct processor_costs pentium4_cost = {	/* Operation costs for Pentium 4 tuning; entries are positional, keep their order.  */
1693   COSTS_N_INSNS (1),			/* cost of an add instruction */
1694   COSTS_N_INSNS (3),			/* cost of a lea instruction */
1695   COSTS_N_INSNS (4),			/* variable shift costs */
1696   COSTS_N_INSNS (4),			/* constant shift costs */
1697   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
1698    COSTS_N_INSNS (15),			/*				 HI */
1699    COSTS_N_INSNS (15),			/*				 SI */
1700    COSTS_N_INSNS (15),			/*				 DI */
1701    COSTS_N_INSNS (15)},			/*			      other */
1702   0,					/* cost of multiply per each bit set */
1703   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
1704    COSTS_N_INSNS (56),			/*			    HI */
1705    COSTS_N_INSNS (56),			/*			    SI */
1706    COSTS_N_INSNS (56),			/*			    DI */
1707    COSTS_N_INSNS (56)},			/*			    other */
1708   COSTS_N_INSNS (1),			/* cost of movsx */
1709   COSTS_N_INSNS (1),			/* cost of movzx */
1710   16,					/* "large" insn */
1711   6,					/* MOVE_RATIO */
1712 
1713   /* All move costs are relative to integer->integer move times 2 and thus
1714      they are latency*2. */
1715   5,				     /* cost for loading QImode using movzbl */
1716   {4, 5, 4},				/* cost of loading integer registers
1717 					   in QImode, HImode and SImode.
1718 					   Relative to reg-reg move (2).  */
1719   {2, 3, 2},				/* cost of storing integer registers */
1720   12,					/* cost of reg,reg fld/fst */
1721   {14, 14, 14},				/* cost of loading fp registers
1722 					   in SFmode, DFmode and XFmode */
1723   {14, 14, 14},				/* cost of storing fp registers
1724 					   in SFmode, DFmode and XFmode */
1725   12,					/* cost of moving MMX register */
1726   {16, 16},				/* cost of loading MMX registers
1727 					   in SImode and DImode */
1728   {16, 16},				/* cost of storing MMX registers
1729 					   in SImode and DImode */
1730   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
1731   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
1732 					   in 32,64,128,256 and 512-bit */
1733   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
1734   {16, 16, 16, 32, 64},			/* cost of storing SSE registers
1735 					   in 32,64,128,256 and 512-bit */
1736   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
1737   20, 12,				/* SSE->integer and integer->SSE moves */
1738   16, 16,				/* Gather load static, per_elt.  */
1739   16, 16,				/* Gather store static, per_elt.  */
1740   8,					/* size of l1 cache.  */
1741   256,					/* size of l2 cache.  */
1742   64,					/* size of prefetch block */
1743   6,					/* number of parallel prefetches */
1744   2,					/* Branch cost */
1745   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1746   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
1747   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
1748   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1749   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1750   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
1751 
1752   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1753   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1754   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1755   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1756   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1757   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1758   COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
1759   COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
1760   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
1761   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
1762   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1763   pentium4_memcpy,
1764   pentium4_memset,
1765   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1766   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1767   NULL,					/* Loop alignment.  */
1768   NULL,					/* Jump alignment.  */
1769   NULL,					/* Label alignment.  */
1770   NULL,					/* Func alignment.  */
1771 };
1772 
1773 static stringop_algs nocona_memcpy[2] = {	/* Nocona memcpy strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1774   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1775   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1776              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1777 
1778 static stringop_algs nocona_memset[2] = {	/* Nocona memset strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1779   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1780              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1781   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1782              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1783 
1784 static const
1785 struct processor_costs nocona_cost = {	/* Operation costs for Nocona tuning; entries are positional, keep their order.  */
1786   COSTS_N_INSNS (1),			/* cost of an add instruction */
1787   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1788   COSTS_N_INSNS (1),			/* variable shift costs */
1789   COSTS_N_INSNS (1),			/* constant shift costs */
1790   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
1791    COSTS_N_INSNS (10),			/*				 HI */
1792    COSTS_N_INSNS (10),			/*				 SI */
1793    COSTS_N_INSNS (10),			/*				 DI */
1794    COSTS_N_INSNS (10)},			/*			      other */
1795   0,					/* cost of multiply per each bit set */
1796   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
1797    COSTS_N_INSNS (66),			/*			    HI */
1798    COSTS_N_INSNS (66),			/*			    SI */
1799    COSTS_N_INSNS (66),			/*			    DI */
1800    COSTS_N_INSNS (66)},			/*			    other */
1801   COSTS_N_INSNS (1),			/* cost of movsx */
1802   COSTS_N_INSNS (1),			/* cost of movzx */
1803   16,					/* "large" insn */
1804   17,					/* MOVE_RATIO */
1805 
1806   /* All move costs are relative to integer->integer move times 2 and thus
1807      they are latency*2. */
1808   4,				     /* cost for loading QImode using movzbl */
1809   {4, 4, 4},				/* cost of loading integer registers
1810 					   in QImode, HImode and SImode.
1811 					   Relative to reg-reg move (2).  */
1812   {4, 4, 4},				/* cost of storing integer registers */
1813   12,					/* cost of reg,reg fld/fst */
1814   {14, 14, 14},				/* cost of loading fp registers
1815 					   in SFmode, DFmode and XFmode */
1816   {14, 14, 14},				/* cost of storing fp registers
1817 					   in SFmode, DFmode and XFmode */
1818   14,					/* cost of moving MMX register */
1819   {12, 12},				/* cost of loading MMX registers
1820 					   in SImode and DImode */
1821   {12, 12},				/* cost of storing MMX registers
1822 					   in SImode and DImode */
1823   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
1824   {12, 12, 12, 24, 48},			/* cost of loading SSE registers
1825 					   in 32,64,128,256 and 512-bit */
1826   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
1827   {12, 12, 12, 24, 48},			/* cost of storing SSE registers
1828 					   in 32,64,128,256 and 512-bit */
1829   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
1830   20, 12,				/* SSE->integer and integer->SSE moves */
1831   12, 12,				/* Gather load static, per_elt.  */
1832   12, 12,				/* Gather store static, per_elt.  */
1833   8,					/* size of l1 cache.  */
1834   1024,					/* size of l2 cache.  */
1835   64,					/* size of prefetch block */
1836   8,					/* number of parallel prefetches */
1837   1,					/* Branch cost */
1838   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1839   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1840   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
1841   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
1842   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
1843   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
1844 
1845   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1846   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1847   COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
1848   COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
1849   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
1850   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
1851   COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
1852   COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
1853   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
1854   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
1855   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1856   nocona_memcpy,
1857   nocona_memset,
1858   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1859   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1860   NULL,					/* Loop alignment.  */
1861   NULL,					/* Jump alignment.  */
1862   NULL,					/* Label alignment.  */
1863   NULL,					/* Func alignment.  */
1864 };
1865 
1866 static stringop_algs atom_memcpy[2] = {	/* Atom memcpy strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1867   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1868   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1869              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1870 static stringop_algs atom_memset[2] = {	/* Atom memset strategy table; rows are presumably 32-bit then 64-bit -- confirm.  */
1871   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1872              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1873   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1874              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1875 static const
1876 struct processor_costs atom_cost = {	/* Operation costs for Atom tuning; entries are positional, keep their order.  */
1877   COSTS_N_INSNS (1),			/* cost of an add instruction */
1878   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1879   COSTS_N_INSNS (1),			/* variable shift costs */
1880   COSTS_N_INSNS (1),			/* constant shift costs */
1881   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1882    COSTS_N_INSNS (4),			/*				 HI */
1883    COSTS_N_INSNS (3),			/*				 SI */
1884    COSTS_N_INSNS (4),			/*				 DI */
1885    COSTS_N_INSNS (2)},			/*			      other */
1886   0,					/* cost of multiply per each bit set */
1887   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1888    COSTS_N_INSNS (26),			/*			    HI */
1889    COSTS_N_INSNS (42),			/*			    SI */
1890    COSTS_N_INSNS (74),			/*			    DI */
1891    COSTS_N_INSNS (74)},			/*			    other */
1892   COSTS_N_INSNS (1),			/* cost of movsx */
1893   COSTS_N_INSNS (1),			/* cost of movzx */
1894   8,					/* "large" insn */
1895   17,					/* MOVE_RATIO */
1896 
1897   /* All move costs are relative to integer->integer move times 2 and thus
1898      they are latency*2. */
1899   6,					/* cost for loading QImode using movzbl */
1900   {6, 6, 6},				/* cost of loading integer registers
1901 					   in QImode, HImode and SImode.
1902 					   Relative to reg-reg move (2).  */
1903   {6, 6, 6},				/* cost of storing integer registers */
1904   4,					/* cost of reg,reg fld/fst */
1905   {6, 6, 18},				/* cost of loading fp registers
1906 					   in SFmode, DFmode and XFmode */
1907   {14, 14, 24},				/* cost of storing fp registers
1908 					   in SFmode, DFmode and XFmode */
1909   2,					/* cost of moving MMX register */
1910   {8, 8},				/* cost of loading MMX registers
1911 					   in SImode and DImode */
1912   {10, 10},				/* cost of storing MMX registers
1913 					   in SImode and DImode */
1914   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1915   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
1916 					   in 32,64,128,256 and 512-bit */
1917   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
1918   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
1919 					   in 32,64,128,256 and 512-bit */
1920   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
1921   8, 6,					/* SSE->integer and integer->SSE moves */
1922   8, 8,					/* Gather load static, per_elt.  */
1923   8, 8,					/* Gather store static, per_elt.  */
1924   32,					/* size of l1 cache.  */
1925   256,					/* size of l2 cache.  */
1926   64,					/* size of prefetch block */
1927   6,					/* number of parallel prefetches */
1928   3,					/* Branch cost */
1929   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1930   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1931   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1932   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1933   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1934   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1935 
1936   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1937   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1938   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1939   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
1940   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1941   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1942   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
1943   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
1944   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
1945   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
1946   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
1947   atom_memcpy,
1948   atom_memset,
1949   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1950   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1951   "16",					/* Loop alignment.  */
1952   "16:8:8",				/* Jump alignment.  */
1953   "0:0:8",				/* Label alignment.  */
1954   "16",					/* Func alignment.  */
1955 };
1956 
1957 static stringop_algs slm_memcpy[2] = {	/* memcpy strategy table referenced by slm_cost; same contents as intel_memcpy */
1958   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
1959   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},	/* [1]; NOTE(review): presumably the 64-bit variant (uses rep_prefix_8_byte) -- confirm */
1960              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
1961 static stringop_algs slm_memset[2] = {	/* memset strategy table referenced by slm_cost; same contents as intel_memset */
1962   {libcall, {{8, loop, false}, {15, unrolled_loop, false},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
1963              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1964   {libcall, {{24, loop, false}, {32, unrolled_loop, false},	/* [1]; NOTE(review): presumably the 64-bit variant (uses rep_prefix_8_byte) -- confirm */
1965              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
1966 static const
1967 struct processor_costs slm_cost = {	/* positional initializer: entry order must match struct processor_costs */
1968   COSTS_N_INSNS (1),			/* cost of an add instruction */
1969   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction (one unit above an add) */
1970   COSTS_N_INSNS (1),			/* variable shift costs */
1971   COSTS_N_INSNS (1),			/* constant shift costs */
1972   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1973    COSTS_N_INSNS (3),			/*				 HI */
1974    COSTS_N_INSNS (3),			/*				 SI */
1975    COSTS_N_INSNS (4),			/*				 DI */
1976    COSTS_N_INSNS (2)},			/*			      other */
1977   0,					/* cost of multiply per each bit set */
1978   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1979    COSTS_N_INSNS (26),			/*			    HI */
1980    COSTS_N_INSNS (42),			/*			    SI */
1981    COSTS_N_INSNS (74),			/*			    DI */
1982    COSTS_N_INSNS (74)},			/*			    other */
1983   COSTS_N_INSNS (1),			/* cost of movsx */
1984   COSTS_N_INSNS (1),			/* cost of movzx */
1985   8,					/* "large" insn */
1986   17,					/* MOVE_RATIO */
1987 
1988   /* All move costs are relative to integer->integer move times 2 and thus
1989      they are latency*2. */
1990   8,					/* cost for loading QImode using movzbl */
1991   {8, 8, 8},				/* cost of loading integer registers
1992 					   in QImode, HImode and SImode.
1993 					   Relative to reg-reg move (2).  */
1994   {6, 6, 6},				/* cost of storing integer registers */
1995   2,					/* cost of reg,reg fld/fst */
1996   {8, 8, 18},				/* cost of loading fp registers
1997 					   in SFmode, DFmode and XFmode */
1998   {6, 6, 18},				/* cost of storing fp registers
1999 					   in SFmode, DFmode and XFmode */
2000   2,					/* cost of moving MMX register */
2001   {8, 8},				/* cost of loading MMX registers
2002 					   in SImode and DImode */
2003   {6, 6},				/* cost of storing MMX registers
2004 					   in SImode and DImode */
2005   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2006   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2007 					   in 32,64,128,256 and 512-bit */
2008   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2009   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2010 					   in 32,64,128,256 and 512-bit */
2011   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2012   8, 6,					/* SSE->integer and integer->SSE moves */
2013   8, 8,					/* Gather load static, per_elt.  */
2014   8, 8,					/* Gather store static, per_elt.  */
2015   32,					/* size of l1 cache.  */
2016   256,					/* size of l2 cache.  */
2017   64,					/* size of prefetch block */
2018   6,					/* number of parallel prefetches */
2019   3,					/* Branch cost */
2020   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2021   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2022   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2023   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2024   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2025   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2026 
2027   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2028   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2029   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2030   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2031   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2032   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2033   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
2034   COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
2035   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
2036   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
2037   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2038   slm_memcpy,				/* memcpy stringop_algs table defined above */
2039   slm_memset,				/* memset stringop_algs table defined above */
2040   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2041   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2042   "16",					/* Loop alignment.  */
2043   "16:8:8",				/* Jump alignment.  */
2044   "0:0:8",				/* Label alignment.  */
2045   "16",					/* Func alignment.  */
2046 };
2047 
2048 static stringop_algs intel_memcpy[2] = {	/* memcpy strategy table referenced by intel_cost; same contents as slm_memcpy */
2049   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
2050   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},	/* [1]; NOTE(review): presumably the 64-bit variant (uses rep_prefix_8_byte) -- confirm */
2051              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
2052 static stringop_algs intel_memset[2] = {	/* memset strategy table referenced by intel_cost; same contents as slm_memset */
2053   {libcall, {{8, loop, false}, {15, unrolled_loop, false},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
2054              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2055   {libcall, {{24, loop, false}, {32, unrolled_loop, false},	/* [1]; NOTE(review): presumably the 64-bit variant (uses rep_prefix_8_byte) -- confirm */
2056              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
2057 static const
2058 struct processor_costs intel_cost = {	/* positional initializer: entry order must match struct processor_costs */
2059   COSTS_N_INSNS (1),			/* cost of an add instruction */
2060   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction (one unit above an add) */
2061   COSTS_N_INSNS (1),			/* variable shift costs */
2062   COSTS_N_INSNS (1),			/* constant shift costs */
2063   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2064    COSTS_N_INSNS (3),			/*				 HI */
2065    COSTS_N_INSNS (3),			/*				 SI */
2066    COSTS_N_INSNS (4),			/*				 DI */
2067    COSTS_N_INSNS (2)},			/*			      other */
2068   0,					/* cost of multiply per each bit set */
2069   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2070    COSTS_N_INSNS (26),			/*			    HI */
2071    COSTS_N_INSNS (42),			/*			    SI */
2072    COSTS_N_INSNS (74),			/*			    DI */
2073    COSTS_N_INSNS (74)},			/*			    other */
2074   COSTS_N_INSNS (1),			/* cost of movsx */
2075   COSTS_N_INSNS (1),			/* cost of movzx */
2076   8,					/* "large" insn */
2077   17,					/* MOVE_RATIO */
2078 
2079   /* All move costs are relative to integer->integer move times 2 and thus
2080      they are latency*2. */
2081   6,				     /* cost for loading QImode using movzbl */
2082   {4, 4, 4},				/* cost of loading integer registers
2083 					   in QImode, HImode and SImode.
2084 					   Relative to reg-reg move (2).  */
2085   {6, 6, 6},				/* cost of storing integer registers */
2086   2,					/* cost of reg,reg fld/fst */
2087   {6, 6, 8},				/* cost of loading fp registers
2088 					   in SFmode, DFmode and XFmode */
2089   {6, 6, 10},				/* cost of storing fp registers
2090 					   in SFmode, DFmode and XFmode */
2091   2,					/* cost of moving MMX register */
2092   {6, 6},				/* cost of loading MMX registers
2093 					   in SImode and DImode */
2094   {6, 6},				/* cost of storing MMX registers
2095 					   in SImode and DImode */
2096   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
2097   {6, 6, 6, 6, 6},			/* cost of loading SSE registers
2098 					   in 32,64,128,256 and 512-bit */
2099   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
2100   {6, 6, 6, 6, 6},			/* cost of storing SSE registers
2101 					   in 32,64,128,256 and 512-bit */
2102   {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
2103   4, 4,					/* SSE->integer and integer->SSE moves */
2104   6, 6,					/* Gather load static, per_elt.  */
2105   6, 6,					/* Gather store static, per_elt.  */
2106   32,					/* size of l1 cache.  */
2107   256,					/* size of l2 cache.  */
2108   64,					/* size of prefetch block */
2109   6,					/* number of parallel prefetches */
2110   3,					/* Branch cost */
2111   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2112   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2113   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2114   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2115   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2116   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2117 
2118   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2119   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2120   COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
2121   COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
2122   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2123   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2124   COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
2125   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
2126   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
2127   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
2128   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2129   intel_memcpy,				/* memcpy stringop_algs table defined above */
2130   intel_memset,				/* memset stringop_algs table defined above */
2131   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2132   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2133   "16",					/* Loop alignment.  */
2134   "16:8:8",				/* Jump alignment.  */
2135   "0:0:8",				/* Label alignment.  */
2136   "16",					/* Func alignment.  */
2137 };
2138 
2139 /* Generic should produce code tuned for Core-i7 (and newer chips)
2140    and btver1 (and newer chips).  */
2141 
2142 static stringop_algs generic_memcpy[2] = {	/* memcpy strategy table referenced by generic_cost; same contents as generic_memset */
2143   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
2144              {-1, libcall, false}}},
2145   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},	/* [1]; NOTE(review): presumably the 64-bit variant (uses rep_prefix_8_byte) -- confirm */
2146              {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
2147 static stringop_algs generic_memset[2] = {	/* memset strategy table referenced by generic_cost; same contents as generic_memcpy */
2148   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
2149              {-1, libcall, false}}},
2150   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},	/* [1]; NOTE(review): presumably the 64-bit variant (uses rep_prefix_8_byte) -- confirm */
2151              {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
2152 static const
2153 struct processor_costs generic_cost = {	/* positional initializer: entry order must match struct processor_costs */
2154   COSTS_N_INSNS (1),			/* cost of an add instruction */
2155   /* Setting cost to 2 makes our current implementation of synth_mult result in
2156      use of unnecessary temporary registers causing regression on several
2157      SPECfp benchmarks.  */
2158   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction (one unit above an add) */
2159   COSTS_N_INSNS (1),			/* variable shift costs */
2160   COSTS_N_INSNS (1),			/* constant shift costs */
2161   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2162    COSTS_N_INSNS (4),			/*				 HI */
2163    COSTS_N_INSNS (3),			/*				 SI */
2164    COSTS_N_INSNS (4),			/*				 DI */
2165    COSTS_N_INSNS (4)},			/*			      other */
2166   0,					/* cost of multiply per each bit set */
2167   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
2168    COSTS_N_INSNS (22),			/*			    HI */
2169    COSTS_N_INSNS (30),			/*			    SI */
2170    COSTS_N_INSNS (74),			/*			    DI */
2171    COSTS_N_INSNS (74)},			/*			    other */
2172   COSTS_N_INSNS (1),			/* cost of movsx */
2173   COSTS_N_INSNS (1),			/* cost of movzx */
2174   8,					/* "large" insn */
2175   17,					/* MOVE_RATIO */
2176 
2177   /* All move costs are relative to integer->integer move times 2 and thus
2178      they are latency*2. */
2179   6,				     /* cost for loading QImode using movzbl */
2180   {6, 6, 6},				/* cost of loading integer registers
2181 					   in QImode, HImode and SImode.
2182 					   Relative to reg-reg move (2).  */
2183   {6, 6, 6},				/* cost of storing integer registers */
2184   4,					/* cost of reg,reg fld/fst */
2185   {6, 6, 12},				/* cost of loading fp registers
2186 					   in SFmode, DFmode and XFmode */
2187   {6, 6, 12},				/* cost of storing fp registers
2188 					   in SFmode, DFmode and XFmode */
2189   2,					/* cost of moving MMX register */
2190   {6, 6},				/* cost of loading MMX registers
2191 					   in SImode and DImode */
2192   {6, 6},				/* cost of storing MMX registers
2193 					   in SImode and DImode */
2194   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2195   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
2196 					   in 32,64,128,256 and 512-bit */
2197   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
2198   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
2199 					   in 32,64,128,256 and 512-bit */
2200   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
2201   6, 6,					/* SSE->integer and integer->SSE moves */
2202   18, 6,				/* Gather load static, per_elt.  */
2203   18, 6,				/* Gather store static, per_elt.  */
2204   32,					/* size of l1 cache.  */
2205   512,					/* size of l2 cache.  */
2206   64,					/* size of prefetch block */
2207   6,					/* number of parallel prefetches */
2208   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2209      value is increased to perhaps more appropriate value of 5.  */
2210   3,					/* Branch cost */
2211   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2212   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2213   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
2214   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2215   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2216   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
2217 
2218   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2219   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2220   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2221   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2222   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2223   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2224   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2225   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2226   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2227   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2228   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
2229   generic_memcpy,			/* memcpy stringop_algs table defined above */
2230   generic_memset,			/* memset stringop_algs table defined above */
2231   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
2232   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
2233   "16:11:8",				/* Loop alignment.  */
2234   "16:11:8",				/* Jump alignment.  */
2235   "0:0:8",				/* Label alignment.  */
2236   "16",					/* Func alignment.  */
2237 };
2238 
2239 /* core_cost should produce code tuned for the Core family of CPUs.  */
2240 static stringop_algs core_memcpy[2] = {	/* memcpy strategy table referenced by core_cost */
2241   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
2242   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},	/* [1]; third field is true here, unlike the slm/intel/generic tables; NOTE(review): presumably a no-align flag -- confirm */
2243              {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
2244 static stringop_algs core_memset[2] = {	/* memset strategy table referenced by core_cost */
2245   {libcall, {{6, loop_1_byte, true},	/* [0]; NOTE(review): presumably the 32-bit variant -- confirm */
2246              {24, loop, true},
2247              {8192, rep_prefix_4_byte, true},
2248              {-1, libcall, false}}},
2249   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},	/* [1]; NOTE(review): presumably the 64-bit variant (uses rep_prefix_8_byte) -- confirm */
2250              {-1, libcall, false}}}};	/* NOTE(review): -1 presumably marks the final, unbounded entry -- confirm */
2251 
2252 static const
2253 struct processor_costs core_cost = {	/* positional initializer: entry order must match struct processor_costs */
2254   COSTS_N_INSNS (1),			/* cost of an add instruction */
2255   /* On all chips taken into consideration lea is 2 cycles and more.  With
2256      this cost however our current implementation of synth_mult results in
2257      use of unnecessary temporary registers causing regression on several
2258      SPECfp benchmarks.  */
2259   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction (one unit above an add) */
2260   COSTS_N_INSNS (1),			/* variable shift costs */
2261   COSTS_N_INSNS (1),			/* constant shift costs */
2262   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2263    COSTS_N_INSNS (4),			/*				 HI */
2264    COSTS_N_INSNS (3),			/*				 SI */
2265    /* Here we tune for Sandybridge or newer.  */
2266    COSTS_N_INSNS (3),			/*				 DI */
2267    COSTS_N_INSNS (3)},			/*			      other */
2268   0,					/* cost of multiply per each bit set */
2269   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2270      model is not realistic. We compensate by increasing the latencies a bit.  */
2271   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
2272    COSTS_N_INSNS (11),			/*			    HI */
2273    COSTS_N_INSNS (14),			/*			    SI */
2274    COSTS_N_INSNS (81),			/*			    DI */
2275    COSTS_N_INSNS (81)},			/*			    other */
2276   COSTS_N_INSNS (1),			/* cost of movsx */
2277   COSTS_N_INSNS (1),			/* cost of movzx */
2278   8,					/* "large" insn */
2279   17,					/* MOVE_RATIO */
2280 
2281   /* All move costs are relative to integer->integer move times 2 and thus
2282      they are latency*2. */
2283   6,				     /* cost for loading QImode using movzbl */
2284   {4, 4, 4},				/* cost of loading integer registers
2285 					   in QImode, HImode and SImode.
2286 					   Relative to reg-reg move (2).  */
2287   {6, 6, 6},				/* cost of storing integer registers */
2288   2,					/* cost of reg,reg fld/fst */
2289   {6, 6, 8},				/* cost of loading fp registers
2290 					   in SFmode, DFmode and XFmode */
2291   {6, 6, 10},				/* cost of storing fp registers
2292 					   in SFmode, DFmode and XFmode */
2293   2,					/* cost of moving MMX register */
2294   {6, 6},				/* cost of loading MMX registers
2295 					   in SImode and DImode */
2296   {6, 6},				/* cost of storing MMX registers
2297 					   in SImode and DImode */
2298   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2299   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
2300 					   in 32,64,128,256 and 512-bit */
2301   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
2302   {6, 6, 6, 6, 12},			/* cost of storing SSE registers
2303 					   in 32,64,128,256 and 512-bit */
2304   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
2305   2, 2,					/* SSE->integer and integer->SSE moves */
2306   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2307      rec. throughput 6.
2308      So 5 uops statically and one uop per load.  NOTE(review): VGATHERDPD is named twice above with different figures; one mention is presumably a different gather variant -- confirm.  */
2309   10, 6,				/* Gather load static, per_elt.  */
2310   10, 6,				/* Gather store static, per_elt.  */
2311   64,					/* size of l1 cache.  */
2312   512,					/* size of l2 cache.  */
2313   64,					/* size of prefetch block */
2314   6,					/* number of parallel prefetches */
2315   /* FIXME perhaps more appropriate value is 5.  */
2316   3,					/* Branch cost */
2317   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2318   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2319   /* 10-24 */
2320   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
2321   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2322   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2323   COSTS_N_INSNS (23),			/* cost of FSQRT instruction.  */
2324 
2325   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2326   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2327   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2328   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2329   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2330   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2331   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
2332   COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
2333   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
2334   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
2335   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2336   core_memcpy,				/* memcpy stringop_algs table defined above */
2337   core_memset,				/* memset stringop_algs table defined above */
2338   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2339   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2340   "16:11:8",				/* Loop alignment.  */
2341   "16:11:8",				/* Jump alignment.  */
2342   "0:0:8",				/* Label alignment.  */
2343   "16",					/* Func alignment.  */
2344 };
2345 
2346