/* Subroutines used for code generation on IA-32.
   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
   2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "tree.h"
#include "tm_p.h"
#include "regs.h"
#include "hard-reg-set.h"
#include "real.h"
#include "insn-config.h"
#include "conditions.h"
#include "output.h"
#include "insn-codes.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "function.h"
#include "recog.h"
#include "expr.h"
#include "optabs.h"
#include "toplev.h"
#include "basic-block.h"
#include "ggc.h"
#include "target.h"
#include "target-def.h"
#include "langhooks.h"
#include "reload.h"
#include "cgraph.h"
#include "gimple.h"
#include "dwarf2.h"
#include "df.h"
#include "tm-constrs.h"
#include "params.h"
#include "cselib.h"
#include "debug.h"
#include "dwarf2out.h"

static rtx legitimize_dllimport_symbol (rtx, bool);

#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif

/* Return index of given mode in mult and division cost tables.  */
#define MODE_INDEX(mode)					\
  ((mode) == QImode ? 0						\
   : (mode) == HImode ? 1					\
   : (mode) == SImode ? 2					\
   : (mode) == DImode ? 3					\
   : 4)
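/* Modes other than QImode, HImode, SImode and DImode (e.g. TImode) map to
   index 4, the trailing "other" entry of the five-element multiply and
   divide cost arrays in the processor_costs tables below.  */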

/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
#define COSTS_N_BYTES(N) ((N) * 2)

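/* Each cost table below contains two pairs of stringop_algs initializers,
   one pair for memcpy and one for memset; the first entry of a pair is used
   when generating 32-bit code and the second when generating 64-bit code.
   Within an entry, the leading algorithm is used when the block size is not
   known at compile time, each {max, alg} pair selects ALG for blocks of at
   most MAX bytes, and {-1, alg} covers all larger sizes.
   DUMMY_STRINGOP_ALGS provides a conservative libcall-only strategy for
   slots that are not tuned separately.  */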
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}

const
struct processor_costs ix86_size_cost = {/* costs for tuning for size */
  COSTS_N_BYTES (2),			/* cost of an add instruction */
  COSTS_N_BYTES (3),			/* cost of a lea instruction */
  COSTS_N_BYTES (2),			/* variable shift costs */
  COSTS_N_BYTES (3),			/* constant shift costs */
  {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
   COSTS_N_BYTES (3),			/*                               HI */
   COSTS_N_BYTES (3),			/*                               SI */
   COSTS_N_BYTES (3),			/*                               DI */
   COSTS_N_BYTES (5)},			/*                            other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
   COSTS_N_BYTES (3),			/*                          HI */
   COSTS_N_BYTES (3),			/*                          SI */
   COSTS_N_BYTES (3),			/*                          DI */
   COSTS_N_BYTES (5)},			/*                       other */
  COSTS_N_BYTES (3),			/* cost of movsx */
  COSTS_N_BYTES (3),			/* cost of movzx */
  0,					/* "large" insn */
  2,					/* MOVE_RATIO */
  2,					/* cost for loading QImode using movzbl */
  {2, 2, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 2, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {2, 2, 2},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {2, 2, 2},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  3,					/* cost of moving MMX register */
  {3, 3},				/* cost of loading MMX registers
					   in SImode and DImode */
  {3, 3},				/* cost of storing MMX registers
					   in SImode and DImode */
  3,					/* cost of moving SSE register */
  {3, 3, 3},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {3, 3, 3},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  0,					/* size of l1 cache  */
  0,					/* size of l2 cache  */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  2,					/* Branch cost */
  COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
  COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
  COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
  COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
  COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
  COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  1,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  1,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

/* Processor costs (relative to an add) */
static const
struct processor_costs i386_cost = {	/* 386 specific costs */
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (3),			/* variable shift costs */
  COSTS_N_INSNS (2),			/* constant shift costs */
  {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (6),			/*                               HI */
   COSTS_N_INSNS (6),			/*                               SI */
   COSTS_N_INSNS (6),			/*                               DI */
   COSTS_N_INSNS (6)},			/*                               other */
  COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
  {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),			/*                          HI */
   COSTS_N_INSNS (23),			/*                          SI */
   COSTS_N_INSNS (23),			/*                          DI */
   COSTS_N_INSNS (23)},			/*                          other */
  COSTS_N_INSNS (3),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  15,					/* "large" insn */
  3,					/* MOVE_RATIO */
  4,					/* cost for loading QImode using movzbl */
  {2, 4, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 4, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {8, 8, 8},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {8, 8, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {4, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 8, 16},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 8, 16},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  0,					/* size of l1 cache  */
  0,					/* size of l2 cache  */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  1,					/* Branch cost */
  COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs i486_cost = {	/* 486 specific costs */
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (3),			/* variable shift costs */
  COSTS_N_INSNS (2),			/* constant shift costs */
  {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (12),			/*                               HI */
   COSTS_N_INSNS (12),			/*                               SI */
   COSTS_N_INSNS (12),			/*                               DI */
   COSTS_N_INSNS (12)},			/*                               other */
  1,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (40),			/*                          HI */
   COSTS_N_INSNS (40),			/*                          SI */
   COSTS_N_INSNS (40),			/*                          DI */
   COSTS_N_INSNS (40)},			/*                          other */
  COSTS_N_INSNS (3),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  15,					/* "large" insn */
  3,					/* MOVE_RATIO */
  4,					/* cost for loading QImode using movzbl */
  {2, 4, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 4, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {8, 8, 8},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {8, 8, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {4, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 8, 16},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 8, 16},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  4,					/* size of l1 cache.  486 has 8kB cache
					   shared for code and data, so 4kB is
					   not really precise.  */
  4,					/* size of l2 cache  */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  1,					/* Branch cost */
  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentium_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (4),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (11),			/*                               HI */
   COSTS_N_INSNS (11),			/*                               SI */
   COSTS_N_INSNS (11),			/*                               DI */
   COSTS_N_INSNS (11)},			/*                               other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (25),			/*                          HI */
   COSTS_N_INSNS (25),			/*                          SI */
   COSTS_N_INSNS (25),			/*                          DI */
   COSTS_N_INSNS (25)},			/*                          other */
  COSTS_N_INSNS (3),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  8,					/* "large" insn */
  6,					/* MOVE_RATIO */
  6,					/* cost for loading QImode using movzbl */
  {2, 4, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 4, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {2, 2, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 6},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  8,					/* cost of moving MMX register */
  {8, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {8, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 8, 16},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 8, 16},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  8,					/* size of l1 cache.  */
  8,					/* size of l2 cache  */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  2,					/* Branch cost */
  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentiumpro_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/*                               HI */
   COSTS_N_INSNS (4),			/*                               SI */
   COSTS_N_INSNS (4),			/*                               DI */
   COSTS_N_INSNS (4)},			/*                               other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (17),			/*                          HI */
   COSTS_N_INSNS (17),			/*                          SI */
   COSTS_N_INSNS (17),			/*                          DI */
   COSTS_N_INSNS (17)},			/*                          other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  6,					/* MOVE_RATIO */
  2,					/* cost for loading QImode using movzbl */
  {4, 4, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 2, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {2, 2, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 6},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {2, 2},				/* cost of loading MMX registers
					   in SImode and DImode */
  {2, 2},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {2, 2, 8},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {2, 2, 8},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  8,					/* size of l1 cache.  */
  256,					/* size of l2 cache  */
  32,					/* size of prefetch block */
  6,					/* number of parallel prefetches */
  2,					/* Branch cost */
  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
     (we ensure the alignment).  For small blocks an inline loop is still a
     noticeable win; for bigger blocks either rep movsl or rep movsb is the
     way to go.  Rep movsb apparently has a more expensive startup time in
     the CPU, but after 4K the difference is down in the noise.  */
  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{1024, unrolled_loop},
			{8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs geode_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (2),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/*                               HI */
   COSTS_N_INSNS (7),			/*                               SI */
   COSTS_N_INSNS (7),			/*                               DI */
   COSTS_N_INSNS (7)},			/*                               other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),			/*                          HI */
   COSTS_N_INSNS (39),			/*                          SI */
   COSTS_N_INSNS (39),			/*                          DI */
   COSTS_N_INSNS (39)},			/*                          other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  4,					/* MOVE_RATIO */
  1,					/* cost for loading QImode using movzbl */
  {1, 1, 1},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {1, 1, 1},				/* cost of storing integer registers */
  1,					/* cost of reg,reg fld/fst */
  {1, 1, 1},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 6, 6},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */

  1,					/* cost of moving MMX register */
  {1, 1},				/* cost of loading MMX registers
					   in SImode and DImode */
  {1, 1},				/* cost of storing MMX registers
					   in SImode and DImode */
  1,					/* cost of moving SSE register */
  {1, 1, 1},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {1, 1, 1},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  1,					/* MMX or SSE register to integer */
  64,					/* size of l1 cache.  */
  128,					/* size of l2 cache.  */
  32,					/* size of prefetch block */
  1,					/* number of parallel prefetches */
  1,					/* Branch cost */
  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k6_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (2),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (3),			/*                               HI */
   COSTS_N_INSNS (3),			/*                               SI */
   COSTS_N_INSNS (3),			/*                               DI */
   COSTS_N_INSNS (3)},			/*                               other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (18),			/*                          HI */
   COSTS_N_INSNS (18),			/*                          SI */
   COSTS_N_INSNS (18),			/*                          DI */
   COSTS_N_INSNS (18)},			/*                          other */
  COSTS_N_INSNS (2),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  8,					/* "large" insn */
  4,					/* MOVE_RATIO */
  3,					/* cost for loading QImode using movzbl */
  {4, 5, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 3, 2},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {6, 6, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 4},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {2, 2},				/* cost of loading MMX registers
					   in SImode and DImode */
  {2, 2},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {2, 2, 8},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {2, 2, 8},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  6,					/* MMX or SSE register to integer */
  32,					/* size of l1 cache.  */
  32,					/* size of l2 cache.  Some models
					   have integrated l2 cache, but
					   optimizing for k6 is not important
					   enough to worry about that.  */
  32,					/* size of prefetch block */
  1,					/* number of parallel prefetches */
  1,					/* Branch cost */
  COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs athlon_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (2),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (5),			/*                               HI */
   COSTS_N_INSNS (5),			/*                               SI */
   COSTS_N_INSNS (5),			/*                               DI */
   COSTS_N_INSNS (5)},			/*                               other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),			/*                          HI */
   COSTS_N_INSNS (42),			/*                          SI */
   COSTS_N_INSNS (74),			/*                          DI */
   COSTS_N_INSNS (74)},			/*                          other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  9,					/* MOVE_RATIO */
  4,					/* cost for loading QImode using movzbl */
  {3, 4, 3},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {3, 4, 3},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {4, 4, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {4, 4},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 4},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 4, 6},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 4, 5},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  5,					/* MMX or SSE register to integer */
  64,					/* size of l1 cache.  */
  256,					/* size of l2 cache.  */
  64,					/* size of prefetch block */
  6,					/* number of parallel prefetches */
  5,					/* Branch cost */
  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
  /* For some reason, Athlon deals better with REP prefix (relative to loops)
     compared to K8. Alignment becomes important after 8 bytes for memcpy and
     128 bytes for memset.  */
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,                                    /* scalar_stmt_cost.  */
  1,                                    /* scalar load_cost.  */
  1,                                    /* scalar_store_cost.  */
  1,                                    /* vec_stmt_cost.  */
  1,                                    /* vec_to_scalar_cost.  */
  1,                                    /* scalar_to_vec_cost.  */
  1,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  1,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k8_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (2),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/*                               HI */
   COSTS_N_INSNS (3),			/*                               SI */
   COSTS_N_INSNS (4),			/*                               DI */
   COSTS_N_INSNS (5)},			/*                               other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),			/*                          HI */
   COSTS_N_INSNS (42),			/*                          SI */
   COSTS_N_INSNS (74),			/*                          DI */
   COSTS_N_INSNS (74)},			/*                          other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  9,					/* MOVE_RATIO */
  4,					/* cost for loading QImode using movzbl */
  {3, 4, 3},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {3, 4, 3},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {4, 4, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {3, 3},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 4},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 3, 6},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 4, 5},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  5,					/* MMX or SSE register to integer */
  64,					/* size of l1 cache.  */
  512,					/* size of l2 cache.  */
  64,					/* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (leaving the number of
     prefetches entirely unlimited is probably not a good idea either, as
     their execution also takes some time).  */
  100,					/* number of parallel prefetches */
  3,					/* Branch cost */
  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
  /* K8 has an optimized REP instruction for medium-sized blocks, but for very
     small blocks it is better to use a loop.  For large blocks, a libcall can
     do nontemporal accesses and beat inline code considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,                                    /* scalar_stmt_cost.  */
  2,                                    /* scalar load_cost.  */
  2,                                    /* scalar_store_cost.  */
  5,                                    /* vec_stmt_cost.  */
  0,                                    /* vec_to_scalar_cost.  */
  2,                                    /* scalar_to_vec_cost.  */
  2,                                    /* vec_align_load_cost.  */
  3,                                    /* vec_unalign_load_cost.  */
  3,                                    /* vec_store_cost.  */
  3,                                    /* cond_taken_branch_cost.  */
  2,                                    /* cond_not_taken_branch_cost.  */
};

struct processor_costs amdfam10_cost = {
  COSTS_N_INSNS (1),                    /* cost of an add instruction */
  COSTS_N_INSNS (2),                    /* cost of a lea instruction */
  COSTS_N_INSNS (1),                    /* variable shift costs */
  COSTS_N_INSNS (1),                    /* constant shift costs */
  {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
   COSTS_N_INSNS (4),                   /*                               HI */
   COSTS_N_INSNS (3),                   /*                               SI */
   COSTS_N_INSNS (4),                   /*                               DI */
   COSTS_N_INSNS (5)},                  /*                               other */
  0,                                    /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35),                  /*                          HI */
   COSTS_N_INSNS (51),                  /*                          SI */
   COSTS_N_INSNS (83),                  /*                          DI */
   COSTS_N_INSNS (83)},                 /*                          other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  9,					/* MOVE_RATIO */
  4,					/* cost for loading QImode using movzbl */
  {3, 4, 3},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {3, 4, 3},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {4, 4, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {3, 3},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 4},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 4, 3},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 4, 5},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
					/* On K8
					    MOVD reg64, xmmreg	Double	FSTORE 4
					    MOVD reg32, xmmreg	Double	FSTORE 4
					   On AMDFAM10
					    MOVD reg64, xmmreg	Double	FADD 3
								1/1  1/1
					    MOVD reg32, xmmreg	Double	FADD 3
								1/1  1/1 */
  64,					/* size of l1 cache.  */
  512,					/* size of l2 cache.  */
  64,					/* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (leaving the number of
     prefetches entirely unlimited is probably not a good idea either, as
     their execution also takes some time).  */
  100,					/* number of parallel prefetches */
  2,					/* Branch cost */
  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */

  /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
     for very small blocks it is better to use a loop.  For large blocks, a
     libcall can do nontemporal accesses and beat inline code considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,                                    /* scalar_stmt_cost.  */
  2,                                    /* scalar load_cost.  */
  2,                                    /* scalar_store_cost.  */
  6,                                    /* vec_stmt_cost.  */
  0,                                    /* vec_to_scalar_cost.  */
  2,                                    /* scalar_to_vec_cost.  */
  2,                                    /* vec_align_load_cost.  */
  2,                                    /* vec_unalign_load_cost.  */
  2,                                    /* vec_store_cost.  */
  2,                                    /* cond_taken_branch_cost.  */
  1,                                    /* cond_not_taken_branch_cost.  */
};

823 static const
824 struct processor_costs pentium4_cost = {
825   COSTS_N_INSNS (1),			/* cost of an add instruction */
826   COSTS_N_INSNS (3),			/* cost of a lea instruction */
827   COSTS_N_INSNS (4),			/* variable shift costs */
828   COSTS_N_INSNS (4),			/* constant shift costs */
829   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
830    COSTS_N_INSNS (15),			/*                               HI */
831    COSTS_N_INSNS (15),			/*                               SI */
832    COSTS_N_INSNS (15),			/*                               DI */
833    COSTS_N_INSNS (15)},			/*                               other */
834   0,					/* cost of multiply per each bit set */
835   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
836    COSTS_N_INSNS (56),			/*                          HI */
837    COSTS_N_INSNS (56),			/*                          SI */
838    COSTS_N_INSNS (56),			/*                          DI */
839    COSTS_N_INSNS (56)},			/*                          other */
840   COSTS_N_INSNS (1),			/* cost of movsx */
841   COSTS_N_INSNS (1),			/* cost of movzx */
842   16,					/* "large" insn */
843   6,					/* MOVE_RATIO */
844   2,					/* cost for loading QImode using movzbl */
845   {4, 5, 4},				/* cost of loading integer registers
846 					   in QImode, HImode and SImode.
847 					   Relative to reg-reg move (2).  */
848   {2, 3, 2},				/* cost of storing integer registers */
849   2,					/* cost of reg,reg fld/fst */
850   {2, 2, 6},				/* cost of loading fp registers
851 					   in SFmode, DFmode and XFmode */
852   {4, 4, 6},				/* cost of storing fp registers
853 					   in SFmode, DFmode and XFmode */
854   2,					/* cost of moving MMX register */
855   {2, 2},				/* cost of loading MMX registers
856 					   in SImode and DImode */
857   {2, 2},				/* cost of storing MMX registers
858 					   in SImode and DImode */
859   12,					/* cost of moving SSE register */
860   {12, 12, 12},				/* cost of loading SSE registers
861 					   in SImode, DImode and TImode */
862   {2, 2, 8},				/* cost of storing SSE registers
863 					   in SImode, DImode and TImode */
864   10,					/* MMX or SSE register to integer */
865   8,					/* size of l1 cache.  */
866   256,					/* size of l2 cache.  */
867   64,					/* size of prefetch block */
868   6,					/* number of parallel prefetches */
869   2,					/* Branch cost */
870   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
871   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
872   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
873   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
874   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
875   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
876   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
877    DUMMY_STRINGOP_ALGS},
878   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
879    {-1, libcall}}},
880    DUMMY_STRINGOP_ALGS},
881   1,                                    /* scalar_stmt_cost.  */
882   1,                                    /* scalar load_cost.  */
883   1,                                    /* scalar_store_cost.  */
884   1,                                    /* vec_stmt_cost.  */
885   1,                                    /* vec_to_scalar_cost.  */
886   1,                                    /* scalar_to_vec_cost.  */
887   1,                                    /* vec_align_load_cost.  */
888   2,                                    /* vec_unalign_load_cost.  */
889   1,                                    /* vec_store_cost.  */
890   3,                                    /* cond_taken_branch_cost.  */
891   1,                                    /* cond_not_taken_branch_cost.  */
892 };
893 
894 static const
895 struct processor_costs nocona_cost = {
896   COSTS_N_INSNS (1),			/* cost of an add instruction */
897   COSTS_N_INSNS (1),			/* cost of a lea instruction */
898   COSTS_N_INSNS (1),			/* variable shift costs */
899   COSTS_N_INSNS (1),			/* constant shift costs */
900   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
901    COSTS_N_INSNS (10),			/*                               HI */
902    COSTS_N_INSNS (10),			/*                               SI */
903    COSTS_N_INSNS (10),			/*                               DI */
904    COSTS_N_INSNS (10)},			/*                               other */
905   0,					/* cost of multiply per each bit set */
906   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
907    COSTS_N_INSNS (66),			/*                          HI */
908    COSTS_N_INSNS (66),			/*                          SI */
909    COSTS_N_INSNS (66),			/*                          DI */
910    COSTS_N_INSNS (66)},			/*                          other */
911   COSTS_N_INSNS (1),			/* cost of movsx */
912   COSTS_N_INSNS (1),			/* cost of movzx */
913   16,					/* "large" insn */
914   17,					/* MOVE_RATIO */
915   4,					/* cost for loading QImode using movzbl */
916   {4, 4, 4},				/* cost of loading integer registers
917 					   in QImode, HImode and SImode.
918 					   Relative to reg-reg move (2).  */
919   {4, 4, 4},				/* cost of storing integer registers */
920   3,					/* cost of reg,reg fld/fst */
921   {12, 12, 12},				/* cost of loading fp registers
922 					   in SFmode, DFmode and XFmode */
923   {4, 4, 4},				/* cost of storing fp registers
924 					   in SFmode, DFmode and XFmode */
925   6,					/* cost of moving MMX register */
926   {12, 12},				/* cost of loading MMX registers
927 					   in SImode and DImode */
928   {12, 12},				/* cost of storing MMX registers
929 					   in SImode and DImode */
930   6,					/* cost of moving SSE register */
931   {12, 12, 12},				/* cost of loading SSE registers
932 					   in SImode, DImode and TImode */
933   {12, 12, 12},				/* cost of storing SSE registers
934 					   in SImode, DImode and TImode */
935   8,					/* MMX or SSE register to integer */
936   8,					/* size of l1 cache.  */
937   1024,					/* size of l2 cache.  */
938   128,					/* size of prefetch block */
939   8,					/* number of parallel prefetches */
940   1,					/* Branch cost */
941   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
942   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
943   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
944   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
945   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
946   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
947   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
948    {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
949 	      {100000, unrolled_loop}, {-1, libcall}}}},
950   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
951    {-1, libcall}}},
952    {libcall, {{24, loop}, {64, unrolled_loop},
953 	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
954   1,                                    /* scalar_stmt_cost.  */
955   1,                                    /* scalar load_cost.  */
956   1,                                    /* scalar_store_cost.  */
957   1,                                    /* vec_stmt_cost.  */
958   1,                                    /* vec_to_scalar_cost.  */
959   1,                                    /* scalar_to_vec_cost.  */
960   1,                                    /* vec_align_load_cost.  */
961   2,                                    /* vec_unalign_load_cost.  */
962   1,                                    /* vec_store_cost.  */
963   3,                                    /* cond_taken_branch_cost.  */
964   1,                                    /* cond_not_taken_branch_cost.  */
965 };
966 
967 static const
968 struct processor_costs core2_cost = {
969   COSTS_N_INSNS (1),			/* cost of an add instruction */
970   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
971   COSTS_N_INSNS (1),			/* variable shift costs */
972   COSTS_N_INSNS (1),			/* constant shift costs */
973   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
974    COSTS_N_INSNS (3),			/*                               HI */
975    COSTS_N_INSNS (3),			/*                               SI */
976    COSTS_N_INSNS (3),			/*                               DI */
977    COSTS_N_INSNS (3)},			/*                               other */
978   0,					/* cost of multiply per each bit set */
979   {COSTS_N_INSNS (22),			/* cost of a divide/mod for QI */
980    COSTS_N_INSNS (22),			/*                          HI */
981    COSTS_N_INSNS (22),			/*                          SI */
982    COSTS_N_INSNS (22),			/*                          DI */
983    COSTS_N_INSNS (22)},			/*                          other */
984   COSTS_N_INSNS (1),			/* cost of movsx */
985   COSTS_N_INSNS (1),			/* cost of movzx */
986   8,					/* "large" insn */
987   16,					/* MOVE_RATIO */
988   2,					/* cost for loading QImode using movzbl */
989   {6, 6, 6},				/* cost of loading integer registers
990 					   in QImode, HImode and SImode.
991 					   Relative to reg-reg move (2).  */
992   {4, 4, 4},				/* cost of storing integer registers */
993   2,					/* cost of reg,reg fld/fst */
994   {6, 6, 6},				/* cost of loading fp registers
995 					   in SFmode, DFmode and XFmode */
996   {4, 4, 4},				/* cost of storing fp registers
997 					   in SFmode, DFmode and XFmode */
998   2,					/* cost of moving MMX register */
999   {6, 6},				/* cost of loading MMX registers
1000 					   in SImode and DImode */
1001   {4, 4},				/* cost of storing MMX registers
1002 					   in SImode and DImode */
1003   2,					/* cost of moving SSE register */
1004   {6, 6, 6},				/* cost of loading SSE registers
1005 					   in SImode, DImode and TImode */
1006   {4, 4, 4},				/* cost of storing SSE registers
1007 					   in SImode, DImode and TImode */
1008   2,					/* MMX or SSE register to integer */
1009   32,					/* size of l1 cache.  */
1010   2048,					/* size of l2 cache.  */
1011   128,					/* size of prefetch block */
1012   8,					/* number of parallel prefetches */
1013   3,					/* Branch cost */
1014   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
1015   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1016   COSTS_N_INSNS (32),			/* cost of FDIV instruction.  */
1017   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1018   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1019   COSTS_N_INSNS (58),			/* cost of FSQRT instruction.  */
1020   {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1021    {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1022 	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1023   {{libcall, {{8, loop}, {15, unrolled_loop},
1024 	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
1025    {libcall, {{24, loop}, {32, unrolled_loop},
1026 	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1027   1,                                    /* scalar_stmt_cost.  */
1028   1,                                    /* scalar load_cost.  */
1029   1,                                    /* scalar_store_cost.  */
1030   1,                                    /* vec_stmt_cost.  */
1031   1,                                    /* vec_to_scalar_cost.  */
1032   1,                                    /* scalar_to_vec_cost.  */
1033   1,                                    /* vec_align_load_cost.  */
1034   2,                                    /* vec_unalign_load_cost.  */
1035   1,                                    /* vec_store_cost.  */
1036   3,                                    /* cond_taken_branch_cost.  */
1037   1,                                    /* cond_not_taken_branch_cost.  */
1038 };
1039 
1040 static const
1041 struct processor_costs atom_cost = {
1042   COSTS_N_INSNS (1),			/* cost of an add instruction */
1043   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1044   COSTS_N_INSNS (1),			/* variable shift costs */
1045   COSTS_N_INSNS (1),			/* constant shift costs */
1046   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1047    COSTS_N_INSNS (4),			/*                               HI */
1048    COSTS_N_INSNS (3),			/*                               SI */
1049    COSTS_N_INSNS (4),			/*                               DI */
1050    COSTS_N_INSNS (2)},			/*                               other */
1051   0,					/* cost of multiply per each bit set */
1052   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1053    COSTS_N_INSNS (26),			/*                          HI */
1054    COSTS_N_INSNS (42),			/*                          SI */
1055    COSTS_N_INSNS (74),			/*                          DI */
1056    COSTS_N_INSNS (74)},			/*                          other */
1057   COSTS_N_INSNS (1),			/* cost of movsx */
1058   COSTS_N_INSNS (1),			/* cost of movzx */
1059   8,					/* "large" insn */
1060   17,					/* MOVE_RATIO */
1061   2,					/* cost for loading QImode using movzbl */
1062   {4, 4, 4},				/* cost of loading integer registers
1063 					   in QImode, HImode and SImode.
1064 					   Relative to reg-reg move (2).  */
1065   {4, 4, 4},				/* cost of storing integer registers */
1066   4,					/* cost of reg,reg fld/fst */
1067   {12, 12, 12},				/* cost of loading fp registers
1068 					   in SFmode, DFmode and XFmode */
1069   {6, 6, 8},				/* cost of storing fp registers
1070 					   in SFmode, DFmode and XFmode */
1071   2,					/* cost of moving MMX register */
1072   {8, 8},				/* cost of loading MMX registers
1073 					   in SImode and DImode */
1074   {8, 8},				/* cost of storing MMX registers
1075 					   in SImode and DImode */
1076   2,					/* cost of moving SSE register */
1077   {8, 8, 8},				/* cost of loading SSE registers
1078 					   in SImode, DImode and TImode */
1079   {8, 8, 8},				/* cost of storing SSE registers
1080 					   in SImode, DImode and TImode */
1081   5,					/* MMX or SSE register to integer */
1082   32,					/* size of l1 cache.  */
1083   256,					/* size of l2 cache.  */
1084   64,					/* size of prefetch block */
1085   6,					/* number of parallel prefetches */
1086   3,					/* Branch cost */
1087   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1088   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1089   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1090   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1091   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1092   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1093   {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1094    {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1095           {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1096   {{libcall, {{8, loop}, {15, unrolled_loop},
1097           {2048, rep_prefix_4_byte}, {-1, libcall}}},
1098    {libcall, {{24, loop}, {32, unrolled_loop},
1099           {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1100   1,                                    /* scalar_stmt_cost.  */
1101   1,                                    /* scalar_load_cost.  */
1102   1,                                    /* scalar_store_cost.  */
1103   1,                                    /* vec_stmt_cost.  */
1104   1,                                    /* vec_to_scalar_cost.  */
1105   1,                                    /* scalar_to_vec_cost.  */
1106   1,                                    /* vec_align_load_cost.  */
1107   2,                                    /* vec_unalign_load_cost.  */
1108   1,                                    /* vec_store_cost.  */
1109   3,                                    /* cond_taken_branch_cost.  */
1110   1,                                    /* cond_not_taken_branch_cost.  */
1111 };
1112 
1113 /* Generic64 should produce code tuned for Nocona and K8.  */
1114 static const
1115 struct processor_costs generic64_cost = {
1116   COSTS_N_INSNS (1),			/* cost of an add instruction */
1117   /* On all chips taken into consideration, lea is 2 cycles or more.  With
1118      this cost, however, our current implementation of synth_mult results in
1119      the use of unnecessary temporary registers, causing regressions on
1120      several SPECfp benchmarks.  */
1121   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1122   COSTS_N_INSNS (1),			/* variable shift costs */
1123   COSTS_N_INSNS (1),			/* constant shift costs */
1124   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1125    COSTS_N_INSNS (4),			/*                               HI */
1126    COSTS_N_INSNS (3),			/*                               SI */
1127    COSTS_N_INSNS (4),			/*                               DI */
1128    COSTS_N_INSNS (2)},			/*                               other */
1129   0,					/* cost of multiply per each bit set */
1130   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1131    COSTS_N_INSNS (26),			/*                          HI */
1132    COSTS_N_INSNS (42),			/*                          SI */
1133    COSTS_N_INSNS (74),			/*                          DI */
1134    COSTS_N_INSNS (74)},			/*                          other */
1135   COSTS_N_INSNS (1),			/* cost of movsx */
1136   COSTS_N_INSNS (1),			/* cost of movzx */
1137   8,					/* "large" insn */
1138   17,					/* MOVE_RATIO */
1139   4,					/* cost for loading QImode using movzbl */
1140   {4, 4, 4},				/* cost of loading integer registers
1141 					   in QImode, HImode and SImode.
1142 					   Relative to reg-reg move (2).  */
1143   {4, 4, 4},				/* cost of storing integer registers */
1144   4,					/* cost of reg,reg fld/fst */
1145   {12, 12, 12},				/* cost of loading fp registers
1146 					   in SFmode, DFmode and XFmode */
1147   {6, 6, 8},				/* cost of storing fp registers
1148 					   in SFmode, DFmode and XFmode */
1149   2,					/* cost of moving MMX register */
1150   {8, 8},				/* cost of loading MMX registers
1151 					   in SImode and DImode */
1152   {8, 8},				/* cost of storing MMX registers
1153 					   in SImode and DImode */
1154   2,					/* cost of moving SSE register */
1155   {8, 8, 8},				/* cost of loading SSE registers
1156 					   in SImode, DImode and TImode */
1157   {8, 8, 8},				/* cost of storing SSE registers
1158 					   in SImode, DImode and TImode */
1159   5,					/* MMX or SSE register to integer */
1160   32,					/* size of l1 cache.  */
1161   512,					/* size of l2 cache.  */
1162   64,					/* size of prefetch block */
1163   6,					/* number of parallel prefetches */
1164   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1165      value is increased to the perhaps more appropriate value of 5.  */
1166   3,					/* Branch cost */
1167   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1168   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1169   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1170   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1171   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1172   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1173   {DUMMY_STRINGOP_ALGS,
1174    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1175   {DUMMY_STRINGOP_ALGS,
1176    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1177   1,                                    /* scalar_stmt_cost.  */
1178   1,                                    /* scalar_load_cost.  */
1179   1,                                    /* scalar_store_cost.  */
1180   1,                                    /* vec_stmt_cost.  */
1181   1,                                    /* vec_to_scalar_cost.  */
1182   1,                                    /* scalar_to_vec_cost.  */
1183   1,                                    /* vec_align_load_cost.  */
1184   2,                                    /* vec_unalign_load_cost.  */
1185   1,                                    /* vec_store_cost.  */
1186   3,                                    /* cond_taken_branch_cost.  */
1187   1,                                    /* cond_not_taken_branch_cost.  */
1188 };
1189 
1190 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8.  */
1191 static const
1192 struct processor_costs generic32_cost = {
1193   COSTS_N_INSNS (1),			/* cost of an add instruction */
1194   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1195   COSTS_N_INSNS (1),			/* variable shift costs */
1196   COSTS_N_INSNS (1),			/* constant shift costs */
1197   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1198    COSTS_N_INSNS (4),			/*                               HI */
1199    COSTS_N_INSNS (3),			/*                               SI */
1200    COSTS_N_INSNS (4),			/*                               DI */
1201    COSTS_N_INSNS (2)},			/*                               other */
1202   0,					/* cost of multiply per each bit set */
1203   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1204    COSTS_N_INSNS (26),			/*                          HI */
1205    COSTS_N_INSNS (42),			/*                          SI */
1206    COSTS_N_INSNS (74),			/*                          DI */
1207    COSTS_N_INSNS (74)},			/*                          other */
1208   COSTS_N_INSNS (1),			/* cost of movsx */
1209   COSTS_N_INSNS (1),			/* cost of movzx */
1210   8,					/* "large" insn */
1211   17,					/* MOVE_RATIO */
1212   4,					/* cost for loading QImode using movzbl */
1213   {4, 4, 4},				/* cost of loading integer registers
1214 					   in QImode, HImode and SImode.
1215 					   Relative to reg-reg move (2).  */
1216   {4, 4, 4},				/* cost of storing integer registers */
1217   4,					/* cost of reg,reg fld/fst */
1218   {12, 12, 12},				/* cost of loading fp registers
1219 					   in SFmode, DFmode and XFmode */
1220   {6, 6, 8},				/* cost of storing fp registers
1221 					   in SFmode, DFmode and XFmode */
1222   2,					/* cost of moving MMX register */
1223   {8, 8},				/* cost of loading MMX registers
1224 					   in SImode and DImode */
1225   {8, 8},				/* cost of storing MMX registers
1226 					   in SImode and DImode */
1227   2,					/* cost of moving SSE register */
1228   {8, 8, 8},				/* cost of loading SSE registers
1229 					   in SImode, DImode and TImode */
1230   {8, 8, 8},				/* cost of storing SSE registers
1231 					   in SImode, DImode and TImode */
1232   5,					/* MMX or SSE register to integer */
1233   32,					/* size of l1 cache.  */
1234   256,					/* size of l2 cache.  */
1235   64,					/* size of prefetch block */
1236   6,					/* number of parallel prefetches */
1237   3,					/* Branch cost */
1238   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
1239   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1240   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1241   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
1242   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
1243   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
1244   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1245    DUMMY_STRINGOP_ALGS},
1246   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1247    DUMMY_STRINGOP_ALGS},
1248   1,                                    /* scalar_stmt_cost.  */
1249   1,                                    /* scalar_load_cost.  */
1250   1,                                    /* scalar_store_cost.  */
1251   1,                                    /* vec_stmt_cost.  */
1252   1,                                    /* vec_to_scalar_cost.  */
1253   1,                                    /* scalar_to_vec_cost.  */
1254   1,                                    /* vec_align_load_cost.  */
1255   2,                                    /* vec_unalign_load_cost.  */
1256   1,                                    /* vec_store_cost.  */
1257   3,                                    /* cond_taken_branch_cost.  */
1258   1,                                    /* cond_not_taken_branch_cost.  */
1259 };
1260 
1261 const struct processor_costs *ix86_cost = &pentium_cost;
1262 
1263 /* Processor feature/optimization bitmasks.  */
1264 #define m_386 (1<<PROCESSOR_I386)
1265 #define m_486 (1<<PROCESSOR_I486)
1266 #define m_PENT (1<<PROCESSOR_PENTIUM)
1267 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1268 #define m_PENT4  (1<<PROCESSOR_PENTIUM4)
1269 #define m_NOCONA  (1<<PROCESSOR_NOCONA)
1270 #define m_CORE2  (1<<PROCESSOR_CORE2)
1271 #define m_ATOM  (1<<PROCESSOR_ATOM)
1272 
1273 #define m_GEODE  (1<<PROCESSOR_GEODE)
1274 #define m_K6  (1<<PROCESSOR_K6)
1275 #define m_K6_GEODE  (m_K6 | m_GEODE)
1276 #define m_K8  (1<<PROCESSOR_K8)
1277 #define m_ATHLON  (1<<PROCESSOR_ATHLON)
1278 #define m_ATHLON_K8  (m_K8 | m_ATHLON)
1279 #define m_AMDFAM10  (1<<PROCESSOR_AMDFAM10)
1280 #define m_AMD_MULTIPLE  (m_K8 | m_ATHLON | m_AMDFAM10)
1281 
1282 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1283 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1284 
1285 /* Generic instruction choice should be a common subset of the supported CPUs
1286    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
1287 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1288 
1289 /* Feature tests against the various tunings.  */
1290 unsigned char ix86_tune_features[X86_TUNE_LAST];
1291 
1292 /* Feature tests against the various tunings used to create ix86_tune_features
1293    based on the processor mask.  */
1294 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1295   /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1296      negatively, so enabling it for Generic64 seems like a good code size
1297      tradeoff.  We can't enable it for 32bit generic because it does not
1298      work well with PPro based chips.  */
1299   m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_CORE2 | m_GENERIC64,
1300 
1301   /* X86_TUNE_PUSH_MEMORY */
1302   m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
1303   | m_NOCONA | m_CORE2 | m_GENERIC,
1304 
1305   /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1306   m_486 | m_PENT,
1307 
1308   /* X86_TUNE_UNROLL_STRLEN */
1309   m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
1310   | m_CORE2 | m_GENERIC,
1311 
1312   /* X86_TUNE_DEEP_BRANCH_PREDICTION */
1313   m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4 | m_GENERIC,
1314 
1315   /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in the P4 based
1316      on simulation results, but after the P4 was made, no performance benefit
1317      was observed from branch hints.  They also increase the code size.
1318      As a result, icc never generates branch hints.  */
1319   0,
1320 
1321   /* X86_TUNE_DOUBLE_WITH_ADD */
1322   ~m_386,
1323 
1324   /* X86_TUNE_USE_SAHF */
1325   m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
1326   | m_NOCONA | m_CORE2 | m_GENERIC,
1327 
1328   /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1329      partial dependencies.  */
1330   m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA
1331   | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
1332 
1333   /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1334      register stalls on the Generic32 compilation setting as well.  However,
1335      in the current implementation partial register stalls are not eliminated
1336      very well - they can be introduced via subregs synthesized by combine
1337      and can happen in caller/callee saving sequences.  Because this option
1338      pays back little on PPro based chips and conflicts with the partial reg
1339      dependencies used by Athlon/P4 based chips, it is better to leave it off
1340      for generic32 for now.  */
1341   m_PPRO,
1342 
1343   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1344   m_CORE2 | m_GENERIC,
1345 
1346   /* X86_TUNE_USE_HIMODE_FIOP */
1347   m_386 | m_486 | m_K6_GEODE,
1348 
1349   /* X86_TUNE_USE_SIMODE_FIOP */
1350   ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_ATOM | m_CORE2 | m_GENERIC),
1351 
1352   /* X86_TUNE_USE_MOV0 */
1353   m_K6,
1354 
1355   /* X86_TUNE_USE_CLTD */
1356   ~(m_PENT | m_ATOM | m_K6 | m_CORE2 | m_GENERIC),
1357 
1358   /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
1359   m_PENT4,
1360 
1361   /* X86_TUNE_SPLIT_LONG_MOVES */
1362   m_PPRO,
1363 
1364   /* X86_TUNE_READ_MODIFY_WRITE */
1365   ~m_PENT,
1366 
1367   /* X86_TUNE_READ_MODIFY */
1368   ~(m_PENT | m_PPRO),
1369 
1370   /* X86_TUNE_PROMOTE_QIMODE */
1371   m_K6_GEODE | m_PENT | m_ATOM | m_386 | m_486 | m_AMD_MULTIPLE
1372   | m_CORE2 | m_GENERIC /* | m_PENT4 ? */,
1373 
1374   /* X86_TUNE_FAST_PREFIX */
1375   ~(m_PENT | m_486 | m_386),
1376 
1377   /* X86_TUNE_SINGLE_STRINGOP */
1378   m_386 | m_PENT4 | m_NOCONA,
1379 
1380   /* X86_TUNE_QIMODE_MATH */
1381   ~0,
1382 
1383   /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1384      register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL, this option
1385      might be considered for Generic32 if our scheme for avoiding partial
1386      stalls were more effective.  */
1387   ~m_PPRO,
1388 
1389   /* X86_TUNE_PROMOTE_QI_REGS */
1390   0,
1391 
1392   /* X86_TUNE_PROMOTE_HI_REGS */
1393   m_PPRO,
1394 
1395   /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop.  */
1396   m_ATOM | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT4 | m_NOCONA
1397   | m_CORE2 | m_GENERIC,
1398 
1399   /* X86_TUNE_ADD_ESP_8 */
1400   m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_K6_GEODE | m_386
1401   | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1402 
1403   /* X86_TUNE_SUB_ESP_4 */
1404   m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2
1405   | m_GENERIC,
1406 
1407   /* X86_TUNE_SUB_ESP_8 */
1408   m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_386 | m_486
1409   | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1410 
1411   /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1412      for DFmode copies.  */
1413   ~(m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1414     | m_GENERIC | m_GEODE),
1415 
1416   /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1417   m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1418 
1419   /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1420      conflict here between PPro/Pentium4 based chips that treat 128bit SSE
1421      registers as single units and K8 based chips that divide SSE registers
1422      into two 64bit halves.  This knob promotes all store destinations to
1423      128bit to allow register renaming on 128bit SSE units, but usually
1424      results in one extra micro-op on 64bit SSE units.  Experimental results
1425      show that disabling this option on the P4 brings an over 20% SPECfp
1426      regression, while enabling it on the K8 brings a roughly 2.4% regression
1427      that can be partly masked by careful scheduling of moves.  */
1428   m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
1429   | m_AMDFAM10,
1430 
1431   /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
1432   m_AMDFAM10,
1433 
1434   /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1435      are resolved on SSE register parts instead of whole registers, so we may
1436      maintain just the lower part of scalar values in the proper format,
1437      leaving the upper part undefined.  */
1438   m_ATHLON_K8,
1439 
1440   /* X86_TUNE_SSE_TYPELESS_STORES */
1441   m_AMD_MULTIPLE,
1442 
1443   /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1444   m_PPRO | m_PENT4 | m_NOCONA,
1445 
1446   /* X86_TUNE_MEMORY_MISMATCH_STALL */
1447   m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1448 
1449   /* X86_TUNE_PROLOGUE_USING_MOVE */
1450   m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2 | m_GENERIC,
1451 
1452   /* X86_TUNE_EPILOGUE_USING_MOVE */
1453   m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2 | m_GENERIC,
1454 
1455   /* X86_TUNE_SHIFT1 */
1456   ~m_486,
1457 
1458   /* X86_TUNE_USE_FFREEP */
1459   m_AMD_MULTIPLE,
1460 
1461   /* X86_TUNE_INTER_UNIT_MOVES */
1462   ~(m_AMD_MULTIPLE | m_GENERIC),
1463 
1464   /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1465   ~(m_AMDFAM10),
1466 
1467   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1468      than 4 branch instructions in the 16 byte window.  */
1469   m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2
1470   | m_GENERIC,
1471 
1472   /* X86_TUNE_SCHEDULE */
1473   m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_ATOM | m_CORE2
1474   | m_GENERIC,
1475 
1476   /* X86_TUNE_USE_BT */
1477   m_AMD_MULTIPLE | m_ATOM | m_CORE2 | m_GENERIC,
1478 
1479   /* X86_TUNE_USE_INCDEC */
1480   ~(m_PENT4 | m_NOCONA | m_GENERIC | m_ATOM),
1481 
1482   /* X86_TUNE_PAD_RETURNS */
1483   m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,
1484 
1485   /* X86_TUNE_EXT_80387_CONSTANTS */
1486   m_K6_GEODE | m_ATHLON_K8 | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO
1487   | m_CORE2 | m_GENERIC,
1488 
1489   /* X86_TUNE_SHORTEN_X87_SSE */
1490   ~m_K8,
1491 
1492   /* X86_TUNE_AVOID_VECTOR_DECODE */
1493   m_K8 | m_GENERIC64,
1494 
1495   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1496      and SImode multiplies, but the 386 and 486 do a HImode multiply faster.  */
1497   ~(m_386 | m_486),
1498 
1499   /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is a
1500      vector path on AMD machines.  */
1501   m_K8 | m_GENERIC64 | m_AMDFAM10,
1502 
1503   /* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector path on
1504      AMD machines.  */
1505   m_K8 | m_GENERIC64 | m_AMDFAM10,
1506 
1507   /* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
1508      than via a MOV.  */
1509   m_PENT,
1510 
1511   /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1512      but one byte longer.  */
1513   m_PENT,
1514 
1515   /* X86_TUNE_NOT_VECTORMODE: On the AMD K6, NOT is vector decoded with a
1516      memory operand that cannot be represented using a modRM byte.  The XOR
1517      replacement is long decoded, so this split helps here as well.  */
1518   m_K6,
1519 
1520   /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1521      from FP to FP. */
1522   m_AMDFAM10 | m_GENERIC,
1523 
1524   /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1525      from integer to FP. */
1526   m_AMDFAM10,
1527 
1528   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1529      with a subsequent conditional jump instruction into a single
1530      compare-and-branch uop.  */
1531   m_CORE2,
1532 
1533   /* X86_TUNE_OPT_AGU: Optimize for the Address Generation Unit.  This flag
1534      will impact LEA instruction selection.  */
1535   m_ATOM,
1536 };
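/* A rough sketch (not literal code from this table) of how the masks above
   are consumed: override_options, later in this file, derives the per-knob
   booleans approximately as

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   so each entry is simply the set of processors for which the corresponding
   X86_TUNE_* knob should be enabled.  */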
1537 
1538 /* Feature tests against the various architecture variations.  */
1539 unsigned char ix86_arch_features[X86_ARCH_LAST];
1540 
1541 /* Feature tests against the various architecture variations, used to create
1542    ix86_arch_features based on the processor mask.  */
1543 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1544   /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
1545   ~(m_386 | m_486 | m_PENT | m_K6),
1546 
1547   /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
1548   ~m_386,
1549 
1550   /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1551   ~(m_386 | m_486),
1552 
1553   /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
1554   ~m_386,
1555 
1556   /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
1557   ~m_386,
1558 };
1559 
1560 static const unsigned int x86_accumulate_outgoing_args
1561   = m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1562     | m_GENERIC;
1563 
1564 static const unsigned int x86_arch_always_fancy_math_387
1565   = m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4
1566     | m_NOCONA | m_CORE2 | m_GENERIC;
1567 
1568 static enum stringop_alg stringop_alg = no_stringop;
1569 
1570 /* In case the average insn count for a single function invocation is
1571    lower than this constant, emit fast (but longer) prologue and
1572    epilogue code.  */
1573 #define FAST_PROLOGUE_INSN_COUNT 20
1574 
1575 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
1576 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1577 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1578 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1579 
1580 /* Array of the smallest class containing reg number REGNO, indexed by
1581    REGNO.  Used by REGNO_REG_CLASS in i386.h.  */
1582 
1583 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1584 {
1585   /* ax, dx, cx, bx */
1586   AREG, DREG, CREG, BREG,
1587   /* si, di, bp, sp */
1588   SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1589   /* FP registers */
1590   FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1591   FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1592   /* arg pointer */
1593   NON_Q_REGS,
1594   /* flags, fpsr, fpcr, frame */
1595   NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1596   /* SSE registers */
1597   SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1598   SSE_REGS, SSE_REGS,
1599   /* MMX registers */
1600   MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1601   MMX_REGS, MMX_REGS,
1602   /* REX registers */
1603   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1604   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1605   /* SSE REX registers */
1606   SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1607   SSE_REGS, SSE_REGS,
1608 };
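/* Illustration (assuming the usual definition in i386.h): REGNO_REG_CLASS
   simply indexes the table above,

     #define REGNO_REG_CLASS(REGNO) (regclass_map[(REGNO)])

   so, for example, REGNO_REG_CLASS (AX_REG) is AREG while the stack pointer
   maps to NON_Q_REGS.  */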
1609 
1610 /* The "default" register map used in 32bit mode.  */
1611 
1612 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1613 {
1614   0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
1615   12, 13, 14, 15, 16, 17, 18, 19,	/* fp regs */
1616   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
1617   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE */
1618   29, 30, 31, 32, 33, 34, 35, 36,       /* MMX */
1619   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
1620   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
1621 };
1622 
1623 /* The "default" register map used in 64bit mode.  */
1624 
1625 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1626 {
1627   0, 1, 2, 3, 4, 5, 6, 7,		/* general regs */
1628   33, 34, 35, 36, 37, 38, 39, 40,	/* fp regs */
1629   -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
1630   17, 18, 19, 20, 21, 22, 23, 24,	/* SSE */
1631   41, 42, 43, 44, 45, 46, 47, 48,       /* MMX */
1632   8,9,10,11,12,13,14,15,		/* extended integer registers */
1633   25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
1634 };
1635 
1636 /* Define the register numbers to be used in Dwarf debugging information.
1637    The SVR4 reference port C compiler uses the following register numbers
1638    in its Dwarf output code:
1639 	0 for %eax (gcc regno = 0)
1640 	1 for %ecx (gcc regno = 2)
1641 	2 for %edx (gcc regno = 1)
1642 	3 for %ebx (gcc regno = 3)
1643 	4 for %esp (gcc regno = 7)
1644 	5 for %ebp (gcc regno = 6)
1645 	6 for %esi (gcc regno = 4)
1646 	7 for %edi (gcc regno = 5)
1647    The following three DWARF register numbers are never generated by
1648    the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1649    believes these numbers have these meanings.
1650 	8  for %eip    (no gcc equivalent)
1651 	9  for %eflags (gcc regno = 17)
1652 	10 for %trapno (no gcc equivalent)
1653    It is not at all clear how we should number the FP stack registers
1654    for the x86 architecture.  If the version of SDB on x86/svr4 were
1655    a bit less brain dead with respect to floating-point then we would
1656    have a precedent to follow with respect to DWARF register numbers
1657    for x86 FP registers, but the SDB on x86/svr4 is so completely
1658    broken with respect to FP registers that it is hardly worth thinking
1659    of it as something to strive for compatibility with.
1660    The version of x86/svr4 SDB I have at the moment does (partially)
1661    seem to believe that DWARF register number 11 is associated with
1662    the x86 register %st(0), but that's about all.  Higher DWARF
1663    register numbers don't seem to be associated with anything in
1664    particular, and even for DWARF regno 11, SDB only seems to under-
1665    stand that it should say that a variable lives in %st(0) (when
1666    asked via an `=' command) if we said it was in DWARF regno 11,
1667    but SDB still prints garbage when asked for the value of the
1668    variable in question (via a `/' command).
1669    (Also note that the labels SDB prints for various FP stack regs
1670    when doing an `x' command are all wrong.)
1671    Note that these problems generally don't affect the native SVR4
1672    C compiler because it doesn't allow the use of -O with -g and
1673    because when it is *not* optimizing, it allocates a memory
1674    location for each floating-point variable, and the memory
1675    location is what gets described in the DWARF AT_location
1676    attribute for the variable in question.
1677    Regardless of the severe mental illness of the x86/svr4 SDB, we
1678    do something sensible here and we use the following DWARF
1679    register numbers.  Note that these are all stack-top-relative
1680    numbers.
1681 	11 for %st(0) (gcc regno = 8)
1682 	12 for %st(1) (gcc regno = 9)
1683 	13 for %st(2) (gcc regno = 10)
1684 	14 for %st(3) (gcc regno = 11)
1685 	15 for %st(4) (gcc regno = 12)
1686 	16 for %st(5) (gcc regno = 13)
1687 	17 for %st(6) (gcc regno = 14)
1688 	18 for %st(7) (gcc regno = 15)
1689 */
1690 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1691 {
1692   0, 2, 1, 3, 6, 7, 5, 4,		/* general regs */
1693   11, 12, 13, 14, 15, 16, 17, 18,	/* fp regs */
1694   -1, 9, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
1695   21, 22, 23, 24, 25, 26, 27, 28,	/* SSE registers */
1696   29, 30, 31, 32, 33, 34, 35, 36,	/* MMX registers */
1697   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
1698   -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
1699 };
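/* Illustrative note: which of these maps is used for debug output is decided
   by the target headers, roughly along the lines of

     #define DBX_REGISTER_NUMBER(n) \
       (TARGET_64BIT ? dbx64_register_map[n] : dbx_register_map[n])

   with SVR4-derived targets substituting svr4_dbx_register_map to get the
   DWARF numbering described above; the exact macro lives in i386.h and the
   OS-specific headers, not here.  */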
1700 
1701 /* Test and compare insns in i386.md store the information needed to
1702    generate branch and scc insns here.  */
1703 
1704 rtx ix86_compare_op0 = NULL_RTX;
1705 rtx ix86_compare_op1 = NULL_RTX;
1706 
1707 /* Define parameter passing and return registers.  */
1708 
1709 static int const x86_64_int_parameter_registers[6] =
1710 {
1711   DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
1712 };
1713 
1714 static int const x86_64_ms_abi_int_parameter_registers[4] =
1715 {
1716   CX_REG, DX_REG, R8_REG, R9_REG
1717 };
1718 
1719 static int const x86_64_int_return_registers[4] =
1720 {
1721   AX_REG, DX_REG, DI_REG, SI_REG
1722 };
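/* For example, under the SysV AMD64 convention a call f (a, b, c) with three
   integer arguments passes them in %rdi, %rsi and %rdx - the first three
   entries of x86_64_int_parameter_registers - while the MS ABI uses %rcx,
   %rdx and %r8 for the same call.  */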
1723 
1724 /* Define the structure for the machine field in struct function.  */
1725 
1726 struct GTY(()) stack_local_entry {
1727   unsigned short mode;
1728   unsigned short n;
1729   rtx rtl;
1730   struct stack_local_entry *next;
1731 };
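/* Sketch of the intended use (see assign_386_stack_local later in this
   file): the per-function list is searched for an existing slot before a
   new one is created, roughly

     for (s = ix86_stack_locals; s; s = s->next)
       if (s->mode == mode && s->n == n)
         return copy_rtx (s->rtl);

   so repeated requests for the same scratch slot share one stack
   location.  */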
1732 
1733 /* Structure describing stack frame layout.
1734    Stack grows downward:
1735 
1736    [arguments]
1737 					      <- ARG_POINTER
1738    saved pc
1739 
1740    saved frame pointer if frame_pointer_needed
1741 					      <- HARD_FRAME_POINTER
1742    [saved regs]
1743 
1744    [padding0]
1745 
1746    [saved SSE regs]
1747 
1748    [padding1]          \
1749 		        )
1750    [va_arg registers]  (
1751 		        > to_allocate	      <- FRAME_POINTER
1752    [frame]	       (
1753 		        )
1754    [padding2]	       /
1755   */
1756 struct ix86_frame
1757 {
1758   int padding0;
1759   int nsseregs;
1760   int nregs;
1761   int padding1;
1762   int va_arg_size;
1763   HOST_WIDE_INT frame;
1764   int padding2;
1765   int outgoing_arguments_size;
1766   int red_zone_size;
1767 
1768   HOST_WIDE_INT to_allocate;
1769   /* The offsets relative to ARG_POINTER.  */
1770   HOST_WIDE_INT frame_pointer_offset;
1771   HOST_WIDE_INT hard_frame_pointer_offset;
1772   HOST_WIDE_INT stack_pointer_offset;
1773 
1774   /* When save_regs_using_mov is set, emit prologue using
1775      move instead of push instructions.  */
1776   bool save_regs_using_mov;
1777 };
1778 
1779 /* Code model option.  */
1780 enum cmodel ix86_cmodel;
1781 /* Asm dialect.  */
1782 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1783 /* TLS dialects.  */
1784 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1785 
1786 /* Which unit we are generating floating point math for.  */
1787 enum fpmath_unit ix86_fpmath;
1788 
1789 /* Which cpu are we scheduling for.  */
1790 enum attr_cpu ix86_schedule;
1791 
1792 /* Which cpu are we optimizing for.  */
1793 enum processor_type ix86_tune;
1794 
1795 /* Which instruction set architecture to use.  */
1796 enum processor_type ix86_arch;
1797 
1798 /* True if the SSE prefetch instruction is not a NOOP.  */
1799 int x86_prefetch_sse;
1800 
1801 /* ix86_regparm_string as a number */
1802 static int ix86_regparm;
1803 
1804 /* -mstackrealign option */
1805 extern int ix86_force_align_arg_pointer;
1806 static const char ix86_force_align_arg_pointer_string[]
1807   = "force_align_arg_pointer";
1808 
1809 static rtx (*ix86_gen_leave) (void);
1810 static rtx (*ix86_gen_pop1) (rtx);
1811 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
1812 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
1813 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
1814 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
1815 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
1816 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
1817 
1818 /* Preferred alignment for stack boundary in bits.  */
1819 unsigned int ix86_preferred_stack_boundary;
1820 
1821 /* Alignment for incoming stack boundary in bits specified at
1822    command line.  */
1823 static unsigned int ix86_user_incoming_stack_boundary;
1824 
1825 /* Default alignment for incoming stack boundary in bits.  */
1826 static unsigned int ix86_default_incoming_stack_boundary;
1827 
1828 /* Alignment for incoming stack boundary in bits.  */
1829 unsigned int ix86_incoming_stack_boundary;
1830 
1831 /* The ABI used by the target.  */
1832 enum calling_abi ix86_abi;
1833 
1834 /* Values 1-5: see jump.c */
1835 int ix86_branch_cost;
1836 
1837 /* Calling abi specific va_list type nodes.  */
1838 static GTY(()) tree sysv_va_list_type_node;
1839 static GTY(()) tree ms_va_list_type_node;
1840 
1841 /* Variables which are this size or smaller are put in the data/bss
1842    or ldata/lbss sections.  */
1843 
1844 int ix86_section_threshold = 65536;
1845 
1846 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL.  */
1847 char internal_label_prefix[16];
1848 int internal_label_prefix_len;
1849 
1850 /* Fence to use after loop using movnt.  */
1851 tree x86_mfence;
1852 
1853 /* Register class used for passing a given 64bit part of the argument.
1854    These represent the classes documented by the psABI, except that the
1855    SSESF and SSEDF classes are basically the SSE class; gcc just uses an
1856    SFmode or DFmode move instead of DImode to avoid reformatting penalties.
1857 
1858    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
1859    whenever possible (the upper half does contain padding).  */
1860 enum x86_64_reg_class
1861   {
1862     X86_64_NO_CLASS,
1863     X86_64_INTEGER_CLASS,
1864     X86_64_INTEGERSI_CLASS,
1865     X86_64_SSE_CLASS,
1866     X86_64_SSESF_CLASS,
1867     X86_64_SSEDF_CLASS,
1868     X86_64_SSEUP_CLASS,
1869     X86_64_X87_CLASS,
1870     X86_64_X87UP_CLASS,
1871     X86_64_COMPLEX_X87_CLASS,
1872     X86_64_MEMORY_CLASS
1873   };
1874 
1875 #define MAX_CLASSES 4
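/* Classification example (a sketch, not part of the classifier itself):
   under the rules above an aggregate such as

     struct example { double d; int i; };

   occupies two eightbytes; the first is classified X86_64_SSEDF_CLASS and
   the second X86_64_INTEGERSI_CLASS, which is why such a structure is
   passed in one SSE register and one general register.  */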
1876 
1877 /* Table of constants used by fldpi, fldln2, etc....  */
1878 static REAL_VALUE_TYPE ext_80387_constants_table [5];
1879 static bool ext_80387_constants_init = 0;
1880 
1881 
1882 static struct machine_function * ix86_init_machine_status (void);
1883 static rtx ix86_function_value (const_tree, const_tree, bool);
1884 static rtx ix86_static_chain (const_tree, bool);
1885 static int ix86_function_regparm (const_tree, const_tree);
1886 static void ix86_compute_frame_layout (struct ix86_frame *);
1887 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
1888 						 rtx, rtx, int);
1889 static void ix86_add_new_builtins (int);
1890 static rtx ix86_expand_vec_perm_builtin (tree);
1891 
1892 enum ix86_function_specific_strings
1893 {
1894   IX86_FUNCTION_SPECIFIC_ARCH,
1895   IX86_FUNCTION_SPECIFIC_TUNE,
1896   IX86_FUNCTION_SPECIFIC_FPMATH,
1897   IX86_FUNCTION_SPECIFIC_MAX
1898 };
1899 
1900 static char *ix86_target_string (int, int, const char *, const char *,
1901 				 const char *, bool);
1902 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
1903 static void ix86_function_specific_save (struct cl_target_option *);
1904 static void ix86_function_specific_restore (struct cl_target_option *);
1905 static void ix86_function_specific_print (FILE *, int,
1906 					  struct cl_target_option *);
1907 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
1908 static bool ix86_valid_target_attribute_inner_p (tree, char *[]);
1909 static bool ix86_can_inline_p (tree, tree);
1910 static void ix86_set_current_function (tree);
1911 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
1912 
1913 static enum calling_abi ix86_function_abi (const_tree);
1914 
1915 
1916 #ifndef SUBTARGET32_DEFAULT_CPU
1917 #define SUBTARGET32_DEFAULT_CPU "i386"
1918 #endif
1919 
1920 /* The svr4 ABI for the i386 says that records and unions are returned
1921    in memory.  */
1922 #ifndef DEFAULT_PCC_STRUCT_RETURN
1923 #define DEFAULT_PCC_STRUCT_RETURN 1
1924 #endif
1925 
1926 /* Whether -mtune= or -march= were specified */
1927 static int ix86_tune_defaulted;
1928 static int ix86_arch_specified;
1929 
1930 /* Bit flags that specify the ISA we are compiling for.  */
1931 int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT;
1932 
1933 /* A mask of ix86_isa_flags that includes bit X if X
1934    was set or cleared on the command line.  */
1935 static int ix86_isa_flags_explicit;
1936 
1937 /* Define a set of ISAs which are available when a given ISA is
1938    enabled.  MMX and SSE ISAs are handled separately.  */
1939 
1940 #define OPTION_MASK_ISA_MMX_SET OPTION_MASK_ISA_MMX
1941 #define OPTION_MASK_ISA_3DNOW_SET \
1942   (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_MMX_SET)
1943 
1944 #define OPTION_MASK_ISA_SSE_SET OPTION_MASK_ISA_SSE
1945 #define OPTION_MASK_ISA_SSE2_SET \
1946   (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE_SET)
1947 #define OPTION_MASK_ISA_SSE3_SET \
1948   (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE2_SET)
1949 #define OPTION_MASK_ISA_SSSE3_SET \
1950   (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE3_SET)
1951 #define OPTION_MASK_ISA_SSE4_1_SET \
1952   (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET)
1953 #define OPTION_MASK_ISA_SSE4_2_SET \
1954   (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET)
1955 #define OPTION_MASK_ISA_AVX_SET \
1956   (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_SSE4_2_SET)
1957 #define OPTION_MASK_ISA_FMA_SET \
1958   (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_AVX_SET)
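/* For instance, expanding the nested definitions above gives
   OPTION_MASK_ISA_AVX_SET
     == AVX | SSE4_2 | SSE4_1 | SSSE3 | SSE3 | SSE2 | SSE,
   so a single -mavx also turns on every earlier SSE generation it
   depends on.  */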
1959 
1960 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
1961    as -msse4.2.  */
1962 #define OPTION_MASK_ISA_SSE4_SET OPTION_MASK_ISA_SSE4_2_SET
1963 
1964 #define OPTION_MASK_ISA_SSE4A_SET \
1965   (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET)
1966 #define OPTION_MASK_ISA_FMA4_SET \
1967   (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_SSE4A_SET \
1968    | OPTION_MASK_ISA_AVX_SET)
1969 #define OPTION_MASK_ISA_XOP_SET \
1970   (OPTION_MASK_ISA_XOP | OPTION_MASK_ISA_FMA4_SET)
1971 #define OPTION_MASK_ISA_LWP_SET \
1972   OPTION_MASK_ISA_LWP
1973 
1974 /* AES and PCLMUL need SSE2 because they use xmm registers */
1975 #define OPTION_MASK_ISA_AES_SET \
1976   (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2_SET)
1977 #define OPTION_MASK_ISA_PCLMUL_SET \
1978   (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2_SET)
1979 
1980 #define OPTION_MASK_ISA_ABM_SET \
1981   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
1982 
1983 #define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT
1984 #define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16
1985 #define OPTION_MASK_ISA_SAHF_SET OPTION_MASK_ISA_SAHF
1986 #define OPTION_MASK_ISA_MOVBE_SET OPTION_MASK_ISA_MOVBE
1987 #define OPTION_MASK_ISA_CRC32_SET OPTION_MASK_ISA_CRC32
1988 
1989 /* Define a set of ISAs which aren't available when a given ISA is
1990    disabled.  MMX and SSE ISAs are handled separately.  */
1991 
1992 #define OPTION_MASK_ISA_MMX_UNSET \
1993   (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_3DNOW_UNSET)
1994 #define OPTION_MASK_ISA_3DNOW_UNSET \
1995   (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_A_UNSET)
1996 #define OPTION_MASK_ISA_3DNOW_A_UNSET OPTION_MASK_ISA_3DNOW_A
1997 
1998 #define OPTION_MASK_ISA_SSE_UNSET \
1999   (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2_UNSET)
2000 #define OPTION_MASK_ISA_SSE2_UNSET \
2001   (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE3_UNSET)
2002 #define OPTION_MASK_ISA_SSE3_UNSET \
2003   (OPTION_MASK_ISA_SSE3 \
2004    | OPTION_MASK_ISA_SSSE3_UNSET \
2005    | OPTION_MASK_ISA_SSE4A_UNSET )
2006 #define OPTION_MASK_ISA_SSSE3_UNSET \
2007   (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET)
2008 #define OPTION_MASK_ISA_SSE4_1_UNSET \
2009   (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET)
2010 #define OPTION_MASK_ISA_SSE4_2_UNSET \
2011   (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_AVX_UNSET )
2012 #define OPTION_MASK_ISA_AVX_UNSET \
2013   (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET \
2014    | OPTION_MASK_ISA_FMA4_UNSET)
2015 #define OPTION_MASK_ISA_FMA_UNSET OPTION_MASK_ISA_FMA
2016 
2017 /* SSE4 includes both SSE4.1 and SSE4.2.  -mno-sse4 should be the same
2018    as -mno-sse4.1.  */
2019 #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
2020 
2021 #define OPTION_MASK_ISA_SSE4A_UNSET \
2022   (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_FMA4_UNSET)
2023 
2024 #define OPTION_MASK_ISA_FMA4_UNSET \
2025   (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_XOP_UNSET)
2026 #define OPTION_MASK_ISA_XOP_UNSET OPTION_MASK_ISA_XOP
2027 #define OPTION_MASK_ISA_LWP_UNSET OPTION_MASK_ISA_LWP
2028 
2029 #define OPTION_MASK_ISA_AES_UNSET OPTION_MASK_ISA_AES
2030 #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
2031 #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
2032 #define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT
2033 #define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16
2034 #define OPTION_MASK_ISA_SAHF_UNSET OPTION_MASK_ISA_SAHF
2035 #define OPTION_MASK_ISA_MOVBE_UNSET OPTION_MASK_ISA_MOVBE
2036 #define OPTION_MASK_ISA_CRC32_UNSET OPTION_MASK_ISA_CRC32
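/* Conversely, the UNSET masks cascade towards the newer ISAs: expanding the
   definitions above, OPTION_MASK_ISA_SSE2_UNSET covers
     SSE2 | SSE3 | SSSE3 | SSE4_1 | SSE4_2 | AVX | FMA | SSE4A | FMA4 | XOP,
   so -mno-sse2 also disables everything that depends on SSE2.  */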
2037 
2038 /* Vectorization library interface and handlers.  */
2039 tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL;
2040 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2041 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2042 
2043 /* Processor target table, indexed by processor number */
2044 struct ptt
2045 {
2046   const struct processor_costs *cost;		/* Processor costs */
2047   const int align_loop;				/* Default alignments.  */
2048   const int align_loop_max_skip;
2049   const int align_jump;
2050   const int align_jump_max_skip;
2051   const int align_func;
2052 };
2053 
2054 static const struct ptt processor_target_table[PROCESSOR_max] =
2055 {
2056   {&i386_cost, 4, 3, 4, 3, 4},
2057   {&i486_cost, 16, 15, 16, 15, 16},
2058   {&pentium_cost, 16, 7, 16, 7, 16},
2059   {&pentiumpro_cost, 16, 15, 16, 10, 16},
2060   {&geode_cost, 0, 0, 0, 0, 0},
2061   {&k6_cost, 32, 7, 32, 7, 32},
2062   {&athlon_cost, 16, 7, 16, 7, 16},
2063   {&pentium4_cost, 0, 0, 0, 0, 0},
2064   {&k8_cost, 16, 7, 16, 7, 16},
2065   {&nocona_cost, 0, 0, 0, 0, 0},
2066   {&core2_cost, 16, 10, 16, 10, 16},
2067   {&generic32_cost, 16, 7, 16, 7, 16},
2068   {&generic64_cost, 16, 10, 16, 10, 16},
2069   {&amdfam10_cost, 32, 24, 32, 7, 32},
2070   {&atom_cost, 16, 7, 16, 7, 16}
2071 };
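/* Sketch of how this table is consumed (see override_options below): the
   active cost table is selected roughly as

     ix86_cost = optimize_size ? &ix86_size_cost
                               : processor_target_table[ix86_tune].cost;

   and the align_* fields provide defaults for align_loops, align_jumps and
   align_functions when the user has not set them explicitly.  */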
2072 
2073 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2074 {
2075   "generic",
2076   "i386",
2077   "i486",
2078   "pentium",
2079   "pentium-mmx",
2080   "pentiumpro",
2081   "pentium2",
2082   "pentium3",
2083   "pentium4",
2084   "pentium-m",
2085   "prescott",
2086   "nocona",
2087   "core2",
2088   "atom",
2089   "geode",
2090   "k6",
2091   "k6-2",
2092   "k6-3",
2093   "athlon",
2094   "athlon-4",
2095   "k8",
2096   "amdfam10"
2097 };
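/* Note: when the user gives neither -march= nor -mtune=, override_options
   falls back on the configured default, roughly

     ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];

   so the strings above must stay in sync with the TARGET_CPU_DEFAULT_*
   enumeration.  */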
2098 
2099 /* Implement TARGET_HANDLE_OPTION.  */
2100 
2101 static bool
2102 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
2103 {
2104   switch (code)
2105     {
2106     case OPT_mmmx:
2107       if (value)
2108 	{
2109 	  ix86_isa_flags |= OPTION_MASK_ISA_MMX_SET;
2110 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_SET;
2111 	}
2112       else
2113 	{
2114 	  ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET;
2115 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET;
2116 	}
2117       return true;
2118 
2119     case OPT_m3dnow:
2120       if (value)
2121 	{
2122 	  ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_SET;
2123 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_SET;
2124 	}
2125       else
2126 	{
2127 	  ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET;
2128 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET;
2129 	}
2130       return true;
2131 
2132     case OPT_m3dnowa:
2133       return false;
2134 
2135     case OPT_msse:
2136       if (value)
2137 	{
2138 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE_SET;
2139 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_SET;
2140 	}
2141       else
2142 	{
2143 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET;
2144 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET;
2145 	}
2146       return true;
2147 
2148     case OPT_msse2:
2149       if (value)
2150 	{
2151 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET;
2152 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET;
2153 	}
2154       else
2155 	{
2156 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET;
2157 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET;
2158 	}
2159       return true;
2160 
2161     case OPT_msse3:
2162       if (value)
2163 	{
2164 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE3_SET;
2165 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_SET;
2166 	}
2167       else
2168 	{
2169 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET;
2170 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET;
2171 	}
2172       return true;
2173 
2174     case OPT_mssse3:
2175       if (value)
2176 	{
2177 	  ix86_isa_flags |= OPTION_MASK_ISA_SSSE3_SET;
2178 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_SET;
2179 	}
2180       else
2181 	{
2182 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET;
2183 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET;
2184 	}
2185       return true;
2186 
2187     case OPT_msse4_1:
2188       if (value)
2189 	{
2190 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1_SET;
2191 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_SET;
2192 	}
2193       else
2194 	{
2195 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET;
2196 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET;
2197 	}
2198       return true;
2199 
2200     case OPT_msse4_2:
2201       if (value)
2202 	{
2203 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2_SET;
2204 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_SET;
2205 	}
2206       else
2207 	{
2208 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET;
2209 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET;
2210 	}
2211       return true;
2212 
2213     case OPT_mavx:
2214       if (value)
2215 	{
2216 	  ix86_isa_flags |= OPTION_MASK_ISA_AVX_SET;
2217 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX_SET;
2218 	}
2219       else
2220 	{
2221 	  ix86_isa_flags &= ~OPTION_MASK_ISA_AVX_UNSET;
2222 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX_UNSET;
2223 	}
2224       return true;
2225 
2226     case OPT_mfma:
2227       if (value)
2228 	{
2229 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA_SET;
2230 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA_SET;
2231 	}
2232       else
2233 	{
2234 	  ix86_isa_flags &= ~OPTION_MASK_ISA_FMA_UNSET;
2235 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA_UNSET;
2236 	}
2237       return true;
2238 
2239     case OPT_msse4:
2240       ix86_isa_flags |= OPTION_MASK_ISA_SSE4_SET;
2241       ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_SET;
2242       return true;
2243 
2244     case OPT_mno_sse4:
2245       ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_UNSET;
2246       ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_UNSET;
2247       return true;
2248 
2249     case OPT_msse4a:
2250       if (value)
2251 	{
2252 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4A_SET;
2253 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_SET;
2254 	}
2255       else
2256 	{
2257 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET;
2258 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET;
2259 	}
2260       return true;
2261 
2262     case OPT_mfma4:
2263       if (value)
2264 	{
2265 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA4_SET;
2266 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_SET;
2267 	}
2268       else
2269 	{
2270 	  ix86_isa_flags &= ~OPTION_MASK_ISA_FMA4_UNSET;
2271 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_UNSET;
2272 	}
2273       return true;
2274 
2275    case OPT_mxop:
2276       if (value)
2277 	{
2278 	  ix86_isa_flags |= OPTION_MASK_ISA_XOP_SET;
2279 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_XOP_SET;
2280 	}
2281       else
2282 	{
2283 	  ix86_isa_flags &= ~OPTION_MASK_ISA_XOP_UNSET;
2284 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_XOP_UNSET;
2285 	}
2286       return true;
2287 
2288    case OPT_mlwp:
2289       if (value)
2290 	{
2291 	  ix86_isa_flags |= OPTION_MASK_ISA_LWP_SET;
2292 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_LWP_SET;
2293 	}
2294       else
2295 	{
2296 	  ix86_isa_flags &= ~OPTION_MASK_ISA_LWP_UNSET;
2297 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_LWP_UNSET;
2298 	}
2299       return true;
2300 
2301     case OPT_mabm:
2302       if (value)
2303 	{
2304 	  ix86_isa_flags |= OPTION_MASK_ISA_ABM_SET;
2305 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_ABM_SET;
2306 	}
2307       else
2308 	{
2309 	  ix86_isa_flags &= ~OPTION_MASK_ISA_ABM_UNSET;
2310 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_ABM_UNSET;
2311 	}
2312       return true;
2313 
2314     case OPT_mpopcnt:
2315       if (value)
2316 	{
2317 	  ix86_isa_flags |= OPTION_MASK_ISA_POPCNT_SET;
2318 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_POPCNT_SET;
2319 	}
2320       else
2321 	{
2322 	  ix86_isa_flags &= ~OPTION_MASK_ISA_POPCNT_UNSET;
2323 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_POPCNT_UNSET;
2324 	}
2325       return true;
2326 
2327     case OPT_msahf:
2328       if (value)
2329 	{
2330 	  ix86_isa_flags |= OPTION_MASK_ISA_SAHF_SET;
2331 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SAHF_SET;
2332 	}
2333       else
2334 	{
2335 	  ix86_isa_flags &= ~OPTION_MASK_ISA_SAHF_UNSET;
2336 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_SAHF_UNSET;
2337 	}
2338       return true;
2339 
2340     case OPT_mcx16:
2341       if (value)
2342 	{
2343 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16_SET;
2344 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_CX16_SET;
2345 	}
2346       else
2347 	{
2348 	  ix86_isa_flags &= ~OPTION_MASK_ISA_CX16_UNSET;
2349 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_CX16_UNSET;
2350 	}
2351       return true;
2352 
2353     case OPT_mmovbe:
2354       if (value)
2355 	{
2356 	  ix86_isa_flags |= OPTION_MASK_ISA_MOVBE_SET;
2357 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_MOVBE_SET;
2358 	}
2359       else
2360 	{
2361 	  ix86_isa_flags &= ~OPTION_MASK_ISA_MOVBE_UNSET;
2362 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_MOVBE_UNSET;
2363 	}
2364       return true;
2365 
2366     case OPT_mcrc32:
2367       if (value)
2368 	{
2369 	  ix86_isa_flags |= OPTION_MASK_ISA_CRC32_SET;
2370 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_CRC32_SET;
2371 	}
2372       else
2373 	{
2374 	  ix86_isa_flags &= ~OPTION_MASK_ISA_CRC32_UNSET;
2375 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_CRC32_UNSET;
2376 	}
2377       return true;
2378 
2379     case OPT_maes:
2380       if (value)
2381 	{
2382 	  ix86_isa_flags |= OPTION_MASK_ISA_AES_SET;
2383 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_AES_SET;
2384 	}
2385       else
2386 	{
2387 	  ix86_isa_flags &= ~OPTION_MASK_ISA_AES_UNSET;
2388 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_AES_UNSET;
2389 	}
2390       return true;
2391 
2392     case OPT_mpclmul:
2393       if (value)
2394 	{
2395 	  ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL_SET;
2396 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_PCLMUL_SET;
2397 	}
2398       else
2399 	{
2400 	  ix86_isa_flags &= ~OPTION_MASK_ISA_PCLMUL_UNSET;
2401 	  ix86_isa_flags_explicit |= OPTION_MASK_ISA_PCLMUL_UNSET;
2402 	}
2403       return true;
2404 
2405     default:
2406       return true;
2407     }
2408 }
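/* A worked example of the handler above, assuming options are processed in
   command-line order: for "-msse4.2 -mno-ssse3" the first option ORs in
   OPTION_MASK_ISA_SSE4_2_SET (SSE4.2 down through SSE), and the second
   clears OPTION_MASK_ISA_SSSE3_UNSET (SSSE3, SSE4.1, SSE4.2, AVX, ...), so
   the final ix86_isa_flags keeps SSE3/SSE2/SSE but loses SSSE3 and the
   later extensions.  */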
2409 
2410 /* Return a string that documents the current -m options.  The caller is
2411    responsible for freeing the string.  */
2412 
2413 static char *
2414 ix86_target_string (int isa, int flags, const char *arch, const char *tune,
2415 		    const char *fpmath, bool add_nl_p)
2416 {
2417   struct ix86_target_opts
2418   {
2419     const char *option;		/* option string */
2420     int mask;			/* isa mask options */
2421   };
2422 
2423   /* This table is ordered so that options like -msse4.2, which imply
2424      preceding options, match first.  */
2425   static struct ix86_target_opts isa_opts[] =
2426   {
2427     { "-m64",		OPTION_MASK_ISA_64BIT },
2428     { "-mfma4",		OPTION_MASK_ISA_FMA4 },
2429     { "-mfma",		OPTION_MASK_ISA_FMA },
2430     { "-mxop",		OPTION_MASK_ISA_XOP },
2431     { "-mlwp",		OPTION_MASK_ISA_LWP },
2432     { "-msse4a",	OPTION_MASK_ISA_SSE4A },
2433     { "-msse4.2",	OPTION_MASK_ISA_SSE4_2 },
2434     { "-msse4.1",	OPTION_MASK_ISA_SSE4_1 },
2435     { "-mssse3",	OPTION_MASK_ISA_SSSE3 },
2436     { "-msse3",		OPTION_MASK_ISA_SSE3 },
2437     { "-msse2",		OPTION_MASK_ISA_SSE2 },
2438     { "-msse",		OPTION_MASK_ISA_SSE },
2439     { "-m3dnow",	OPTION_MASK_ISA_3DNOW },
2440     { "-m3dnowa",	OPTION_MASK_ISA_3DNOW_A },
2441     { "-mmmx",		OPTION_MASK_ISA_MMX },
2442     { "-mabm",		OPTION_MASK_ISA_ABM },
2443     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
2444     { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
2445     { "-mcrc32",	OPTION_MASK_ISA_CRC32 },
2446     { "-maes",		OPTION_MASK_ISA_AES },
2447     { "-mpclmul",	OPTION_MASK_ISA_PCLMUL },
2448   };
2449 
2450   /* Flag options.  */
2451   static struct ix86_target_opts flag_opts[] =
2452   {
2453     { "-m128bit-long-double",		MASK_128BIT_LONG_DOUBLE },
2454     { "-m80387",			MASK_80387 },
2455     { "-maccumulate-outgoing-args",	MASK_ACCUMULATE_OUTGOING_ARGS },
2456     { "-malign-double",			MASK_ALIGN_DOUBLE },
2457     { "-mcld",				MASK_CLD },
2458     { "-mfp-ret-in-387",		MASK_FLOAT_RETURNS },
2459     { "-mieee-fp",			MASK_IEEE_FP },
2460     { "-minline-all-stringops",		MASK_INLINE_ALL_STRINGOPS },
2461     { "-minline-stringops-dynamically",	MASK_INLINE_STRINGOPS_DYNAMICALLY },
2462     { "-mms-bitfields",			MASK_MS_BITFIELD_LAYOUT },
2463     { "-mno-align-stringops",		MASK_NO_ALIGN_STRINGOPS },
2464     { "-mno-fancy-math-387",		MASK_NO_FANCY_MATH_387 },
2465     { "-mno-push-args",			MASK_NO_PUSH_ARGS },
2466     { "-mno-red-zone",			MASK_NO_RED_ZONE },
2467     { "-momit-leaf-frame-pointer",	MASK_OMIT_LEAF_FRAME_POINTER },
2468     { "-mrecip",			MASK_RECIP },
2469     { "-mrtd",				MASK_RTD },
2470     { "-msseregparm",			MASK_SSEREGPARM },
2471     { "-mstack-arg-probe",		MASK_STACK_PROBE },
2472     { "-mtls-direct-seg-refs",		MASK_TLS_DIRECT_SEG_REFS },
2473   };
2474 
2475   const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2476 
2477   char isa_other[40];
2478   char target_other[40];
2479   unsigned num = 0;
2480   unsigned i, j;
2481   char *ret;
2482   char *ptr;
2483   size_t len;
2484   size_t line_len;
2485   size_t sep_len;
2486 
2487   memset (opts, '\0', sizeof (opts));
2488 
2489   /* Add -march= option.  */
2490   if (arch)
2491     {
2492       opts[num][0] = "-march=";
2493       opts[num++][1] = arch;
2494     }
2495 
2496   /* Add -mtune= option.  */
2497   if (tune)
2498     {
2499       opts[num][0] = "-mtune=";
2500       opts[num++][1] = tune;
2501     }
2502 
2503   /* Pick out the options in isa options.  */
2504   for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2505     {
2506       if ((isa & isa_opts[i].mask) != 0)
2507 	{
2508 	  opts[num++][0] = isa_opts[i].option;
2509 	  isa &= ~ isa_opts[i].mask;
2510 	}
2511     }
2512 
2513   if (isa && add_nl_p)
2514     {
2515       opts[num++][0] = isa_other;
2516       sprintf (isa_other, "(other isa: 0x%x)", isa);
2517     }
2518 
2519   /* Add flag options.  */
2520   for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2521     {
2522       if ((flags & flag_opts[i].mask) != 0)
2523 	{
2524 	  opts[num++][0] = flag_opts[i].option;
2525 	  flags &= ~ flag_opts[i].mask;
2526 	}
2527     }
2528 
2529   if (flags && add_nl_p)
2530     {
2531       opts[num++][0] = target_other;
2532       sprintf (target_other, "(other flags: 0x%x)", flags);
2533     }
2534 
2535   /* Add -fpmath= option.  */
2536   if (fpmath)
2537     {
2538       opts[num][0] = "-mfpmath=";
2539       opts[num++][1] = fpmath;
2540     }
2541 
2542   /* Any options?  */
2543   if (num == 0)
2544     return NULL;
2545 
2546   gcc_assert (num < ARRAY_SIZE (opts));
2547 
2548   /* Size the string.  */
2549   len = 0;
2550   sep_len = (add_nl_p) ? 3 : 1;
2551   for (i = 0; i < num; i++)
2552     {
2553       len += sep_len;
2554       for (j = 0; j < 2; j++)
2555 	if (opts[i][j])
2556 	  len += strlen (opts[i][j]);
2557     }
2558 
2559   /* Build the string.  */
2560   ret = ptr = (char *) xmalloc (len);
2561   line_len = 0;
2562 
2563   for (i = 0; i < num; i++)
2564     {
2565       size_t len2[2];
2566 
2567       for (j = 0; j < 2; j++)
2568 	len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2569 
2570       if (i != 0)
2571 	{
2572 	  *ptr++ = ' ';
2573 	  line_len++;
2574 
2575 	  if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2576 	    {
2577 	      *ptr++ = '\\';
2578 	      *ptr++ = '\n';
2579 	      line_len = 0;
2580 	    }
2581 	}
2582 
2583       for (j = 0; j < 2; j++)
2584 	if (opts[i][j])
2585 	  {
2586 	    memcpy (ptr, opts[i][j], len2[j]);
2587 	    ptr += len2[j];
2588 	    line_len += len2[j];
2589 	  }
2590     }
2591 
2592   *ptr = '\0';
2593   gcc_assert (ret + len >= ptr);
2594 
2595   return ret;
2596 }
2597 
2598 /* Function that is callable from the debugger to print the current
2599    options.  */
2600 void
2601 ix86_debug_options (void)
2602 {
2603   char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2604 				   ix86_arch_string, ix86_tune_string,
2605 				   ix86_fpmath_string, true);
2606 
2607   if (opts)
2608     {
2609       fprintf (stderr, "%s\n\n", opts);
2610       free (opts);
2611     }
2612   else
2613     fputs ("<no options>\n\n", stderr);
2614 
2615   return;
2616 }
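/* Typical use of the helper above (a debugging aid only), e.g. from gdb
   while stopped inside cc1:

     (gdb) call ix86_debug_options ()

   which writes the reconstructed option string, or "<no options>", to
   stderr.  */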
2617 
2618 /* Sometimes certain combinations of command options do not make
2619    sense on a particular target machine.  You can define a macro
2620    `OVERRIDE_OPTIONS' to take account of this.  This macro, if
2621    defined, is executed once just after all the command options have
2622    been parsed.
2623 
2624    Don't use this macro to turn on various extra optimizations for
2625    `-O'.  That is what `OPTIMIZATION_OPTIONS' is for.  */
2626 
2627 void
2628 override_options (bool main_args_p)
2629 {
2630   int i;
2631   unsigned int ix86_arch_mask, ix86_tune_mask;
2632   const bool ix86_tune_specified = (ix86_tune_string != NULL);
2633   const char *prefix;
2634   const char *suffix;
2635   const char *sw;
2636 
2637   /* Comes from final.c -- no real reason to change it.  */
2638 #define MAX_CODE_ALIGN 16
2639 
2640   enum pta_flags
2641     {
2642       PTA_SSE = 1 << 0,
2643       PTA_SSE2 = 1 << 1,
2644       PTA_SSE3 = 1 << 2,
2645       PTA_MMX = 1 << 3,
2646       PTA_PREFETCH_SSE = 1 << 4,
2647       PTA_3DNOW = 1 << 5,
2648       PTA_3DNOW_A = 1 << 6,
2649       PTA_64BIT = 1 << 7,
2650       PTA_SSSE3 = 1 << 8,
2651       PTA_CX16 = 1 << 9,
2652       PTA_POPCNT = 1 << 10,
2653       PTA_ABM = 1 << 11,
2654       PTA_SSE4A = 1 << 12,
2655       PTA_NO_SAHF = 1 << 13,
2656       PTA_SSE4_1 = 1 << 14,
2657       PTA_SSE4_2 = 1 << 15,
2658       PTA_AES = 1 << 16,
2659       PTA_PCLMUL = 1 << 17,
2660       PTA_AVX = 1 << 18,
2661       PTA_FMA = 1 << 19,
2662       PTA_MOVBE = 1 << 20,
2663       PTA_FMA4 = 1 << 21,
2664       PTA_XOP = 1 << 22,
2665       PTA_LWP = 1 << 23
2666     };
2667 
2668   static struct pta
2669     {
2670       const char *const name;		/* processor name or nickname.  */
2671       const enum processor_type processor;
2672       const enum attr_cpu schedule;
2673       const unsigned /*enum pta_flags*/ flags;
2674     }
2675   const processor_alias_table[] =
2676     {
2677       {"i386", PROCESSOR_I386, CPU_NONE, 0},
2678       {"i486", PROCESSOR_I486, CPU_NONE, 0},
2679       {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2680       {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2681       {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2682       {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2683       {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2684       {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2685       {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2686       {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2687       {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2688       {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2689       {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2690 	PTA_MMX | PTA_SSE},
2691       {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2692 	PTA_MMX | PTA_SSE},
2693       {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2694 	PTA_MMX | PTA_SSE | PTA_SSE2},
2695       {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2696 	PTA_MMX | PTA_SSE | PTA_SSE2},
2697       {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2698 	PTA_MMX | PTA_SSE | PTA_SSE2},
2699       {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2700 	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2701       {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2702 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2703 	| PTA_CX16 | PTA_NO_SAHF},
2704       {"core2", PROCESSOR_CORE2, CPU_CORE2,
2705 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2706 	| PTA_SSSE3 | PTA_CX16},
2707       {"atom", PROCESSOR_ATOM, CPU_ATOM,
2708 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2709 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2710       {"geode", PROCESSOR_GEODE, CPU_GEODE,
2711 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2712       {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2713       {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2714       {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2715       {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2716 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2717       {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2718 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2719       {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2720 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2721       {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2722 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2723       {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2724 	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2725       {"x86-64", PROCESSOR_K8, CPU_K8,
2726 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2727       {"k8", PROCESSOR_K8, CPU_K8,
2728 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2729 	| PTA_SSE2 | PTA_NO_SAHF},
2730       {"k8-sse3", PROCESSOR_K8, CPU_K8,
2731 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2732 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2733       {"opteron", PROCESSOR_K8, CPU_K8,
2734 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2735 	| PTA_SSE2 | PTA_NO_SAHF},
2736       {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2737         PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2738 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2739       {"athlon64", PROCESSOR_K8, CPU_K8,
2740 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2741 	| PTA_SSE2 | PTA_NO_SAHF},
2742       {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2743 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2744 	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2745       {"athlon-fx", PROCESSOR_K8, CPU_K8,
2746 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2747 	| PTA_SSE2 | PTA_NO_SAHF},
2748       {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2749 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2750 	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2751       {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2752 	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2753 	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2754       {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
2755 	0 /* flags are only used for -march switch.  */ },
2756       {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
2757 	PTA_64BIT /* flags are only used for -march switch.  */ },
2758     };
2759 
2760   int const pta_size = ARRAY_SIZE (processor_alias_table);
2761 
2762   /* Set up prefix/suffix so the error messages refer to either the command
2763      line argument, or the attribute(target).  */
2764   if (main_args_p)
2765     {
2766       prefix = "-m";
2767       suffix = "";
2768       sw = "switch";
2769     }
2770   else
2771     {
2772       prefix = "option(\"";
2773       suffix = "\")";
2774       sw = "attribute";
2775     }
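
  /* For illustration of how PREFIX/SUFFIX/SW are used below (with a
     hypothetical bad value "foo"): a diagnostic such as

       error ("bad value (%s) for %stune=%s %s", "foo", prefix, suffix, sw);

     reads "bad value (foo) for -mtune= switch" for command-line options,
     and "bad value (foo) for option(\"tune=\") attribute" for the
     attribute(target) case.  */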
2776 
2777 #ifdef SUBTARGET_OVERRIDE_OPTIONS
2778   SUBTARGET_OVERRIDE_OPTIONS;
2779 #endif
2780 
2781 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
2782   SUBSUBTARGET_OVERRIDE_OPTIONS;
2783 #endif
2784 
2785   /* -fPIC is the default for x86_64.  */
2786   if (TARGET_MACHO && TARGET_64BIT)
2787     flag_pic = 2;
2788 
2789   /* Set the default values for switches whose default depends on TARGET_64BIT
2790      in case they weren't overwritten by command line options.  */
2791   if (TARGET_64BIT)
2792     {
2793       /* Mach-O doesn't support omitting the frame pointer for now.  */
2794       if (flag_omit_frame_pointer == 2)
2795 	flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
2796       if (flag_asynchronous_unwind_tables == 2)
2797 	flag_asynchronous_unwind_tables = 1;
2798       if (flag_pcc_struct_return == 2)
2799 	flag_pcc_struct_return = 0;
2800     }
2801   else
2802     {
2803       if (flag_omit_frame_pointer == 2)
2804 	flag_omit_frame_pointer = 0;
2805       if (flag_asynchronous_unwind_tables == 2)
2806 	flag_asynchronous_unwind_tables = 0;
2807       if (flag_pcc_struct_return == 2)
2808 	flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
2809     }
2810 
2811   /* Need to check -mtune=generic first.  */
2812   if (ix86_tune_string)
2813     {
2814       if (!strcmp (ix86_tune_string, "generic")
2815 	  || !strcmp (ix86_tune_string, "i686")
2816 	  /* As special support for cross compilers we read -mtune=native
2817 	     as -mtune=generic.  With native compilers we won't see the
2818 	     -mtune=native, as it was changed by the driver.  */
2819 	  || !strcmp (ix86_tune_string, "native"))
2820 	{
2821 	  if (TARGET_64BIT)
2822 	    ix86_tune_string = "generic64";
2823 	  else
2824 	    ix86_tune_string = "generic32";
2825 	}
2826       /* If this call is for setting the option attribute, allow the
2827 	 generic32/generic64 that was previously set.  */
2828       else if (!main_args_p
2829 	       && (!strcmp (ix86_tune_string, "generic32")
2830 		   || !strcmp (ix86_tune_string, "generic64")))
2831 	;
2832       else if (!strncmp (ix86_tune_string, "generic", 7))
2833         error ("bad value (%s) for %stune=%s %s",
2834 	       ix86_tune_string, prefix, suffix, sw);
2835       else if (!strcmp (ix86_tune_string, "x86-64"))
2836         warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated.  Use "
2837                  "%stune=k8%s or %stune=generic%s instead as appropriate.",
2838                  prefix, suffix, prefix, suffix, prefix, suffix);
2839     }
2840   else
2841     {
2842       if (ix86_arch_string)
2843 	ix86_tune_string = ix86_arch_string;
2844       if (!ix86_tune_string)
2845 	{
2846 	  ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
2847 	  ix86_tune_defaulted = 1;
2848 	}
2849 
2850       /* ix86_tune_string is set to ix86_arch_string or defaulted.  We
2851 	 need to use a sensible tune option.  */
2852       if (!strcmp (ix86_tune_string, "generic")
2853 	  || !strcmp (ix86_tune_string, "x86-64")
2854 	  || !strcmp (ix86_tune_string, "i686"))
2855 	{
2856 	  if (TARGET_64BIT)
2857 	    ix86_tune_string = "generic64";
2858 	  else
2859 	    ix86_tune_string = "generic32";
2860 	}
2861     }
2862 
2863   if (ix86_stringop_string)
2864     {
2865       if (!strcmp (ix86_stringop_string, "rep_byte"))
2866 	stringop_alg = rep_prefix_1_byte;
2867       else if (!strcmp (ix86_stringop_string, "libcall"))
2868 	stringop_alg = libcall;
2869       else if (!strcmp (ix86_stringop_string, "rep_4byte"))
2870 	stringop_alg = rep_prefix_4_byte;
2871       else if (!strcmp (ix86_stringop_string, "rep_8byte")
2872 	       && TARGET_64BIT)
2873 	/* rep; movq isn't available in 32-bit code.  */
2874 	stringop_alg = rep_prefix_8_byte;
2875       else if (!strcmp (ix86_stringop_string, "byte_loop"))
2876 	stringop_alg = loop_1_byte;
2877       else if (!strcmp (ix86_stringop_string, "loop"))
2878 	stringop_alg = loop;
2879       else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
2880 	stringop_alg = unrolled_loop;
2881       else
2882 	error ("bad value (%s) for %sstringop-strategy=%s %s",
2883 	       ix86_stringop_string, prefix, suffix, sw);
2884     }
2885 
2886   if (!ix86_arch_string)
2887     ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
2888   else
2889     ix86_arch_specified = 1;
2890 
2891   /* Validate -mabi= value.  */
2892   if (ix86_abi_string)
2893     {
2894       if (strcmp (ix86_abi_string, "sysv") == 0)
2895 	ix86_abi = SYSV_ABI;
2896       else if (strcmp (ix86_abi_string, "ms") == 0)
2897 	ix86_abi = MS_ABI;
2898       else
2899 	error ("unknown ABI (%s) for %sabi=%s %s",
2900 	       ix86_abi_string, prefix, suffix, sw);
2901     }
2902   else
2903     ix86_abi = DEFAULT_ABI;
2904 
2905   if (ix86_cmodel_string != 0)
2906     {
2907       if (!strcmp (ix86_cmodel_string, "small"))
2908 	ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2909       else if (!strcmp (ix86_cmodel_string, "medium"))
2910 	ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
2911       else if (!strcmp (ix86_cmodel_string, "large"))
2912 	ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE;
2913       else if (flag_pic)
2914 	error ("code model %s does not support PIC mode", ix86_cmodel_string);
2915       else if (!strcmp (ix86_cmodel_string, "32"))
2916 	ix86_cmodel = CM_32;
2917       else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
2918 	ix86_cmodel = CM_KERNEL;
2919       else
2920 	error ("bad value (%s) for %scmodel=%s %s",
2921 	       ix86_cmodel_string, prefix, suffix, sw);
2922     }
2923   else
2924     {
2925       /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
2926 	 use of rip-relative addressing.  This eliminates fixups that
2927 	 would otherwise be needed if this object is to be placed in a
2928 	 DLL, and is essentially just as efficient as direct addressing.  */
2929       if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
2930 	ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
2931       else if (TARGET_64BIT)
2932 	ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2933       else
2934         ix86_cmodel = CM_32;
2935     }
2936   if (ix86_asm_string != 0)
2937     {
2938       if (! TARGET_MACHO
2939 	  && !strcmp (ix86_asm_string, "intel"))
2940 	ix86_asm_dialect = ASM_INTEL;
2941       else if (!strcmp (ix86_asm_string, "att"))
2942 	ix86_asm_dialect = ASM_ATT;
2943       else
2944 	error ("bad value (%s) for %sasm=%s %s",
2945 	       ix86_asm_string, prefix, suffix, sw);
2946     }
2947   if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
2948     error ("code model %qs not supported in the %s bit mode",
2949 	   ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
2950   if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
2951     sorry ("%i-bit mode not compiled in",
2952 	   (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
2953 
2954   for (i = 0; i < pta_size; i++)
2955     if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
2956       {
2957 	ix86_schedule = processor_alias_table[i].schedule;
2958 	ix86_arch = processor_alias_table[i].processor;
2959 	/* Default cpu tuning to the architecture.  */
2960 	ix86_tune = ix86_arch;
2961 
2962 	if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2963 	  error ("CPU you selected does not support x86-64 "
2964 		 "instruction set");
2965 
2966 	if (processor_alias_table[i].flags & PTA_MMX
2967 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
2968 	  ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2969 	if (processor_alias_table[i].flags & PTA_3DNOW
2970 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
2971 	  ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
2972 	if (processor_alias_table[i].flags & PTA_3DNOW_A
2973 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
2974 	  ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
2975 	if (processor_alias_table[i].flags & PTA_SSE
2976 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
2977 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2978 	if (processor_alias_table[i].flags & PTA_SSE2
2979 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
2980 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2981 	if (processor_alias_table[i].flags & PTA_SSE3
2982 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
2983 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2984 	if (processor_alias_table[i].flags & PTA_SSSE3
2985 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
2986 	  ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2987 	if (processor_alias_table[i].flags & PTA_SSE4_1
2988 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
2989 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2990 	if (processor_alias_table[i].flags & PTA_SSE4_2
2991 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
2992 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
2993 	if (processor_alias_table[i].flags & PTA_AVX
2994 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
2995 	  ix86_isa_flags |= OPTION_MASK_ISA_AVX;
2996 	if (processor_alias_table[i].flags & PTA_FMA
2997 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
2998 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA;
2999 	if (processor_alias_table[i].flags & PTA_SSE4A
3000 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3001 	  ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3002 	if (processor_alias_table[i].flags & PTA_FMA4
3003 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3004 	  ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3005 	if (processor_alias_table[i].flags & PTA_XOP
3006 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3007 	  ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3008 	if (processor_alias_table[i].flags & PTA_LWP
3009 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3010 	  ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3011 	if (processor_alias_table[i].flags & PTA_ABM
3012 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3013 	  ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3014 	if (processor_alias_table[i].flags & PTA_CX16
3015 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3016 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3017 	if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3018 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3019 	  ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3020 	if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3021 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3022 	  ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3023 	if (processor_alias_table[i].flags & PTA_MOVBE
3024 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3025 	  ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3026 	if (processor_alias_table[i].flags & PTA_AES
3027 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3028 	  ix86_isa_flags |= OPTION_MASK_ISA_AES;
3029 	if (processor_alias_table[i].flags & PTA_PCLMUL
3030 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3031 	  ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3032 	if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3033 	  x86_prefetch_sse = true;
3034 
3035 	break;
3036       }
3037 
3038   if (!strcmp (ix86_arch_string, "generic"))
3039     error ("generic CPU can be used only for %stune=%s %s",
3040 	   prefix, suffix, sw);
3041   else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3042     error ("bad value (%s) for %sarch=%s %s",
3043 	   ix86_arch_string, prefix, suffix, sw);
3044 
3045   ix86_arch_mask = 1u << ix86_arch;
3046   for (i = 0; i < X86_ARCH_LAST; ++i)
3047     ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3048 
3049   for (i = 0; i < pta_size; i++)
3050     if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3051       {
3052 	ix86_schedule = processor_alias_table[i].schedule;
3053 	ix86_tune = processor_alias_table[i].processor;
3054 	if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3055 	  {
3056 	    if (ix86_tune_defaulted)
3057 	      {
3058 		ix86_tune_string = "x86-64";
3059 		for (i = 0; i < pta_size; i++)
3060 		  if (! strcmp (ix86_tune_string,
3061 				processor_alias_table[i].name))
3062 		    break;
3063 		ix86_schedule = processor_alias_table[i].schedule;
3064 		ix86_tune = processor_alias_table[i].processor;
3065 	      }
3066 	    else
3067 	      error ("CPU you selected does not support x86-64 "
3068 		     "instruction set");
3069 	  }
3070         /* Intel CPUs have always interpreted SSE prefetch instructions as
3071 	   NOPs; so, we can enable SSE prefetch instructions even when
3072 	   -mtune (rather than -march) points us to a processor that has them.
3073 	   However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3074 	   higher processors.  */
3075 	if (TARGET_CMOV
3076 	    && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3077 	  x86_prefetch_sse = true;
3078 	break;
3079       }
3080 
3081   if (ix86_tune_specified && i == pta_size)
3082     error ("bad value (%s) for %stune=%s %s",
3083 	   ix86_tune_string, prefix, suffix, sw);
3084 
3085   ix86_tune_mask = 1u << ix86_tune;
3086   for (i = 0; i < X86_TUNE_LAST; ++i)
3087     ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3088 
3089   if (optimize_size)
3090     ix86_cost = &ix86_size_cost;
3091   else
3092     ix86_cost = processor_target_table[ix86_tune].cost;
3093 
3094   /* Arrange to set up i386_stack_locals for all functions.  */
3095   init_machine_status = ix86_init_machine_status;
3096 
3097   /* Validate -mregparm= value.  */
3098   if (ix86_regparm_string)
3099     {
3100       if (TARGET_64BIT)
3101 	warning (0, "%sregparm%s is ignored in 64-bit mode", prefix, suffix);
3102       i = atoi (ix86_regparm_string);
3103       if (i < 0 || i > REGPARM_MAX)
3104 	error ("%sregparm=%d%s is not between 0 and %d",
3105 	       prefix, i, suffix, REGPARM_MAX);
3106       else
3107 	ix86_regparm = i;
3108     }
3109   if (TARGET_64BIT)
3110     ix86_regparm = REGPARM_MAX;
3111 
3112   /* If the user has provided any of the -malign-* options,
3113      warn and use that value only if -falign-* is not set.
3114      Remove this code in GCC 3.2 or later.  */
3115   if (ix86_align_loops_string)
3116     {
3117       warning (0, "%salign-loops%s is obsolete, use -falign-loops%s",
3118 	       prefix, suffix, suffix);
3119       if (align_loops == 0)
3120 	{
3121 	  i = atoi (ix86_align_loops_string);
3122 	  if (i < 0 || i > MAX_CODE_ALIGN)
3123 	    error ("%salign-loops=%d%s is not between 0 and %d",
3124 		   prefix, i, suffix, MAX_CODE_ALIGN);
3125 	  else
3126 	    align_loops = 1 << i;
3127 	}
3128     }
3129 
3130   if (ix86_align_jumps_string)
3131     {
3132       warning (0, "%salign-jumps%s is obsolete, use -falign-jumps%s",
3133 	       prefix, suffix, suffix);
3134       if (align_jumps == 0)
3135 	{
3136 	  i = atoi (ix86_align_jumps_string);
3137 	  if (i < 0 || i > MAX_CODE_ALIGN)
3138 	    error ("%salign-jumps=%d%s is not between 0 and %d",
3139 		   prefix, i, suffix, MAX_CODE_ALIGN);
3140 	  else
3141 	    align_jumps = 1 << i;
3142 	}
3143     }
3144 
3145   if (ix86_align_funcs_string)
3146     {
3147       warning (0, "%salign-functions%s is obsolete, use -falign-functions%s",
3148 	       prefix, suffix, suffix);
3149       if (align_functions == 0)
3150 	{
3151 	  i = atoi (ix86_align_funcs_string);
3152 	  if (i < 0 || i > MAX_CODE_ALIGN)
3153 	    error ("%salign-functions=%d%s is not between 0 and %d",
3154 		   prefix, i, suffix, MAX_CODE_ALIGN);
3155 	  else
3156 	    align_functions = 1 << i;
3157 	}
3158     }
3159 
3160   /* Default align_* from the processor table.  */
3161   if (align_loops == 0)
3162     {
3163       align_loops = processor_target_table[ix86_tune].align_loop;
3164       align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3165     }
3166   if (align_jumps == 0)
3167     {
3168       align_jumps = processor_target_table[ix86_tune].align_jump;
3169       align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3170     }
3171   if (align_functions == 0)
3172     {
3173       align_functions = processor_target_table[ix86_tune].align_func;
3174     }
3175 
3176   /* Validate -mbranch-cost= value, or provide default.  */
3177   ix86_branch_cost = ix86_cost->branch_cost;
3178   if (ix86_branch_cost_string)
3179     {
3180       i = atoi (ix86_branch_cost_string);
3181       if (i < 0 || i > 5)
3182 	error ("%sbranch-cost=%d%s is not between 0 and 5", prefix, i, suffix);
3183       else
3184 	ix86_branch_cost = i;
3185     }
3186   if (ix86_section_threshold_string)
3187     {
3188       i = atoi (ix86_section_threshold_string);
3189       if (i < 0)
3190 	error ("%slarge-data-threshold=%d%s is negative", prefix, i, suffix);
3191       else
3192 	ix86_section_threshold = i;
3193     }
3194 
3195   if (ix86_tls_dialect_string)
3196     {
3197       if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
3198 	ix86_tls_dialect = TLS_DIALECT_GNU;
3199       else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
3200 	ix86_tls_dialect = TLS_DIALECT_GNU2;
3201       else
3202 	error ("bad value (%s) for %stls-dialect=%s %s",
3203 	       ix86_tls_dialect_string, prefix, suffix, sw);
3204     }
3205 
3206   if (ix87_precision_string)
3207     {
3208       i = atoi (ix87_precision_string);
3209       if (i != 32 && i != 64 && i != 80)
3210 	error ("pc%d is not a valid precision setting (32, 64 or 80)", i);
3211     }
3212 
3213   if (TARGET_64BIT)
3214     {
3215       target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3216 
3217       /* Enable by default the SSE and MMX builtins.  Do allow the user to
3218 	 explicitly disable any of these.  In particular, disabling SSE and
3219 	 MMX for kernel code is extremely useful.  */
3220       if (!ix86_arch_specified)
3221 	ix86_isa_flags
3222 	  |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3223 	       | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3224 
3225       if (TARGET_RTD)
3226 	warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3227     }
3228   else
3229     {
3230       target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3231 
3232       if (!ix86_arch_specified)
3233 	ix86_isa_flags
3234 	  |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3235 
3236       /* The i386 ABI does not specify a red zone.  It still makes sense to
3237          use it when the programmer takes care not to destroy the stack.  */
3238       if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3239         target_flags |= MASK_NO_RED_ZONE;
3240     }
3241 
3242   /* Keep nonleaf frame pointers.  */
3243   if (flag_omit_frame_pointer)
3244     target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3245   else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3246     flag_omit_frame_pointer = 1;
3247 
3248   /* If we're doing fast math, we don't care about comparison order
3249      wrt NaNs.  This lets us use a shorter comparison sequence.  */
3250   if (flag_finite_math_only)
3251     target_flags &= ~MASK_IEEE_FP;
3252 
3253   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3254      since the insns won't need emulation.  */
3255   if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3256     target_flags &= ~MASK_NO_FANCY_MATH_387;
3257 
3258   /* Likewise, if the target doesn't have a 387, or we've specified
3259      software floating point, don't use 387 inline intrinsics.  */
3260   if (!TARGET_80387)
3261     target_flags |= MASK_NO_FANCY_MATH_387;
3262 
3263   /* Turn on MMX builtins for -msse.  */
3264   if (TARGET_SSE)
3265     {
3266       ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3267       x86_prefetch_sse = true;
3268     }
3269 
3270   /* Turn on popcnt instruction for -msse4.2 or -mabm.  */
3271   if (TARGET_SSE4_2 || TARGET_ABM)
3272     ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3273 
3274   /* Validate -mpreferred-stack-boundary= value or default it to
3275      PREFERRED_STACK_BOUNDARY_DEFAULT.  */
3276   ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3277   if (ix86_preferred_stack_boundary_string)
3278     {
3279       i = atoi (ix86_preferred_stack_boundary_string);
3280       if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
3281 	error ("%spreferred-stack-boundary=%d%s is not between %d and 12",
3282 	       prefix, i, suffix, TARGET_64BIT ? 4 : 2);
3283       else
3284 	ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
3285     }
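
  /* Worked example: -mpreferred-stack-boundary=4 gives i == 4, so the
     boundary becomes (1 << 4) * BITS_PER_UNIT = 16 * 8 = 128 bits, i.e.
     16-byte stack alignment (the smallest value accepted in 64-bit mode).  */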
3286 
3287   /* Set the default value for -mstackrealign.  */
3288   if (ix86_force_align_arg_pointer == -1)
3289     ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3290 
3291   ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3292 
3293   /* Validate -mincoming-stack-boundary= value or default it to
3294      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
3295   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3296   if (ix86_incoming_stack_boundary_string)
3297     {
3298       i = atoi (ix86_incoming_stack_boundary_string);
3299       if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
3300 	error ("-mincoming-stack-boundary=%d is not between %d and 12",
3301 	       i, TARGET_64BIT ? 4 : 2);
3302       else
3303 	{
3304 	  ix86_user_incoming_stack_boundary = (1 << i) * BITS_PER_UNIT;
3305 	  ix86_incoming_stack_boundary
3306 	    = ix86_user_incoming_stack_boundary;
3307 	}
3308     }
3309 
3310   /* Accept -msseregparm only if at least SSE support is enabled.  */
3311   if (TARGET_SSEREGPARM
3312       && ! TARGET_SSE)
3313     error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3314 
3315   ix86_fpmath = TARGET_FPMATH_DEFAULT;
3316   if (ix86_fpmath_string != 0)
3317     {
3318       if (! strcmp (ix86_fpmath_string, "387"))
3319 	ix86_fpmath = FPMATH_387;
3320       else if (! strcmp (ix86_fpmath_string, "sse"))
3321 	{
3322 	  if (!TARGET_SSE)
3323 	    {
3324 	      warning (0, "SSE instruction set disabled, using 387 arithmetic");
3325 	      ix86_fpmath = FPMATH_387;
3326 	    }
3327 	  else
3328 	    ix86_fpmath = FPMATH_SSE;
3329 	}
3330       else if (! strcmp (ix86_fpmath_string, "387,sse")
3331 	       || ! strcmp (ix86_fpmath_string, "387+sse")
3332 	       || ! strcmp (ix86_fpmath_string, "sse,387")
3333 	       || ! strcmp (ix86_fpmath_string, "sse+387")
3334 	       || ! strcmp (ix86_fpmath_string, "both"))
3335 	{
3336 	  if (!TARGET_SSE)
3337 	    {
3338 	      warning (0, "SSE instruction set disabled, using 387 arithmetic");
3339 	      ix86_fpmath = FPMATH_387;
3340 	    }
3341 	  else if (!TARGET_80387)
3342 	    {
3343 	      warning (0, "387 instruction set disabled, using SSE arithmetic");
3344 	      ix86_fpmath = FPMATH_SSE;
3345 	    }
3346 	  else
3347 	    ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
3348 	}
3349       else
3350 	error ("bad value (%s) for %sfpmath=%s %s",
3351 	       ix86_fpmath_string, prefix, suffix, sw);
3352     }
3353 
3354   /* If the i387 is disabled, then do not return values in it. */
3355   if (!TARGET_80387)
3356     target_flags &= ~MASK_FLOAT_RETURNS;
3357 
3358   /* Use external vectorized library in vectorizing intrinsics.  */
3359   if (ix86_veclibabi_string)
3360     {
3361       if (strcmp (ix86_veclibabi_string, "svml") == 0)
3362 	ix86_veclib_handler = ix86_veclibabi_svml;
3363       else if (strcmp (ix86_veclibabi_string, "acml") == 0)
3364 	ix86_veclib_handler = ix86_veclibabi_acml;
3365       else
3366 	error ("unknown vectorization library ABI type (%s) for "
3367 	       "%sveclibabi=%s %s", ix86_veclibabi_string,
3368 	       prefix, suffix, sw);
3369     }
3370 
3371   if ((x86_accumulate_outgoing_args & ix86_tune_mask)
3372       && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3373       && !optimize_size)
3374     target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3375 
3376   /* ??? Unwind info is not correct around the CFG unless either a frame
3377      pointer is present or M_A_O_A is set.  Fixing this requires rewriting
3378      unwind info generation to be aware of the CFG and propagating states
3379      around edges.  */
3380   if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3381        || flag_exceptions || flag_non_call_exceptions)
3382       && flag_omit_frame_pointer
3383       && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3384     {
3385       if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3386 	warning (0, "unwind tables currently require either a frame pointer "
3387 		 "or %saccumulate-outgoing-args%s for correctness",
3388 		 prefix, suffix);
3389       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3390     }
3391 
3392   /* If stack probes are required, the space used for large function
3393      arguments on the stack must also be probed, so enable
3394      -maccumulate-outgoing-args so this happens in the prologue.  */
3395   if (TARGET_STACK_PROBE
3396       && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3397     {
3398       if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3399 	warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3400 		 "for correctness", prefix, suffix);
3401       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3402     }
3403 
3404   /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
3405   {
3406     char *p;
3407     ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3408     p = strchr (internal_label_prefix, 'X');
3409     internal_label_prefix_len = p - internal_label_prefix;
3410     *p = '\0';
3411   }
3412 
3413   /* When no scheduling description is available, disable the scheduler pass
3414      so it won't slow down compilation or make x87 code slower.  */
3415   if (!TARGET_SCHEDULE)
3416     flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3417 
3418   if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
3419     set_param_value ("simultaneous-prefetches",
3420 		     ix86_cost->simultaneous_prefetches);
3421   if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
3422     set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
3423   if (!PARAM_SET_P (PARAM_L1_CACHE_SIZE))
3424     set_param_value ("l1-cache-size", ix86_cost->l1_cache_size);
3425   if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE))
3426     set_param_value ("l2-cache-size", ix86_cost->l2_cache_size);
3427 
3428   /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3429      can be optimized to ap = __builtin_next_arg (0).  */
3430   if (!TARGET_64BIT)
3431     targetm.expand_builtin_va_start = NULL;
3432 
3433   if (TARGET_64BIT)
3434     {
3435       ix86_gen_leave = gen_leave_rex64;
3436       ix86_gen_pop1 = gen_popdi1;
3437       ix86_gen_add3 = gen_adddi3;
3438       ix86_gen_sub3 = gen_subdi3;
3439       ix86_gen_sub3_carry = gen_subdi3_carry;
3440       ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3441       ix86_gen_monitor = gen_sse3_monitor64;
3442       ix86_gen_andsp = gen_anddi3;
3443     }
3444   else
3445     {
3446       ix86_gen_leave = gen_leave;
3447       ix86_gen_pop1 = gen_popsi1;
3448       ix86_gen_add3 = gen_addsi3;
3449       ix86_gen_sub3 = gen_subsi3;
3450       ix86_gen_sub3_carry = gen_subsi3_carry;
3451       ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3452       ix86_gen_monitor = gen_sse3_monitor;
3453       ix86_gen_andsp = gen_andsi3;
3454     }
3455 
3456 #ifdef USE_IX86_CLD
3457   /* Use -mcld by default for 32-bit code if configured with --enable-cld.  */
3458   if (!TARGET_64BIT)
3459     target_flags |= MASK_CLD & ~target_flags_explicit;
3460 #endif
3461 
3462   /* Save the initial options in case the user uses function-specific options.  */
3463   if (main_args_p)
3464     target_option_default_node = target_option_current_node
3465       = build_target_option_node ();
3466 }
3467 
3468 /* Update register usage after having seen the compiler flags.  */
3469 
3470 void
3471 ix86_conditional_register_usage (void)
3472 {
3473   int i;
3474   unsigned int j;
3475 
3476   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3477     {
3478       if (fixed_regs[i] > 1)
3479 	fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3480       if (call_used_regs[i] > 1)
3481 	call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3482     }
3483 
3484   /* The PIC register, if it exists, is fixed.  */
3485   j = PIC_OFFSET_TABLE_REGNUM;
3486   if (j != INVALID_REGNUM)
3487     fixed_regs[j] = call_used_regs[j] = 1;
3488 
3489   /* The MS_ABI changes the set of call-used registers.  */
3490   if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
3491     {
3492       call_used_regs[SI_REG] = 0;
3493       call_used_regs[DI_REG] = 0;
3494       call_used_regs[XMM6_REG] = 0;
3495       call_used_regs[XMM7_REG] = 0;
3496       for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3497 	call_used_regs[i] = 0;
3498     }
3499 
3500   /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3501      other call-clobbered regs for 64-bit.  */
3502   if (TARGET_64BIT)
3503     {
3504       CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3505 
3506       for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3507 	if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3508 	    && call_used_regs[i])
3509 	  SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3510     }
3511 
3512   /* If MMX is disabled, squash the registers.  */
3513   if (! TARGET_MMX)
3514     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3515       if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3516 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3517 
3518   /* If SSE is disabled, squash the registers.  */
3519   if (! TARGET_SSE)
3520     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3521       if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3522 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3523 
3524   /* If the FPU is disabled, squash the registers.  */
3525   if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3526     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3527       if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3528 	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3529 
3530   /* If 32-bit, squash the 64-bit registers.  */
3531   if (! TARGET_64BIT)
3532     {
3533       for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3534 	reg_names[i] = "";
3535       for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3536 	reg_names[i] = "";
3537     }
3538 }
3539 
3540 
3541 /* Save the current options */
3542 
3543 static void
3544 ix86_function_specific_save (struct cl_target_option *ptr)
3545 {
3546   ptr->arch = ix86_arch;
3547   ptr->schedule = ix86_schedule;
3548   ptr->tune = ix86_tune;
3549   ptr->fpmath = ix86_fpmath;
3550   ptr->branch_cost = ix86_branch_cost;
3551   ptr->tune_defaulted = ix86_tune_defaulted;
3552   ptr->arch_specified = ix86_arch_specified;
3553   ptr->ix86_isa_flags_explicit = ix86_isa_flags_explicit;
3554   ptr->target_flags_explicit = target_flags_explicit;
3555 
3556   /* The fields are char but the variables are not; make sure the
3557      values fit in the fields.  */
3558   gcc_assert (ptr->arch == ix86_arch);
3559   gcc_assert (ptr->schedule == ix86_schedule);
3560   gcc_assert (ptr->tune == ix86_tune);
3561   gcc_assert (ptr->fpmath == ix86_fpmath);
3562   gcc_assert (ptr->branch_cost == ix86_branch_cost);
3563 }
3564 
3565 /* Restore the current options */
3566 
3567 static void
3568 ix86_function_specific_restore (struct cl_target_option *ptr)
3569 {
3570   enum processor_type old_tune = ix86_tune;
3571   enum processor_type old_arch = ix86_arch;
3572   unsigned int ix86_arch_mask, ix86_tune_mask;
3573   int i;
3574 
3575   ix86_arch = (enum processor_type) ptr->arch;
3576   ix86_schedule = (enum attr_cpu) ptr->schedule;
3577   ix86_tune = (enum processor_type) ptr->tune;
3578   ix86_fpmath = (enum fpmath_unit) ptr->fpmath;
3579   ix86_branch_cost = ptr->branch_cost;
3580   ix86_tune_defaulted = ptr->tune_defaulted;
3581   ix86_arch_specified = ptr->arch_specified;
3582   ix86_isa_flags_explicit = ptr->ix86_isa_flags_explicit;
3583   target_flags_explicit = ptr->target_flags_explicit;
3584 
3585   /* Recreate the arch feature tests if the arch changed */
3586   if (old_arch != ix86_arch)
3587     {
3588       ix86_arch_mask = 1u << ix86_arch;
3589       for (i = 0; i < X86_ARCH_LAST; ++i)
3590 	ix86_arch_features[i]
3591 	  = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3592     }
3593 
3594   /* Recreate the tune optimization tests */
3595   if (old_tune != ix86_tune)
3596     {
3597       ix86_tune_mask = 1u << ix86_tune;
3598       for (i = 0; i < X86_TUNE_LAST; ++i)
3599 	ix86_tune_features[i]
3600 	  = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3601     }
3602 }
3603 
3604 /* Print the current options */
3605 
3606 static void
3607 ix86_function_specific_print (FILE *file, int indent,
3608 			      struct cl_target_option *ptr)
3609 {
3610   char *target_string
3611     = ix86_target_string (ptr->ix86_isa_flags, ptr->target_flags,
3612 			  NULL, NULL, NULL, false);
3613 
3614   fprintf (file, "%*sarch = %d (%s)\n",
3615 	   indent, "",
3616 	   ptr->arch,
3617 	   ((ptr->arch < TARGET_CPU_DEFAULT_max)
3618 	    ? cpu_names[ptr->arch]
3619 	    : "<unknown>"));
3620 
3621   fprintf (file, "%*stune = %d (%s)\n",
3622 	   indent, "",
3623 	   ptr->tune,
3624 	   ((ptr->tune < TARGET_CPU_DEFAULT_max)
3625 	    ? cpu_names[ptr->tune]
3626 	    : "<unknown>"));
3627 
3628   fprintf (file, "%*sfpmath = %d%s%s\n", indent, "", ptr->fpmath,
3629 	   (ptr->fpmath & FPMATH_387) ? ", 387" : "",
3630 	   (ptr->fpmath & FPMATH_SSE) ? ", sse" : "");
3631   fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
3632 
3633   if (target_string)
3634     {
3635       fprintf (file, "%*s%s\n", indent, "", target_string);
3636       free (target_string);
3637     }
3638 }
3639 
3640 
3641 /* Inner function to process the attribute((target(...))): take an argument and
3642    set the current options from it.  If we have a list, recursively go
3643    over the list.  */
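
/* An illustrative (hypothetical) attribute string and how the code below
   handles it:

     __attribute__((target ("sse4.2,no-fancy-math-387,arch=core2")))

   The string is split at the commas; "sse4.2" is an isa option passed to
   ix86_handle_option, the "no-" prefix flips the sense of the
   "fancy-math-387" flag option, and "arch=core2" is a string option whose
   value is saved in p_strings[] for override_options to process later.  */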
3644 
3645 static bool
3646 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[])
3647 {
3648   char *next_optstr;
3649   bool ret = true;
3650 
3651 #define IX86_ATTR_ISA(S,O)   { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
3652 #define IX86_ATTR_STR(S,O)   { S, sizeof (S)-1, ix86_opt_str, O, 0 }
3653 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
3654 #define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
3655 
3656   enum ix86_opt_type
3657   {
3658     ix86_opt_unknown,
3659     ix86_opt_yes,
3660     ix86_opt_no,
3661     ix86_opt_str,
3662     ix86_opt_isa
3663   };
3664 
3665   static const struct
3666   {
3667     const char *string;
3668     size_t len;
3669     enum ix86_opt_type type;
3670     int opt;
3671     int mask;
3672   } attrs[] = {
3673     /* isa options */
3674     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
3675     IX86_ATTR_ISA ("abm",	OPT_mabm),
3676     IX86_ATTR_ISA ("aes",	OPT_maes),
3677     IX86_ATTR_ISA ("avx",	OPT_mavx),
3678     IX86_ATTR_ISA ("mmx",	OPT_mmmx),
3679     IX86_ATTR_ISA ("pclmul",	OPT_mpclmul),
3680     IX86_ATTR_ISA ("popcnt",	OPT_mpopcnt),
3681     IX86_ATTR_ISA ("sse",	OPT_msse),
3682     IX86_ATTR_ISA ("sse2",	OPT_msse2),
3683     IX86_ATTR_ISA ("sse3",	OPT_msse3),
3684     IX86_ATTR_ISA ("sse4",	OPT_msse4),
3685     IX86_ATTR_ISA ("sse4.1",	OPT_msse4_1),
3686     IX86_ATTR_ISA ("sse4.2",	OPT_msse4_2),
3687     IX86_ATTR_ISA ("sse4a",	OPT_msse4a),
3688     IX86_ATTR_ISA ("ssse3",	OPT_mssse3),
3689     IX86_ATTR_ISA ("fma4",	OPT_mfma4),
3690     IX86_ATTR_ISA ("xop",	OPT_mxop),
3691     IX86_ATTR_ISA ("lwp",	OPT_mlwp),
3692 
3693     /* string options */
3694     IX86_ATTR_STR ("arch=",	IX86_FUNCTION_SPECIFIC_ARCH),
3695     IX86_ATTR_STR ("fpmath=",	IX86_FUNCTION_SPECIFIC_FPMATH),
3696     IX86_ATTR_STR ("tune=",	IX86_FUNCTION_SPECIFIC_TUNE),
3697 
3698     /* flag options */
3699     IX86_ATTR_YES ("cld",
3700 		   OPT_mcld,
3701 		   MASK_CLD),
3702 
3703     IX86_ATTR_NO ("fancy-math-387",
3704 		  OPT_mfancy_math_387,
3705 		  MASK_NO_FANCY_MATH_387),
3706 
3707     IX86_ATTR_YES ("ieee-fp",
3708 		   OPT_mieee_fp,
3709 		   MASK_IEEE_FP),
3710 
3711     IX86_ATTR_YES ("inline-all-stringops",
3712 		   OPT_minline_all_stringops,
3713 		   MASK_INLINE_ALL_STRINGOPS),
3714 
3715     IX86_ATTR_YES ("inline-stringops-dynamically",
3716 		   OPT_minline_stringops_dynamically,
3717 		   MASK_INLINE_STRINGOPS_DYNAMICALLY),
3718 
3719     IX86_ATTR_NO ("align-stringops",
3720 		  OPT_mno_align_stringops,
3721 		  MASK_NO_ALIGN_STRINGOPS),
3722 
3723     IX86_ATTR_YES ("recip",
3724 		   OPT_mrecip,
3725 		   MASK_RECIP),
3726 
3727   };
3728 
3729   /* If this is a list, recurse to get the options.  */
3730   if (TREE_CODE (args) == TREE_LIST)
3731     {
3732       bool ret = true;
3733 
3734       for (; args; args = TREE_CHAIN (args))
3735 	if (TREE_VALUE (args)
3736 	    && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args), p_strings))
3737 	  ret = false;
3738 
3739       return ret;
3740     }
3741 
3742   else if (TREE_CODE (args) != STRING_CST)
3743     gcc_unreachable ();
3744 
3745   /* Handle multiple arguments separated by commas.  */
3746   next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
3747 
3748   while (next_optstr && *next_optstr != '\0')
3749     {
3750       char *p = next_optstr;
3751       char *orig_p = p;
3752       char *comma = strchr (next_optstr, ',');
3753       const char *opt_string;
3754       size_t len, opt_len;
3755       int opt;
3756       bool opt_set_p;
3757       char ch;
3758       unsigned i;
3759       enum ix86_opt_type type = ix86_opt_unknown;
3760       int mask = 0;
3761 
3762       if (comma)
3763 	{
3764 	  *comma = '\0';
3765 	  len = comma - next_optstr;
3766 	  next_optstr = comma + 1;
3767 	}
3768       else
3769 	{
3770 	  len = strlen (p);
3771 	  next_optstr = NULL;
3772 	}
3773 
3774       /* Recognize no-xxx.  */
3775       if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
3776 	{
3777 	  opt_set_p = false;
3778 	  p += 3;
3779 	  len -= 3;
3780 	}
3781       else
3782 	opt_set_p = true;
3783 
3784       /* Find the option.  */
3785       ch = *p;
3786       opt = N_OPTS;
3787       for (i = 0; i < ARRAY_SIZE (attrs); i++)
3788 	{
3789 	  type = attrs[i].type;
3790 	  opt_len = attrs[i].len;
3791 	  if (ch == attrs[i].string[0]
3792 	      && ((type != ix86_opt_str) ? len == opt_len : len > opt_len)
3793 	      && memcmp (p, attrs[i].string, opt_len) == 0)
3794 	    {
3795 	      opt = attrs[i].opt;
3796 	      mask = attrs[i].mask;
3797 	      opt_string = attrs[i].string;
3798 	      break;
3799 	    }
3800 	}
3801 
3802       /* Process the option.  */
3803       if (opt == N_OPTS)
3804 	{
3805 	  error ("attribute(target(\"%s\")) is unknown", orig_p);
3806 	  ret = false;
3807 	}
3808 
3809       else if (type == ix86_opt_isa)
3810 	ix86_handle_option (opt, p, opt_set_p);
3811 
3812       else if (type == ix86_opt_yes || type == ix86_opt_no)
3813 	{
3814 	  if (type == ix86_opt_no)
3815 	    opt_set_p = !opt_set_p;
3816 
3817 	  if (opt_set_p)
3818 	    target_flags |= mask;
3819 	  else
3820 	    target_flags &= ~mask;
3821 	}
3822 
3823       else if (type == ix86_opt_str)
3824 	{
3825 	  if (p_strings[opt])
3826 	    {
3827 	      error ("option(\"%s\") was already specified", opt_string);
3828 	      ret = false;
3829 	    }
3830 	  else
3831 	    p_strings[opt] = xstrdup (p + opt_len);
3832 	}
3833 
3834       else
3835 	gcc_unreachable ();
3836     }
3837 
3838   return ret;
3839 }
3840 
3841 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL.  */
3842 
3843 tree
3844 ix86_valid_target_attribute_tree (tree args)
3845 {
3846   const char *orig_arch_string = ix86_arch_string;
3847   const char *orig_tune_string = ix86_tune_string;
3848   const char *orig_fpmath_string = ix86_fpmath_string;
3849   int orig_tune_defaulted = ix86_tune_defaulted;
3850   int orig_arch_specified = ix86_arch_specified;
3851   char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL, NULL };
3852   tree t = NULL_TREE;
3853   int i;
3854   struct cl_target_option *def
3855     = TREE_TARGET_OPTION (target_option_default_node);
3856 
3857   /* Process each of the options on the chain.  */
3858   if (! ix86_valid_target_attribute_inner_p (args, option_strings))
3859     return NULL_TREE;
3860 
3861   /* If the changed options are different from the default, rerun override_options,
3862      and then save the options away.  The string options are attribute options,
3863      and will be undone when we copy the save structure.  */
3864   if (ix86_isa_flags != def->ix86_isa_flags
3865       || target_flags != def->target_flags
3866       || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
3867       || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
3868       || option_strings[IX86_FUNCTION_SPECIFIC_FPMATH])
3869     {
3870       /* If we are using the default tune= or arch=, undo the string assigned,
3871 	 and use the default.  */
3872       if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
3873 	ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
3874       else if (!orig_arch_specified)
3875 	ix86_arch_string = NULL;
3876 
3877       if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
3878 	ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
3879       else if (orig_tune_defaulted)
3880 	ix86_tune_string = NULL;
3881 
3882       /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
3883       if (option_strings[IX86_FUNCTION_SPECIFIC_FPMATH])
3884 	ix86_fpmath_string = option_strings[IX86_FUNCTION_SPECIFIC_FPMATH];
3885       else if (!TARGET_64BIT && TARGET_SSE)
3886 	ix86_fpmath_string = "sse,387";
3887 
3888       /* Do any overrides, such as arch=xxx, or tune=xxx support.  */
3889       override_options (false);
3890 
3891       /* Add any builtin functions with the new isa if any.  */
3892       ix86_add_new_builtins (ix86_isa_flags);
3893 
3894       /* Save the current options unless we are validating options for
3895 	 #pragma.  */
3896       t = build_target_option_node ();
3897 
3898       ix86_arch_string = orig_arch_string;
3899       ix86_tune_string = orig_tune_string;
3900       ix86_fpmath_string = orig_fpmath_string;
3901 
3902       /* Free up memory allocated to hold the strings */
3903       for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
3904 	if (option_strings[i])
3905 	  free (option_strings[i]);
3906     }
3907 
3908   return t;
3909 }
3910 
3911 /* Hook to validate attribute((target("string"))).  */
3912 
3913 static bool
3914 ix86_valid_target_attribute_p (tree fndecl,
3915 			       tree ARG_UNUSED (name),
3916 			       tree args,
3917 			       int ARG_UNUSED (flags))
3918 {
3919   struct cl_target_option cur_target;
3920   bool ret = true;
3921   tree old_optimize = build_optimization_node ();
3922   tree new_target, new_optimize;
3923   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
3924 
3925   /* If the function changed the optimization levels as well as setting target
3926      options, start with the optimizations specified.  */
3927   if (func_optimize && func_optimize != old_optimize)
3928     cl_optimization_restore (TREE_OPTIMIZATION (func_optimize));
3929 
3930   /* The target attributes may also change some optimization flags, so update
3931      the optimization options if necessary.  */
3932   cl_target_option_save (&cur_target);
3933   new_target = ix86_valid_target_attribute_tree (args);
3934   new_optimize = build_optimization_node ();
3935 
3936   if (!new_target)
3937     ret = false;
3938 
3939   else if (fndecl)
3940     {
3941       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
3942 
3943       if (old_optimize != new_optimize)
3944 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
3945     }
3946 
3947   cl_target_option_restore (&cur_target);
3948 
3949   if (old_optimize != new_optimize)
3950     cl_optimization_restore (TREE_OPTIMIZATION (old_optimize));
3951 
3952   return ret;
3953 }
3954 
3955 
3956 /* Hook to determine if one function can safely inline another.  */
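
/* Illustrative example (user code, not part of this file): given

     __attribute__((target ("sse4.2"))) static int f (int x);
     static int g (int x) { return f (x); }

   f cannot be inlined into g unless g is also compiled with SSE4.2 enabled,
   since the callee's isa flags must be a subset of the caller's and the
   other target options (flags, arch, tune, fpmath, branch cost) must match.  */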
3957 
3958 static bool
3959 ix86_can_inline_p (tree caller, tree callee)
3960 {
3961   bool ret = false;
3962   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
3963   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
3964 
3965   /* If callee has no option attributes, then it is ok to inline.  */
3966   if (!callee_tree)
3967     ret = true;
3968 
3969   /* If caller has no option attributes, but callee does then it is not ok to
3970      inline.  */
3971   else if (!caller_tree)
3972     ret = false;
3973 
3974   else
3975     {
3976       struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
3977       struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
3978 
3979       /* The callee's isa options should be a subset of the caller's, i.e. an
3980 	 SSE4 function can inline an SSE2 function but an SSE2 function can't
3981 	 inline an SSE4 function.  */
3982       if ((caller_opts->ix86_isa_flags & callee_opts->ix86_isa_flags)
3983 	  != callee_opts->ix86_isa_flags)
3984 	ret = false;
3985 
3986       /* See if we have the same non-isa options.  */
3987       else if (caller_opts->target_flags != callee_opts->target_flags)
3988 	ret = false;
3989 
3990       /* See if arch, tune, etc. are the same.  */
3991       else if (caller_opts->arch != callee_opts->arch)
3992 	ret = false;
3993 
3994       else if (caller_opts->tune != callee_opts->tune)
3995 	ret = false;
3996 
3997       else if (caller_opts->fpmath != callee_opts->fpmath)
3998 	ret = false;
3999 
4000       else if (caller_opts->branch_cost != callee_opts->branch_cost)
4001 	ret = false;
4002 
4003       else
4004 	ret = true;
4005     }
4006 
4007   return ret;
4008 }
4009 
4010 
4011 /* Remember the last target of ix86_set_current_function.  */
4012 static GTY(()) tree ix86_previous_fndecl;
4013 
4014 /* Establish appropriate back-end context for processing the function
4015    FNDECL.  The argument might be NULL to indicate processing at top
4016    level, outside of any function scope.  */
4017 static void
4018 ix86_set_current_function (tree fndecl)
4019 {
4020   /* Only change the context if the function changes.  This hook is called
4021      several times in the course of compiling a function, and we don't want to
4022      slow things down too much or call target_reinit when it isn't safe.  */
4023   if (fndecl && fndecl != ix86_previous_fndecl)
4024     {
4025       tree old_tree = (ix86_previous_fndecl
4026 		       ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4027 		       : NULL_TREE);
4028 
4029       tree new_tree = (fndecl
4030 		       ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4031 		       : NULL_TREE);
4032 
4033       ix86_previous_fndecl = fndecl;
4034       if (old_tree == new_tree)
4035 	;
4036 
4037       else if (new_tree)
4038 	{
4039 	  cl_target_option_restore (TREE_TARGET_OPTION (new_tree));
4040 	  target_reinit ();
4041 	}
4042 
4043       else if (old_tree)
4044 	{
4045 	  struct cl_target_option *def
4046 	    = TREE_TARGET_OPTION (target_option_current_node);
4047 
4048 	  cl_target_option_restore (def);
4049 	  target_reinit ();
4050 	}
4051     }
4052 }
4053 
4054 
4055 /* Return true if this goes in large data/bss.  */
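/* For example (assuming the usual default of -mlarge-data-threshold=65536),
   with -mcmodel=medium a definition such as

     static char buf[1 << 20];

   is considered large data by this predicate and is placed in .lbss by the
   section hooks below.  */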
4056 
4057 static bool
4058 ix86_in_large_data_p (tree exp)
4059 {
4060   if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4061     return false;
4062 
4063   /* Functions are never large data.  */
4064   if (TREE_CODE (exp) == FUNCTION_DECL)
4065     return false;
4066 
4067   if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4068     {
4069       const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4070       if (strcmp (section, ".ldata") == 0
4071 	  || strcmp (section, ".lbss") == 0)
4072 	return true;
4073       return false;
4074     }
4075   else
4076     {
4077       HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4078 
4079       /* If this is an incomplete type with size 0, then we can't put it
4080 	 in data because it might be too big when completed.  */
4081       if (!size || size > ix86_section_threshold)
4082 	return true;
4083     }
4084 
4085   return false;
4086 }
4087 
4088 /* Switch to the appropriate section for output of DECL.
4089    DECL is either a `VAR_DECL' node or a constant of some sort.
4090    RELOC indicates whether forming the initial value of DECL requires
4091    link-time relocations.  */
4092 
4093 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4094 	ATTRIBUTE_UNUSED;
4095 
4096 static section *
4097 x86_64_elf_select_section (tree decl, int reloc,
4098 			   unsigned HOST_WIDE_INT align)
4099 {
4100   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4101       && ix86_in_large_data_p (decl))
4102     {
4103       const char *sname = NULL;
4104       unsigned int flags = SECTION_WRITE;
4105       switch (categorize_decl_for_section (decl, reloc))
4106 	{
4107 	case SECCAT_DATA:
4108 	  sname = ".ldata";
4109 	  break;
4110 	case SECCAT_DATA_REL:
4111 	  sname = ".ldata.rel";
4112 	  break;
4113 	case SECCAT_DATA_REL_LOCAL:
4114 	  sname = ".ldata.rel.local";
4115 	  break;
4116 	case SECCAT_DATA_REL_RO:
4117 	  sname = ".ldata.rel.ro";
4118 	  break;
4119 	case SECCAT_DATA_REL_RO_LOCAL:
4120 	  sname = ".ldata.rel.ro.local";
4121 	  break;
4122 	case SECCAT_BSS:
4123 	  sname = ".lbss";
4124 	  flags |= SECTION_BSS;
4125 	  break;
4126 	case SECCAT_RODATA:
4127 	case SECCAT_RODATA_MERGE_STR:
4128 	case SECCAT_RODATA_MERGE_STR_INIT:
4129 	case SECCAT_RODATA_MERGE_CONST:
4130 	  sname = ".lrodata";
4131 	  flags = 0;
4132 	  break;
4133 	case SECCAT_SRODATA:
4134 	case SECCAT_SDATA:
4135 	case SECCAT_SBSS:
4136 	  gcc_unreachable ();
4137 	case SECCAT_TEXT:
4138 	case SECCAT_TDATA:
4139 	case SECCAT_TBSS:
4140 	  /* We don't split these for the medium model.  Place them into
4141 	     default sections and hope for the best.  */
4142 	  break;
4143 	case SECCAT_EMUTLS_VAR:
4144 	case SECCAT_EMUTLS_TMPL:
4145 	  gcc_unreachable ();
4146 	}
4147       if (sname)
4148 	{
4149 	  /* We might get called with string constants, but get_named_section
4150 	     doesn't like them as they are not DECLs.  Also, we need to set
4151 	     flags in that case.  */
4152 	  if (!DECL_P (decl))
4153 	    return get_section (sname, flags, NULL);
4154 	  return get_named_section (decl, sname, reloc);
4155 	}
4156     }
4157   return default_elf_select_section (decl, reloc, align);
4158 }
4159 
4160 /* Build up a unique section name, expressed as a
4161    STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4162    RELOC indicates whether the initial value of EXP requires
4163    link-time relocations.  */
4164 
4165 static void ATTRIBUTE_UNUSED
4166 x86_64_elf_unique_section (tree decl, int reloc)
4167 {
4168   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4169       && ix86_in_large_data_p (decl))
4170     {
4171       const char *prefix = NULL;
4172       /* We only need to use .gnu.linkonce if we don't have COMDAT groups.  */
4173       bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4174 
4175       switch (categorize_decl_for_section (decl, reloc))
4176 	{
4177 	case SECCAT_DATA:
4178 	case SECCAT_DATA_REL:
4179 	case SECCAT_DATA_REL_LOCAL:
4180 	case SECCAT_DATA_REL_RO:
4181 	case SECCAT_DATA_REL_RO_LOCAL:
4182           prefix = one_only ? ".ld" : ".ldata";
4183 	  break;
4184 	case SECCAT_BSS:
4185           prefix = one_only ? ".lb" : ".lbss";
4186 	  break;
4187 	case SECCAT_RODATA:
4188 	case SECCAT_RODATA_MERGE_STR:
4189 	case SECCAT_RODATA_MERGE_STR_INIT:
4190 	case SECCAT_RODATA_MERGE_CONST:
4191           prefix = one_only ? ".lr" : ".lrodata";
4192 	  break;
4193 	case SECCAT_SRODATA:
4194 	case SECCAT_SDATA:
4195 	case SECCAT_SBSS:
4196 	  gcc_unreachable ();
4197 	case SECCAT_TEXT:
4198 	case SECCAT_TDATA:
4199 	case SECCAT_TBSS:
4200 	  /* We don't split these for the medium model.  Place them into
4201 	     default sections and hope for the best.  */
4202 	  break;
4203 	case SECCAT_EMUTLS_VAR:
4204 	  prefix = targetm.emutls.var_section;
4205 	  break;
4206 	case SECCAT_EMUTLS_TMPL:
4207 	  prefix = targetm.emutls.tmpl_section;
4208 	  break;
4209 	}
4210       if (prefix)
4211 	{
4212 	  const char *name, *linkonce;
4213 	  char *string;
4214 
4215 	  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4216 	  name = targetm.strip_name_encoding (name);
4217 
4218 	  /* If we're using one_only, then there needs to be a .gnu.linkonce
4219 	     prefix to the section name.  */
4220 	  linkonce = one_only ? ".gnu.linkonce" : "";
4221 
4222 	  string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4223 
4224 	  DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4225 	  return;
4226 	}
4227     }
4228   default_unique_section (decl, reloc);
4229 }
4230 
4231 #ifdef COMMON_ASM_OP
4232 /* This says how to output assembler code to declare an
4233    uninitialized external linkage data object.
4234 
4235    For the x86-64 medium model we need to use the .largecomm directive
4236    for large objects.  */
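     /* Illustrative only: for a one-megabyte common symbol this emits
        something like

          .largecomm	buf,1048576,32

        instead of the usual .comm directive; the symbol name and alignment
        are hypothetical.  */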
4237 void
4238 x86_elf_aligned_common (FILE *file,
4239 			const char *name, unsigned HOST_WIDE_INT size,
4240 			int align)
4241 {
4242   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4243       && size > (unsigned int)ix86_section_threshold)
4244     fputs (".largecomm\t", file);
4245   else
4246     fputs (COMMON_ASM_OP, file);
4247   assemble_name (file, name);
4248   fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4249 	   size, align / BITS_PER_UNIT);
4250 }
4251 #endif
4252 
4253 /* Utility function for targets to use in implementing
4254    ASM_OUTPUT_ALIGNED_BSS.  */
4255 
4256 void
4257 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4258 			const char *name, unsigned HOST_WIDE_INT size,
4259 			int align)
4260 {
4261   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4262       && size > (unsigned int)ix86_section_threshold)
4263     switch_to_section (get_named_section (decl, ".lbss", 0));
4264   else
4265     switch_to_section (bss_section);
4266   ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4267 #ifdef ASM_DECLARE_OBJECT_NAME
4268   last_assemble_variable_decl = decl;
4269   ASM_DECLARE_OBJECT_NAME (file, name, decl);
4270 #else
4271   /* Standard thing is just output label for the object.  */
4272   ASM_OUTPUT_LABEL (file, name);
4273 #endif /* ASM_DECLARE_OBJECT_NAME */
4274   ASM_OUTPUT_SKIP (file, size ? size : 1);
4275 }
4276 
4277 void
4278 optimization_options (int level, int size ATTRIBUTE_UNUSED)
4279 {
4280   /* For -O2 and beyond, turn off -fschedule-insns by default.  It tends to
4281      make the problem with not enough registers even worse.  */
4282 #ifdef INSN_SCHEDULING
4283   if (level > 1)
4284     flag_schedule_insns = 0;
4285 #endif
4286 
4287   if (TARGET_MACHO)
4288     /* The Darwin libraries never set errno, so we might as well
4289        avoid calling them when that's the only reason we would.  */
4290     flag_errno_math = 0;
4291 
4292   /* The default values of these switches depend on TARGET_64BIT, which is
4293      not known at this moment.  Mark these values with 2 and let the user
4294      override them.  In case there is no command line option specifying
4295      them, we will set the defaults in override_options.  */
4296   if (optimize >= 1)
4297     flag_omit_frame_pointer = 2;
4298   flag_pcc_struct_return = 2;
4299   flag_asynchronous_unwind_tables = 2;
4300   flag_vect_cost_model = 1;
4301 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
4302   SUBTARGET_OPTIMIZATION_OPTIONS;
4303 #endif
4304 }
4305 
4306 /* Decide whether we can make a sibling call to a function.  DECL is the
4307    declaration of the function being targeted by the call and EXP is the
4308    CALL_EXPR representing the call.  */
4309 
4310 static bool
4311 ix86_function_ok_for_sibcall (tree decl, tree exp)
4312 {
4313   tree type, decl_or_type;
4314   rtx a, b;
4315 
4316   /* If we are generating position-independent code, we cannot sibcall
4317      optimize any indirect call, or a direct call to a global function,
4318      as the PLT requires %ebx be live.  */
4319   if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
4320     return false;
4321 
4322   /* If we need to align the outgoing stack, then sibcalling would
4323      unalign the stack, which may break the called function.  */
4324   if (ix86_minimum_incoming_stack_boundary (true)
4325       < PREFERRED_STACK_BOUNDARY)
4326     return false;
4327 
4328   if (decl)
4329     {
4330       decl_or_type = decl;
4331       type = TREE_TYPE (decl);
4332     }
4333   else
4334     {
4335       /* We're looking at the CALL_EXPR, we need the type of the function.  */
4336       type = CALL_EXPR_FN (exp);		/* pointer expression */
4337       type = TREE_TYPE (type);			/* pointer type */
4338       type = TREE_TYPE (type);			/* function type */
4339       decl_or_type = type;
4340     }
4341 
4342   /* Check that the return value locations are the same.  For example,
4343      if we are returning floats on the 80387 register stack, we cannot
4344      make a sibcall from a function that doesn't return a float to a
4345      function that does or, conversely, from a function that does return
4346      a float to a function that doesn't; the necessary stack adjustment
4347      would not be executed.  This is also the place we notice
4348      differences in the return value ABI.  Note that it is ok for one
4349      of the functions to have void return type as long as the return
4350      value of the other is passed in a register.  */
4351   a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4352   b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4353 			   cfun->decl, false);
4354   if (STACK_REG_P (a) || STACK_REG_P (b))
4355     {
4356       if (!rtx_equal_p (a, b))
4357 	return false;
4358     }
4359   else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4360     ;
4361   else if (!rtx_equal_p (a, b))
4362     return false;
4363 
4364   if (TARGET_64BIT)
4365     {
4366       /* The SYSV ABI has more call-clobbered registers;
4367 	 disallow sibcalls from MS to SYSV.  */
4368       if (cfun->machine->call_abi == MS_ABI
4369 	  && ix86_function_type_abi (type) == SYSV_ABI)
4370 	return false;
4371     }
4372   else
4373     {
4374       /* If this call is indirect, we'll need to be able to use a
4375 	 call-clobbered register for the address of the target function.
4376 	 Make sure that not all such registers are used for passing
4377 	 parameters.  Note that DLLIMPORT functions are indirect.  */
4378       if (!decl
4379 	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4380 	{
4381 	  if (ix86_function_regparm (type, NULL) >= 3)
4382 	    {
4383 	      /* ??? Need to count the actual number of registers to be used,
4384 		 not the possible number of registers.  Fix later.  */
4385 	      return false;
4386 	    }
4387 	}
4388     }
4389 
4390   /* Otherwise okay.  That also includes certain types of indirect calls.  */
4391   return true;
4392 }
4393 
4394 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
4395    calling convention attributes;
4396    arguments as in struct attribute_spec.handler.  */
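     /* Illustrative only: these attributes appear on function types in user
        code, for instance

          int __attribute__ ((fastcall)) f (int a, int b);
          int __attribute__ ((stdcall)) g (int a, int b);

        The handler below diagnoses incompatible combinations such as
        fastcall combined with regparm or cdecl.  */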
4397 
4398 static tree
4399 ix86_handle_cconv_attribute (tree *node, tree name,
4400 				   tree args,
4401 				   int flags ATTRIBUTE_UNUSED,
4402 				   bool *no_add_attrs)
4403 {
4404   if (TREE_CODE (*node) != FUNCTION_TYPE
4405       && TREE_CODE (*node) != METHOD_TYPE
4406       && TREE_CODE (*node) != FIELD_DECL
4407       && TREE_CODE (*node) != TYPE_DECL)
4408     {
4409       warning (OPT_Wattributes, "%qE attribute only applies to functions",
4410 	       name);
4411       *no_add_attrs = true;
4412       return NULL_TREE;
4413     }
4414 
4415   /* Can combine regparm with all attributes but fastcall.  */
4416   if (is_attribute_p ("regparm", name))
4417     {
4418       tree cst;
4419 
4420       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4421         {
4422 	  error ("fastcall and regparm attributes are not compatible");
4423 	}
4424 
4425       cst = TREE_VALUE (args);
4426       if (TREE_CODE (cst) != INTEGER_CST)
4427 	{
4428 	  warning (OPT_Wattributes,
4429 		   "%qE attribute requires an integer constant argument",
4430 		   name);
4431 	  *no_add_attrs = true;
4432 	}
4433       else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4434 	{
4435 	  warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4436 		   name, REGPARM_MAX);
4437 	  *no_add_attrs = true;
4438 	}
4439 
4440       return NULL_TREE;
4441     }
4442 
4443   if (TARGET_64BIT)
4444     {
4445       /* Do not warn when emulating the MS ABI.  */
4446       if (TREE_CODE (*node) != FUNCTION_TYPE
4447 	  || ix86_function_type_abi (*node) != MS_ABI)
4448 	warning (OPT_Wattributes, "%qE attribute ignored",
4449 	         name);
4450       *no_add_attrs = true;
4451       return NULL_TREE;
4452     }
4453 
4454   /* Can combine fastcall with stdcall (redundant) and sseregparm.  */
4455   if (is_attribute_p ("fastcall", name))
4456     {
4457       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4458         {
4459 	  error ("fastcall and cdecl attributes are not compatible");
4460 	}
4461       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4462         {
4463 	  error ("fastcall and stdcall attributes are not compatible");
4464 	}
4465       if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4466         {
4467 	  error ("fastcall and regparm attributes are not compatible");
4468 	}
4469     }
4470 
4471   /* Can combine stdcall with fastcall (redundant), regparm and
4472      sseregparm.  */
4473   else if (is_attribute_p ("stdcall", name))
4474     {
4475       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4476         {
4477 	  error ("stdcall and cdecl attributes are not compatible");
4478 	}
4479       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4480         {
4481 	  error ("stdcall and fastcall attributes are not compatible");
4482 	}
4483     }
4484 
4485   /* Can combine cdecl with regparm and sseregparm.  */
4486   else if (is_attribute_p ("cdecl", name))
4487     {
4488       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4489         {
4490 	  error ("stdcall and cdecl attributes are not compatible");
4491 	}
4492       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4493         {
4494 	  error ("fastcall and cdecl attributes are not compatible");
4495 	}
4496     }
4497 
4498   /* Can combine sseregparm with all attributes.  */
4499 
4500   return NULL_TREE;
4501 }
4502 
4503 /* Return 0 if the attributes for two types are incompatible, 1 if they
4504    are compatible, and 2 if they are nearly compatible (which causes a
4505    warning to be generated).  */
4506 
4507 static int
4508 ix86_comp_type_attributes (const_tree type1, const_tree type2)
4509 {
4510   /* Check for mismatch of non-default calling convention.  */
4511   const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
4512 
4513   if (TREE_CODE (type1) != FUNCTION_TYPE
4514       && TREE_CODE (type1) != METHOD_TYPE)
4515     return 1;
4516 
4517   /* Check for mismatched fastcall/regparm types.  */
4518   if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1))
4519        != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2)))
4520       || (ix86_function_regparm (type1, NULL)
4521 	  != ix86_function_regparm (type2, NULL)))
4522     return 0;
4523 
4524   /* Check for mismatched sseregparm types.  */
4525   if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1))
4526       != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
4527     return 0;
4528 
4529   /* Check for mismatched return types (cdecl vs stdcall).  */
4530   if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
4531       != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
4532     return 0;
4533 
4534   return 1;
4535 }
4536 
4537 /* Return the regparm value for a function with the indicated TYPE and DECL.
4538    DECL may be NULL when calling function indirectly
4539    or considering a libcall.  */
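     /* Illustrative only: with -mregparm=3, or for a declaration such as

          int __attribute__ ((regparm (2))) f (int a, int b);

        this typically returns 3 or 2 respectively on ia32; in 64-bit mode
        the ABI-defined maximum is returned instead.  */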
4540 
4541 static int
4542 ix86_function_regparm (const_tree type, const_tree decl)
4543 {
4544   tree attr;
4545   int regparm;
4546 
4547   if (TARGET_64BIT)
4548     return (ix86_function_type_abi (type) == SYSV_ABI
4549 	    ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
4550 
4551   regparm = ix86_regparm;
4552   attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
4553   if (attr)
4554     {
4555       regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
4556       return regparm;
4557     }
4558 
4559   if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
4560     return 2;
4561 
4562   /* Use register calling convention for local functions when possible.  */
4563   if (decl
4564       && TREE_CODE (decl) == FUNCTION_DECL
4565       && optimize
4566       && !profile_flag)
4567     {
4568       /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
4569       struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
4570       if (i && i->local)
4571 	{
4572 	  int local_regparm, globals = 0, regno;
4573 
4574 	  /* Make sure no regparm register is taken by a
4575 	     fixed register variable.  */
4576 	  for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
4577 	    if (fixed_regs[local_regparm])
4578 	      break;
4579 
4580 	  /* We don't want to use regparm(3) for nested functions as
4581 	     these use a static chain pointer in the third argument.  */
4582 	  if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
4583 	    local_regparm = 2;
4584 
4585 	  /* Each fixed register usage increases register pressure,
4586 	     so fewer registers should be used for argument passing.
4587 	     This functionality can be overridden by an explicit
4588 	     regparm value.  */
4589 	  for (regno = 0; regno <= DI_REG; regno++)
4590 	    if (fixed_regs[regno])
4591 	      globals++;
4592 
4593 	  local_regparm
4594 	    = globals < local_regparm ? local_regparm - globals : 0;
4595 
4596 	  if (local_regparm > regparm)
4597 	    regparm = local_regparm;
4598 	}
4599     }
4600 
4601   return regparm;
4602 }
4603 
4604 /* Return 1 or 2 if we can pass up to SSE_REGPARM_MAX SFmode (1) and
4605    DFmode (2) arguments in SSE registers for a function with the
4606    indicated TYPE and DECL.  DECL may be NULL when calling function
4607    indirectly or considering a libcall.  Otherwise return 0.  */
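     /* Illustrative only: on ia32 with SSE enabled, a declaration such as

          double __attribute__ ((sseregparm)) f (double x, double y);

        passes X and Y in SSE registers rather than on the 80387 stack;
        without SSE the error below is emitted.  The function is
        hypothetical.  */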
4608 
4609 static int
4610 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
4611 {
4612   gcc_assert (!TARGET_64BIT);
4613 
4614   /* Use SSE registers to pass SFmode and DFmode arguments if requested
4615      by the sseregparm attribute.  */
4616   if (TARGET_SSEREGPARM
4617       || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
4618     {
4619       if (!TARGET_SSE)
4620 	{
4621 	  if (warn)
4622 	    {
4623 	      if (decl)
4624 		error ("Calling %qD with attribute sseregparm without "
4625 		       "SSE/SSE2 enabled", decl);
4626 	      else
4627 		error ("Calling %qT with attribute sseregparm without "
4628 		       "SSE/SSE2 enabled", type);
4629 	    }
4630 	  return 0;
4631 	}
4632 
4633       return 2;
4634     }
4635 
4636   /* For local functions, pass up to SSE_REGPARM_MAX SFmode
4637      (and DFmode for SSE2) arguments in SSE registers.  */
4638   if (decl && TARGET_SSE_MATH && optimize && !profile_flag)
4639     {
4640       /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
4641       struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
4642       if (i && i->local)
4643 	return TARGET_SSE2 ? 2 : 1;
4644     }
4645 
4646   return 0;
4647 }
4648 
4649 /* Return true if EAX is live at the start of the function.  Used by
4650    ix86_expand_prologue to determine if we need special help before
4651    calling allocate_stack_worker.  */
4652 
4653 static bool
4654 ix86_eax_live_at_start_p (void)
4655 {
4656   /* Cheat.  Don't bother working forward from ix86_function_regparm
4657      to the function type to whether an actual argument is located in
4658      eax.  Instead just look at cfg info, which is still close enough
4659      to correct at this point.  This gives false positives for broken
4660      functions that might use uninitialized data that happens to be
4661      allocated in eax, but who cares?  */
4662   return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
4663 }
4664 
4665 /* Value is the number of bytes of arguments automatically
4666    popped when returning from a subroutine call.
4667    FUNDECL is the declaration node of the function (as a tree),
4668    FUNTYPE is the data type of the function (as a tree),
4669    or for a library call it is an identifier node for the subroutine name.
4670    SIZE is the number of bytes of arguments passed on the stack.
4671 
4672    On the 80386, the RTD insn may be used to pop them if the number
4673      of args is fixed, but if the number is variable then the caller
4674      must pop them all.  RTD can't be used for library calls now
4675      because the library is compiled with the Unix compiler.
4676    Use of RTD is a selectable option, since it is incompatible with
4677    standard Unix calling sequences.  If the option is not selected,
4678    the caller must always pop the args.
4679 
4680    The attribute stdcall is equivalent to RTD on a per module basis.  */
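     /* Illustrative only: for a 32-bit stdcall function such as

          int __attribute__ ((stdcall)) f (int a, int b);

        this returns 8, so the callee pops its two stack arguments (e.g. with
        "ret $8"), whereas a plain cdecl function returns 0 and leaves the
        popping to the caller.  */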
4681 
4682 int
4683 ix86_return_pops_args (tree fundecl, tree funtype, int size)
4684 {
4685   int rtd;
4686 
4687   /* None of the 64-bit ABIs pop arguments.  */
4688   if (TARGET_64BIT)
4689     return 0;
4690 
4691   rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE);
4692 
4693   /* Cdecl functions override -mrtd, and never pop the stack.  */
4694   if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype)))
4695     {
4696       /* Stdcall and fastcall functions will pop the stack if they do
4697          not take variable args.  */
4698       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype))
4699           || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)))
4700 	rtd = 1;
4701 
4702       if (rtd && ! stdarg_p (funtype))
4703 	return size;
4704     }
4705 
4706   /* Lose any fake structure return argument if it is passed on the stack.  */
4707   if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
4708       && !KEEP_AGGREGATE_RETURN_POINTER)
4709     {
4710       int nregs = ix86_function_regparm (funtype, fundecl);
4711       if (nregs == 0)
4712 	return GET_MODE_SIZE (Pmode);
4713     }
4714 
4715   return 0;
4716 }
4717 
4718 /* Argument support functions.  */
4719 
4720 /* Return true when REGNO may be used to pass function parameters.  */
4721 bool
4722 ix86_function_arg_regno_p (int regno)
4723 {
4724   int i;
4725   const int *parm_regs;
4726 
4727   if (!TARGET_64BIT)
4728     {
4729       if (TARGET_MACHO)
4730         return (regno < REGPARM_MAX
4731                 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
4732       else
4733         return (regno < REGPARM_MAX
4734 	        || (TARGET_MMX && MMX_REGNO_P (regno)
4735 	  	    && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
4736 	        || (TARGET_SSE && SSE_REGNO_P (regno)
4737 		    && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
4738     }
4739 
4740   if (TARGET_MACHO)
4741     {
4742       if (SSE_REGNO_P (regno) && TARGET_SSE)
4743         return true;
4744     }
4745   else
4746     {
4747       if (TARGET_SSE && SSE_REGNO_P (regno)
4748           && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
4749         return true;
4750     }
4751 
4752   /* TODO: The function should depend on the current function's ABI, but
4753      builtins.c would need updating then.  Therefore we use the
4754      default ABI.  */
4755 
4756   /* RAX is used as hidden argument to va_arg functions.  */
4757   if (ix86_abi == SYSV_ABI && regno == AX_REG)
4758     return true;
4759 
4760   if (ix86_abi == MS_ABI)
4761     parm_regs = x86_64_ms_abi_int_parameter_registers;
4762   else
4763     parm_regs = x86_64_int_parameter_registers;
4764   for (i = 0; i < (ix86_abi == MS_ABI
4765 		   ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
4766     if (regno == parm_regs[i])
4767       return true;
4768   return false;
4769 }
4770 
4771 /* Return true if we do not know how to pass TYPE solely in registers.  */
4772 
4773 static bool
4774 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
4775 {
4776   if (must_pass_in_stack_var_size_or_pad (mode, type))
4777     return true;
4778 
4779   /* For 32-bit, we want TImode aggregates to go on the stack.  But watch out!
4780      The layout_type routine is crafty and tries to trick us into passing
4781      currently unsupported vector types on the stack by using TImode.  */
4782   return (!TARGET_64BIT && mode == TImode
4783 	  && type && TREE_CODE (type) != VECTOR_TYPE);
4784 }
4785 
4786 /* Return the size, in bytes, of the area reserved for arguments passed
4787    in registers for the function represented by FNDECL, depending on the
4788    ABI format used.  */
4789 int
4790 ix86_reg_parm_stack_space (const_tree fndecl)
4791 {
4792   enum calling_abi call_abi = SYSV_ABI;
4793   if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
4794     call_abi = ix86_function_abi (fndecl);
4795   else
4796     call_abi = ix86_function_type_abi (fndecl);
4797   if (call_abi == MS_ABI)
4798     return 32;
4799   return 0;
4800 }
4801 
4802 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
4803    calling ABI used.  */
4804 enum calling_abi
4805 ix86_function_type_abi (const_tree fntype)
4806 {
4807   if (TARGET_64BIT && fntype != NULL)
4808     {
4809       enum calling_abi abi = ix86_abi;
4810       if (abi == SYSV_ABI)
4811 	{
4812 	  if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
4813 	    abi = MS_ABI;
4814 	}
4815       else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
4816 	abi = SYSV_ABI;
4817       return abi;
4818     }
4819   return ix86_abi;
4820 }
4821 
4822 static bool
4823 ix86_function_ms_hook_prologue (const_tree fntype)
4824 {
4825   if (!TARGET_64BIT)
4826     {
4827       if (lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fntype)))
4828         {
4829           if (decl_function_context (fntype) != NULL_TREE)
4830           {
4831             error_at (DECL_SOURCE_LOCATION (fntype),
4832                 "ms_hook_prologue is not compatible with nested function");
4833           }
4834 
4835           return true;
4836         }
4837     }
4838   return false;
4839 }
4840 
4841 static enum calling_abi
4842 ix86_function_abi (const_tree fndecl)
4843 {
4844   if (! fndecl)
4845     return ix86_abi;
4846   return ix86_function_type_abi (TREE_TYPE (fndecl));
4847 }
4848 
4849 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
4850    calling ABI used.  */
4851 enum calling_abi
4852 ix86_cfun_abi (void)
4853 {
4854   if (! cfun || ! TARGET_64BIT)
4855     return ix86_abi;
4856   return cfun->machine->call_abi;
4857 }
4858 
4859 /* regclass.c  */
4860 extern void init_regs (void);
4861 
4862 /* Implementation of the call ABI switching target hook.  The call
4863    register sets specific to FNDECL are set up here.  See also
4864    CONDITIONAL_REGISTER_USAGE for more details.  */
4865 void
4866 ix86_call_abi_override (const_tree fndecl)
4867 {
4868   if (fndecl == NULL_TREE)
4869     cfun->machine->call_abi = ix86_abi;
4870   else
4871     cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
4872 }
4873 
4874 /* The MS and SYSV ABIs have different sets of call-used registers.  Avoid
4875    expensive re-initialization of init_regs each time we switch function
4876    context, since this is needed only during RTL expansion.  */
4877 static void
4878 ix86_maybe_switch_abi (void)
4879 {
4880   if (TARGET_64BIT &&
4881       call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
4882     reinit_regs ();
4883 }
4884 
4885 /* Initialize a variable CUM of type CUMULATIVE_ARGS
4886    for a call to a function whose data type is FNTYPE.
4887    For a library call, FNTYPE is 0.  */
4888 
4889 void
4890 init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
4891 		      tree fntype,	/* tree ptr for function decl */
4892 		      rtx libname,	/* SYMBOL_REF of library name or 0 */
4893 		      tree fndecl)
4894 {
4895   struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL;
4896   memset (cum, 0, sizeof (*cum));
4897 
4898   if (fndecl)
4899    cum->call_abi = ix86_function_abi (fndecl);
4900   else
4901    cum->call_abi = ix86_function_type_abi (fntype);
4902   /* Set up the number of registers to use for passing arguments.  */
4903 
4904   if (cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
4905     sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
4906 	   "or subtarget optimization implying it");
4907   cum->nregs = ix86_regparm;
4908   if (TARGET_64BIT)
4909     {
4910       cum->nregs = (cum->call_abi == SYSV_ABI
4911                    ? X86_64_REGPARM_MAX
4912                    : X86_64_MS_REGPARM_MAX);
4913     }
4914   if (TARGET_SSE)
4915     {
4916       cum->sse_nregs = SSE_REGPARM_MAX;
4917       if (TARGET_64BIT)
4918         {
4919           cum->sse_nregs = (cum->call_abi == SYSV_ABI
4920                            ? X86_64_SSE_REGPARM_MAX
4921                            : X86_64_MS_SSE_REGPARM_MAX);
4922         }
4923     }
4924   if (TARGET_MMX)
4925     cum->mmx_nregs = MMX_REGPARM_MAX;
4926   cum->warn_avx = true;
4927   cum->warn_sse = true;
4928   cum->warn_mmx = true;
4929 
4930   /* Because the type might mismatch between caller and callee, we need to
4931      use the actual type of the function for local calls.
4932      FIXME: cgraph_analyze can be told to actually record whether a function
4933      uses va_start, so for local functions maybe_vaarg can be made more
4934      aggressive, helping K&R code.
4935      FIXME: once the type system is fixed, we won't need this code anymore.  */
4936   if (i && i->local)
4937     fntype = TREE_TYPE (fndecl);
4938   cum->maybe_vaarg = (fntype
4939 		      ? (!prototype_p (fntype) || stdarg_p (fntype))
4940 		      : !libname);
4941 
4942   if (!TARGET_64BIT)
4943     {
4944       /* If there are variable arguments, then we won't pass anything
4945          in registers in 32-bit mode. */
4946       if (stdarg_p (fntype))
4947 	{
4948 	  cum->nregs = 0;
4949 	  cum->sse_nregs = 0;
4950 	  cum->mmx_nregs = 0;
4951 	  cum->warn_avx = 0;
4952 	  cum->warn_sse = 0;
4953 	  cum->warn_mmx = 0;
4954 	  return;
4955 	}
4956 
4957       /* Use ecx and edx registers if function has fastcall attribute,
4958 	 else look for regparm information.  */
4959       if (fntype)
4960 	{
4961 	  if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
4962 	    {
4963 	      cum->nregs = 2;
4964 	      cum->fastcall = 1;
4965 	    }
4966 	  else
4967 	    cum->nregs = ix86_function_regparm (fntype, fndecl);
4968 	}
4969 
4970       /* Set up the number of SSE registers used for passing SFmode
4971 	 and DFmode arguments.  Warn for mismatching ABI.  */
4972       cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
4973     }
4974 }
4975 
4976 /* Return the "natural" mode for TYPE.  In most cases, this is just TYPE_MODE.
4977    But in the case of vector types, it is some vector mode.
4978 
4979    When we have only some of our vector isa extensions enabled, then there
4980    are some modes for which vector_mode_supported_p is false.  For these
4981    modes, the generic vector support in gcc will choose some non-vector mode
4982    in order to implement the type.  By computing the natural mode, we'll
4983    select the proper ABI location for the operand and not depend on whatever
4984    the middle-end decides to do with these vector types.
4985 
4986    The midde-end can't deal with the vector types > 16 bytes.  In this
4987    The middle-end can't deal with vector types larger than 16 bytes.  In
4988    this case, we return the original mode and warn of the ABI change if
4989    CUM isn't NULL.  */
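     /* Illustrative only: a user type such as

          typedef int v2si __attribute__ ((vector_size (8)));

        may be laid out with a non-vector mode (e.g. DImode) by the middle-end
        when the matching vector ISA is disabled; this routine still computes
        V2SImode so the argument gets the ABI location of an 8-byte vector.  */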
4990 static enum machine_mode
4991 type_natural_mode (const_tree type, CUMULATIVE_ARGS *cum)
4992 {
4993   enum machine_mode mode = TYPE_MODE (type);
4994 
4995   if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
4996     {
4997       HOST_WIDE_INT size = int_size_in_bytes (type);
4998       if ((size == 8 || size == 16 || size == 32)
4999 	  /* ??? Generic code allows us to create width 1 vectors.  Ignore.  */
5000 	  && TYPE_VECTOR_SUBPARTS (type) > 1)
5001 	{
5002 	  enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5003 
5004 	  if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5005 	    mode = MIN_MODE_VECTOR_FLOAT;
5006 	  else
5007 	    mode = MIN_MODE_VECTOR_INT;
5008 
5009 	  /* Get the mode which has this inner mode and number of units.  */
5010 	  for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5011 	    if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5012 		&& GET_MODE_INNER (mode) == innermode)
5013 	      {
5014 		if (size == 32 && !TARGET_AVX)
5015 		  {
5016 		    static bool warnedavx;
5017 
5018 		    if (cum
5019 			&& !warnedavx
5020 			&& cum->warn_avx)
5021 		      {
5022 			warnedavx = true;
5023 			warning (0, "AVX vector argument without AVX "
5024 				 "enabled changes the ABI");
5025 		      }
5026 		    return TYPE_MODE (type);
5027 		  }
5028 		else
5029 		  return mode;
5030 	      }
5031 
5032 	  gcc_unreachable ();
5033 	}
5034     }
5035 
5036   return mode;
5037 }
5038 
5039 /* We want to pass a value in REGNO whose "natural" mode is MODE.  However,
5040    this may not agree with the mode that the type system has chosen for the
5041    register, which is ORIG_MODE.  If ORIG_MODE is not BLKmode, then we can
5042    go ahead and use it.  Otherwise we have to build a PARALLEL instead.  */
5043 
5044 static rtx
5045 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5046 		     unsigned int regno)
5047 {
5048   rtx tmp;
5049 
5050   if (orig_mode != BLKmode)
5051     tmp = gen_rtx_REG (orig_mode, regno);
5052   else
5053     {
5054       tmp = gen_rtx_REG (mode, regno);
5055       tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5056       tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5057     }
5058 
5059   return tmp;
5060 }
5061 
5062 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
5063    The goal of this code is to classify each 8 bytes of an incoming argument
5064    by register class and assign registers accordingly.  */
5065 
5066 /* Return the union class of CLASS1 and CLASS2.
5067    See the x86-64 PS ABI for details.  */
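     /* Illustrative only: merging X86_64_INTEGERSI_CLASS with
        X86_64_SSESF_CLASS yields X86_64_INTEGERSI_CLASS (rule #4), while
        merging anything with X86_64_MEMORY_CLASS yields
        X86_64_MEMORY_CLASS (rule #3).  */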
5068 
5069 static enum x86_64_reg_class
5070 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5071 {
5072   /* Rule #1: If both classes are equal, this is the resulting class.  */
5073   if (class1 == class2)
5074     return class1;
5075 
5076   /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5077      the other class.  */
5078   if (class1 == X86_64_NO_CLASS)
5079     return class2;
5080   if (class2 == X86_64_NO_CLASS)
5081     return class1;
5082 
5083   /* Rule #3: If one of the classes is MEMORY, the result is MEMORY.  */
5084   if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5085     return X86_64_MEMORY_CLASS;
5086 
5087   /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
5088   if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5089       || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5090     return X86_64_INTEGERSI_CLASS;
5091   if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5092       || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5093     return X86_64_INTEGER_CLASS;
5094 
5095   /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5096      MEMORY is used.  */
5097   if (class1 == X86_64_X87_CLASS
5098       || class1 == X86_64_X87UP_CLASS
5099       || class1 == X86_64_COMPLEX_X87_CLASS
5100       || class2 == X86_64_X87_CLASS
5101       || class2 == X86_64_X87UP_CLASS
5102       || class2 == X86_64_COMPLEX_X87_CLASS)
5103     return X86_64_MEMORY_CLASS;
5104 
5105   /* Rule #6: Otherwise class SSE is used.  */
5106   return X86_64_SSE_CLASS;
5107 }
5108 
5109 /* Classify the argument of type TYPE and mode MODE.
5110    CLASSES will be filled by the register class used to pass each word
5111    of the operand.  The number of words is returned.  In case the parameter
5112    should be passed in memory, 0 is returned. As a special case for zero
5113    sized containers, classes[0] will be NO_CLASS and 1 is returned.
5114 
5115    BIT_OFFSET is used internally for handling records and specifies the
5116    offset in bits modulo 256 to avoid overflow cases.
5117 
5118    See the x86-64 PS ABI for details.
5119 */
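     /* Illustrative only (struct and field names are hypothetical):

          struct s1 { long l; double d; };

        classifies as one INTEGER and one SSE (SSEDF) eightbyte, so it is
        passed in a general and an SSE register, whereas

          struct s2 { char c[40]; };

        is larger than 32 bytes and classifies as memory (return value 0).  */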
5120 
5121 static int
5122 classify_argument (enum machine_mode mode, const_tree type,
5123 		   enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5124 {
5125   HOST_WIDE_INT bytes =
5126     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5127   int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5128 
5129   /* Variable sized entities are always passed/returned in memory.  */
5130   if (bytes < 0)
5131     return 0;
5132 
5133   if (mode != VOIDmode
5134       && targetm.calls.must_pass_in_stack (mode, type))
5135     return 0;
5136 
5137   if (type && AGGREGATE_TYPE_P (type))
5138     {
5139       int i;
5140       tree field;
5141       enum x86_64_reg_class subclasses[MAX_CLASSES];
5142 
5143       /* On x86-64 we pass structures larger than 32 bytes on the stack.  */
5144       if (bytes > 32)
5145 	return 0;
5146 
5147       for (i = 0; i < words; i++)
5148 	classes[i] = X86_64_NO_CLASS;
5149 
5150       /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
5151 	 signal the memory class, so handle this as a special case.  */
5152       if (!words)
5153 	{
5154 	  classes[0] = X86_64_NO_CLASS;
5155 	  return 1;
5156 	}
5157 
5158       /* Classify each field of record and merge classes.  */
5159       switch (TREE_CODE (type))
5160 	{
5161 	case RECORD_TYPE:
5162 	  /* And now merge the fields of structure.  */
5163 	  for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
5164 	    {
5165 	      if (TREE_CODE (field) == FIELD_DECL)
5166 		{
5167 		  int num;
5168 
5169 		  if (TREE_TYPE (field) == error_mark_node)
5170 		    continue;
5171 
5172 		  /* Bitfields are always classified as integer.  Handle them
5173 		     early, since later code would consider them to be
5174 		     misaligned integers.  */
5175 		  if (DECL_BIT_FIELD (field))
5176 		    {
5177 		      for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5178 			   i < ((int_bit_position (field) + (bit_offset % 64))
5179 			        + tree_low_cst (DECL_SIZE (field), 0)
5180 				+ 63) / 8 / 8; i++)
5181 			classes[i] =
5182 			  merge_classes (X86_64_INTEGER_CLASS,
5183 					 classes[i]);
5184 		    }
5185 		  else
5186 		    {
5187 		      int pos;
5188 
5189 		      type = TREE_TYPE (field);
5190 
5191 		      /* Flexible array member is ignored.  */
5192 		      if (TYPE_MODE (type) == BLKmode
5193 			  && TREE_CODE (type) == ARRAY_TYPE
5194 			  && TYPE_SIZE (type) == NULL_TREE
5195 			  && TYPE_DOMAIN (type) != NULL_TREE
5196 			  && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5197 			      == NULL_TREE))
5198 			{
5199 			  static bool warned;
5200 
5201 			  if (!warned && warn_psabi)
5202 			    {
5203 			      warned = true;
5204 			      inform (input_location,
5205 				      "The ABI of passing struct with"
5206 				      " a flexible array member has"
5207 				      " changed in GCC 4.4");
5208 			    }
5209 			  continue;
5210 			}
5211 		      num = classify_argument (TYPE_MODE (type), type,
5212 					       subclasses,
5213 					       (int_bit_position (field)
5214 						+ bit_offset) % 256);
5215 		      if (!num)
5216 			return 0;
5217 		      pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5218 		      for (i = 0; i < num && (i + pos) < words; i++)
5219 			classes[i + pos] =
5220 			  merge_classes (subclasses[i], classes[i + pos]);
5221 		    }
5222 		}
5223 	    }
5224 	  break;
5225 
5226 	case ARRAY_TYPE:
5227 	  /* Arrays are handled as small records.  */
5228 	  {
5229 	    int num;
5230 	    num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5231 				     TREE_TYPE (type), subclasses, bit_offset);
5232 	    if (!num)
5233 	      return 0;
5234 
5235 	    /* The partial classes are now full classes.  */
5236 	    if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5237 	      subclasses[0] = X86_64_SSE_CLASS;
5238 	    if (subclasses[0] == X86_64_INTEGERSI_CLASS
5239 		&& !((bit_offset % 64) == 0 && bytes == 4))
5240 	      subclasses[0] = X86_64_INTEGER_CLASS;
5241 
5242 	    for (i = 0; i < words; i++)
5243 	      classes[i] = subclasses[i % num];
5244 
5245 	    break;
5246 	  }
5247 	case UNION_TYPE:
5248 	case QUAL_UNION_TYPE:
5249 	  /* Unions are similar to RECORD_TYPE but the offset is
5250 	     always 0.  */
5251 	  for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
5252 	    {
5253 	      if (TREE_CODE (field) == FIELD_DECL)
5254 		{
5255 		  int num;
5256 
5257 		  if (TREE_TYPE (field) == error_mark_node)
5258 		    continue;
5259 
5260 		  num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5261 					   TREE_TYPE (field), subclasses,
5262 					   bit_offset);
5263 		  if (!num)
5264 		    return 0;
5265 		  for (i = 0; i < num; i++)
5266 		    classes[i] = merge_classes (subclasses[i], classes[i]);
5267 		}
5268 	    }
5269 	  break;
5270 
5271 	default:
5272 	  gcc_unreachable ();
5273 	}
5274 
5275       if (words > 2)
5276 	  /* When the size is greater than 16 bytes, if the first class
5277 	     isn't X86_64_SSE_CLASS or any of the others aren't
5278 	     X86_64_SSEUP_CLASS, everything should be passed in
5279 	     memory.  */
5280 	     memory.  */
5281 	  if (classes[0] != X86_64_SSE_CLASS)
5282 	      return 0;
5283 
5284 	  for (i = 1; i < words; i++)
5285 	    if (classes[i] != X86_64_SSEUP_CLASS)
5286 	      return 0;
5287 	}
5288 
5289       /* Final merger cleanup.  */
5290       for (i = 0; i < words; i++)
5291 	{
5292 	  /* If one class is MEMORY, everything should be passed in
5293 	     memory.  */
5294 	  if (classes[i] == X86_64_MEMORY_CLASS)
5295 	    return 0;
5296 
5297 	  /* The X86_64_SSEUP_CLASS should always be preceded by
5298 	     X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
5299 	  if (classes[i] == X86_64_SSEUP_CLASS
5300 	      && classes[i - 1] != X86_64_SSE_CLASS
5301 	      && classes[i - 1] != X86_64_SSEUP_CLASS)
5302 	    {
5303 	      /* The first one should never be X86_64_SSEUP_CLASS.  */
5304 	      gcc_assert (i != 0);
5305 	      classes[i] = X86_64_SSE_CLASS;
5306 	    }
5307 
5308 	  /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
5309 	     everything should be passed in memory.  */
5310 	  if (classes[i] == X86_64_X87UP_CLASS
5311 	      && (classes[i - 1] != X86_64_X87_CLASS))
5312 	    {
5313 	      static bool warned;
5314 
5315 	      /* The first one should never be X86_64_X87UP_CLASS.  */
5316 	      gcc_assert (i != 0);
5317 	      if (!warned && warn_psabi)
5318 		{
5319 		  warned = true;
5320 		  inform (input_location,
5321 			  "The ABI of passing union with long double"
5322 			  " has changed in GCC 4.4");
5323 		}
5324 	      return 0;
5325 	    }
5326 	}
5327       return words;
5328     }
5329 
5330   /* Compute the alignment needed.  We align all types to natural
5331      boundaries, with the exception of XFmode, which is aligned to 64 bits.  */
5332   if (mode != VOIDmode && mode != BLKmode)
5333     {
5334       int mode_alignment = GET_MODE_BITSIZE (mode);
5335 
5336       if (mode == XFmode)
5337 	mode_alignment = 128;
5338       else if (mode == XCmode)
5339 	mode_alignment = 256;
5340       if (COMPLEX_MODE_P (mode))
5341 	mode_alignment /= 2;
5342       /* Misaligned fields are always returned in memory.  */
5343       if (bit_offset % mode_alignment)
5344 	return 0;
5345     }
5346 
5347   /* For V1xx modes, just use the base mode.  */
5348   if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
5349       && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
5350     mode = GET_MODE_INNER (mode);
5351 
5352   /* Classification of atomic types.  */
5353   switch (mode)
5354     {
5355     case SDmode:
5356     case DDmode:
5357       classes[0] = X86_64_SSE_CLASS;
5358       return 1;
5359     case TDmode:
5360       classes[0] = X86_64_SSE_CLASS;
5361       classes[1] = X86_64_SSEUP_CLASS;
5362       return 2;
5363     case DImode:
5364     case SImode:
5365     case HImode:
5366     case QImode:
5367     case CSImode:
5368     case CHImode:
5369     case CQImode:
5370       {
5371 	int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
5372 
5373 	if (size <= 32)
5374 	  {
5375 	    classes[0] = X86_64_INTEGERSI_CLASS;
5376 	    return 1;
5377 	  }
5378 	else if (size <= 64)
5379 	  {
5380 	    classes[0] = X86_64_INTEGER_CLASS;
5381 	    return 1;
5382 	  }
5383 	else if (size <= 64+32)
5384 	  {
5385 	    classes[0] = X86_64_INTEGER_CLASS;
5386 	    classes[1] = X86_64_INTEGERSI_CLASS;
5387 	    return 2;
5388 	  }
5389 	else if (size <= 64+64)
5390 	  {
5391 	    classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5392 	    return 2;
5393 	  }
5394 	else
5395 	  gcc_unreachable ();
5396       }
5397     case CDImode:
5398     case TImode:
5399       classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5400       return 2;
5401     case COImode:
5402     case OImode:
5403       /* OImode shouldn't be used directly.  */
5404       gcc_unreachable ();
5405     case CTImode:
5406       return 0;
5407     case SFmode:
5408       if (!(bit_offset % 64))
5409 	classes[0] = X86_64_SSESF_CLASS;
5410       else
5411 	classes[0] = X86_64_SSE_CLASS;
5412       return 1;
5413     case DFmode:
5414       classes[0] = X86_64_SSEDF_CLASS;
5415       return 1;
5416     case XFmode:
5417       classes[0] = X86_64_X87_CLASS;
5418       classes[1] = X86_64_X87UP_CLASS;
5419       return 2;
5420     case TFmode:
5421       classes[0] = X86_64_SSE_CLASS;
5422       classes[1] = X86_64_SSEUP_CLASS;
5423       return 2;
5424     case SCmode:
5425       classes[0] = X86_64_SSE_CLASS;
5426       if (!(bit_offset % 64))
5427 	return 1;
5428       else
5429 	{
5430 	  static bool warned;
5431 
5432 	  if (!warned && warn_psabi)
5433 	    {
5434 	      warned = true;
5435 	      inform (input_location,
5436 		      "The ABI of passing structure with complex float"
5437 		      " member has changed in GCC 4.4");
5438 	    }
5439 	  classes[1] = X86_64_SSESF_CLASS;
5440 	  return 2;
5441 	}
5442     case DCmode:
5443       classes[0] = X86_64_SSEDF_CLASS;
5444       classes[1] = X86_64_SSEDF_CLASS;
5445       return 2;
5446     case XCmode:
5447       classes[0] = X86_64_COMPLEX_X87_CLASS;
5448       return 1;
5449     case TCmode:
5450       /* This mode is larger than 16 bytes.  */
5451       return 0;
5452     case V8SFmode:
5453     case V8SImode:
5454     case V32QImode:
5455     case V16HImode:
5456     case V4DFmode:
5457     case V4DImode:
5458       classes[0] = X86_64_SSE_CLASS;
5459       classes[1] = X86_64_SSEUP_CLASS;
5460       classes[2] = X86_64_SSEUP_CLASS;
5461       classes[3] = X86_64_SSEUP_CLASS;
5462       return 4;
5463     case V4SFmode:
5464     case V4SImode:
5465     case V16QImode:
5466     case V8HImode:
5467     case V2DFmode:
5468     case V2DImode:
5469       classes[0] = X86_64_SSE_CLASS;
5470       classes[1] = X86_64_SSEUP_CLASS;
5471       return 2;
5472     case V1TImode:
5473     case V1DImode:
5474     case V2SFmode:
5475     case V2SImode:
5476     case V4HImode:
5477     case V8QImode:
5478       classes[0] = X86_64_SSE_CLASS;
5479       return 1;
5480     case BLKmode:
5481     case VOIDmode:
5482       return 0;
5483     default:
5484       gcc_assert (VECTOR_MODE_P (mode));
5485 
5486       if (bytes > 16)
5487 	return 0;
5488 
5489       gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
5490 
5491       if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
5492 	classes[0] = X86_64_INTEGERSI_CLASS;
5493       else
5494 	classes[0] = X86_64_INTEGER_CLASS;
5495       classes[1] = X86_64_INTEGER_CLASS;
5496       return 1 + (bytes > 8);
5497     }
5498 }
5499 
5500 /* Examine the argument and set the number of registers required in each
5501    class.  Return 0 iff the parameter should be passed in memory.  */
5502 static int
5503 examine_argument (enum machine_mode mode, const_tree type, int in_return,
5504 		  int *int_nregs, int *sse_nregs)
5505 {
5506   enum x86_64_reg_class regclass[MAX_CLASSES];
5507   int n = classify_argument (mode, type, regclass, 0);
5508 
5509   *int_nregs = 0;
5510   *sse_nregs = 0;
5511   if (!n)
5512     return 0;
5513   for (n--; n >= 0; n--)
5514     switch (regclass[n])
5515       {
5516       case X86_64_INTEGER_CLASS:
5517       case X86_64_INTEGERSI_CLASS:
5518 	(*int_nregs)++;
5519 	break;
5520       case X86_64_SSE_CLASS:
5521       case X86_64_SSESF_CLASS:
5522       case X86_64_SSEDF_CLASS:
5523 	(*sse_nregs)++;
5524 	break;
5525       case X86_64_NO_CLASS:
5526       case X86_64_SSEUP_CLASS:
5527 	break;
5528       case X86_64_X87_CLASS:
5529       case X86_64_X87UP_CLASS:
5530 	if (!in_return)
5531 	  return 0;
5532 	break;
5533       case X86_64_COMPLEX_X87_CLASS:
5534 	return in_return ? 2 : 0;
5535       case X86_64_MEMORY_CLASS:
5536 	gcc_unreachable ();
5537       }
5538   return 1;
5539 }
5540 
5541 /* Construct container for the argument used by GCC interface.  See
5542    FUNCTION_ARG for the detailed description.  */
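     /* Illustrative only: for the 16-byte struct { long l; double d; } case,
        assuming %rdi and %xmm0 are the first free registers, this builds
        roughly

          (parallel [(expr_list (reg:DI di) (const_int 0))
                     (expr_list (reg:DF xmm0) (const_int 8))])

        describing where each eightbyte of the argument lives.  */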
5543 
5544 static rtx
5545 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
5546 		     const_tree type, int in_return, int nintregs, int nsseregs,
5547 		     const int *intreg, int sse_regno)
5548 {
5549   /* The following variables hold the static issued_error state.  */
5550   static bool issued_sse_arg_error;
5551   static bool issued_sse_ret_error;
5552   static bool issued_x87_ret_error;
5553 
5554   enum machine_mode tmpmode;
5555   int bytes =
5556     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5557   enum x86_64_reg_class regclass[MAX_CLASSES];
5558   int n;
5559   int i;
5560   int nexps = 0;
5561   int needed_sseregs, needed_intregs;
5562   rtx exp[MAX_CLASSES];
5563   rtx ret;
5564 
5565   n = classify_argument (mode, type, regclass, 0);
5566   if (!n)
5567     return NULL;
5568   if (!examine_argument (mode, type, in_return, &needed_intregs,
5569 			 &needed_sseregs))
5570     return NULL;
5571   if (needed_intregs > nintregs || needed_sseregs > nsseregs)
5572     return NULL;
5573 
5574   /* We allowed the user to turn off SSE for kernel mode.  Don't crash if
5575      some less clueful developer tries to use floating-point anyway.  */
5576   if (needed_sseregs && !TARGET_SSE)
5577     {
5578       if (in_return)
5579 	{
5580 	  if (!issued_sse_ret_error)
5581 	    {
5582 	      error ("SSE register return with SSE disabled");
5583 	      issued_sse_ret_error = true;
5584 	    }
5585 	}
5586       else if (!issued_sse_arg_error)
5587 	{
5588 	  error ("SSE register argument with SSE disabled");
5589 	  issued_sse_arg_error = true;
5590 	}
5591       return NULL;
5592     }
5593 
5594   /* Likewise, error if the ABI requires us to return values in the
5595      x87 registers and the user specified -mno-80387.  */
5596   if (!TARGET_80387 && in_return)
5597     for (i = 0; i < n; i++)
5598       if (regclass[i] == X86_64_X87_CLASS
5599 	  || regclass[i] == X86_64_X87UP_CLASS
5600 	  || regclass[i] == X86_64_COMPLEX_X87_CLASS)
5601 	{
5602 	  if (!issued_x87_ret_error)
5603 	    {
5604 	      error ("x87 register return with x87 disabled");
5605 	      issued_x87_ret_error = true;
5606 	    }
5607 	  return NULL;
5608 	}
5609 
5610   /* First construct simple cases.  Avoid SCmode, since we want to use
5611      a single register to pass this type.  */
5612   if (n == 1 && mode != SCmode)
5613     switch (regclass[0])
5614       {
5615       case X86_64_INTEGER_CLASS:
5616       case X86_64_INTEGERSI_CLASS:
5617 	return gen_rtx_REG (mode, intreg[0]);
5618       case X86_64_SSE_CLASS:
5619       case X86_64_SSESF_CLASS:
5620       case X86_64_SSEDF_CLASS:
5621 	if (mode != BLKmode)
5622 	  return gen_reg_or_parallel (mode, orig_mode,
5623 				      SSE_REGNO (sse_regno));
5624 	break;
5625       case X86_64_X87_CLASS:
5626       case X86_64_COMPLEX_X87_CLASS:
5627 	return gen_rtx_REG (mode, FIRST_STACK_REG);
5628       case X86_64_NO_CLASS:
5629 	/* Zero sized array, struct or class.  */
5630 	return NULL;
5631       default:
5632 	gcc_unreachable ();
5633       }
5634   if (n == 2 && regclass[0] == X86_64_SSE_CLASS
5635       && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
5636     return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
5637   if (n == 4
5638       && regclass[0] == X86_64_SSE_CLASS
5639       && regclass[1] == X86_64_SSEUP_CLASS
5640       && regclass[2] == X86_64_SSEUP_CLASS
5641       && regclass[3] == X86_64_SSEUP_CLASS
5642       && mode != BLKmode)
5643     return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
5644 
5645   if (n == 2
5646       && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
5647     return gen_rtx_REG (XFmode, FIRST_STACK_REG);
5648   if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
5649       && regclass[1] == X86_64_INTEGER_CLASS
5650       && (mode == CDImode || mode == TImode || mode == TFmode)
5651       && intreg[0] + 1 == intreg[1])
5652     return gen_rtx_REG (mode, intreg[0]);
5653 
5654   /* Otherwise figure out the entries of the PARALLEL.  */
5655   for (i = 0; i < n; i++)
5656     {
5657       int pos;
5658 
5659       switch (regclass[i])
5660         {
5661 	  case X86_64_NO_CLASS:
5662 	    break;
5663 	  case X86_64_INTEGER_CLASS:
5664 	  case X86_64_INTEGERSI_CLASS:
5665 	    /* Merge TImodes on aligned occasions here too.  */
5666 	    if (i * 8 + 8 > bytes)
5667 	      tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
5668 	    else if (regclass[i] == X86_64_INTEGERSI_CLASS)
5669 	      tmpmode = SImode;
5670 	    else
5671 	      tmpmode = DImode;
5672 	    /* We've requested 24 bytes we don't have a mode for.  Use DImode.  */
5673 	    if (tmpmode == BLKmode)
5674 	      tmpmode = DImode;
5675 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
5676 					       gen_rtx_REG (tmpmode, *intreg),
5677 					       GEN_INT (i*8));
5678 	    intreg++;
5679 	    break;
5680 	  case X86_64_SSESF_CLASS:
5681 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
5682 					       gen_rtx_REG (SFmode,
5683 							    SSE_REGNO (sse_regno)),
5684 					       GEN_INT (i*8));
5685 	    sse_regno++;
5686 	    break;
5687 	  case X86_64_SSEDF_CLASS:
5688 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
5689 					       gen_rtx_REG (DFmode,
5690 							    SSE_REGNO (sse_regno)),
5691 					       GEN_INT (i*8));
5692 	    sse_regno++;
5693 	    break;
5694 	  case X86_64_SSE_CLASS:
5695 	    pos = i;
5696 	    switch (n)
5697 	      {
5698 	      case 1:
5699 		tmpmode = DImode;
5700 		break;
5701 	      case 2:
5702 		if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
5703 		  {
5704 		    tmpmode = TImode;
5705 		    i++;
5706 		  }
5707 		else
5708 		  tmpmode = DImode;
5709 		break;
5710 	      case 4:
5711 		gcc_assert (i == 0
5712 			    && regclass[1] == X86_64_SSEUP_CLASS
5713 			    && regclass[2] == X86_64_SSEUP_CLASS
5714 			    && regclass[3] == X86_64_SSEUP_CLASS);
5715 		tmpmode = OImode;
5716 		i += 3;
5717 		break;
5718 	      default:
5719 		gcc_unreachable ();
5720 	      }
5721 	    exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
5722 					       gen_rtx_REG (tmpmode,
5723 							    SSE_REGNO (sse_regno)),
5724 					       GEN_INT (pos*8));
5725 	    sse_regno++;
5726 	    break;
5727 	  default:
5728 	    gcc_unreachable ();
5729 	}
5730     }
5731 
5732   /* Empty aligned struct, union or class.  */
5733   if (nexps == 0)
5734     return NULL;
5735 
5736   ret =  gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
5737   for (i = 0; i < nexps; i++)
5738     XVECEXP (ret, 0, i) = exp [i];
5739   return ret;
5740 }
5741 
5742 /* Update the data in CUM to advance over an argument of mode MODE
5743    and data type TYPE.  (TYPE is null for libcalls where that information
5744    may not be available.)  */
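     /* Illustrative only: on ia32 with regparm in effect, an SImode argument
        occupies one word, so cum->nregs drops by one and cum->regno advances
        by one; once the register count reaches zero, subsequent arguments go
        on the stack.  */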
5745 
5746 static void
5747 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
5748 			 tree type, HOST_WIDE_INT bytes, HOST_WIDE_INT words)
5749 {
5750   switch (mode)
5751     {
5752     default:
5753       break;
5754 
5755     case BLKmode:
5756       if (bytes < 0)
5757 	break;
5758       /* FALLTHRU */
5759 
5760     case DImode:
5761     case SImode:
5762     case HImode:
5763     case QImode:
5764       cum->words += words;
5765       cum->nregs -= words;
5766       cum->regno += words;
5767 
5768       if (cum->nregs <= 0)
5769 	{
5770 	  cum->nregs = 0;
5771 	  cum->regno = 0;
5772 	}
5773       break;
5774 
5775     case OImode:
5776       /* OImode shouldn't be used directly.  */
5777       gcc_unreachable ();
5778 
5779     case DFmode:
5780       if (cum->float_in_sse < 2)
5781 	break;
5782     case SFmode:
5783       if (cum->float_in_sse < 1)
5784 	break;
5785       /* FALLTHRU */
5786 
5787     case V8SFmode:
5788     case V8SImode:
5789     case V32QImode:
5790     case V16HImode:
5791     case V4DFmode:
5792     case V4DImode:
5793     case TImode:
5794     case V16QImode:
5795     case V8HImode:
5796     case V4SImode:
5797     case V2DImode:
5798     case V4SFmode:
5799     case V2DFmode:
5800       if (!type || !AGGREGATE_TYPE_P (type))
5801 	{
5802 	  cum->sse_words += words;
5803 	  cum->sse_nregs -= 1;
5804 	  cum->sse_regno += 1;
5805 	  if (cum->sse_nregs <= 0)
5806 	    {
5807 	      cum->sse_nregs = 0;
5808 	      cum->sse_regno = 0;
5809 	    }
5810 	}
5811       break;
5812 
5813     case V8QImode:
5814     case V4HImode:
5815     case V2SImode:
5816     case V2SFmode:
5817     case V1TImode:
5818     case V1DImode:
5819       if (!type || !AGGREGATE_TYPE_P (type))
5820 	{
5821 	  cum->mmx_words += words;
5822 	  cum->mmx_nregs -= 1;
5823 	  cum->mmx_regno += 1;
5824 	  if (cum->mmx_nregs <= 0)
5825 	    {
5826 	      cum->mmx_nregs = 0;
5827 	      cum->mmx_regno = 0;
5828 	    }
5829 	}
5830       break;
5831     }
5832 }
5833 
5834 static void
5835 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
5836 			 tree type, HOST_WIDE_INT words, int named)
5837 {
5838   int int_nregs, sse_nregs;
5839 
5840   /* Unnamed 256bit vector mode parameters are passed on the stack.  */
5841   if (!named && VALID_AVX256_REG_MODE (mode))
5842     return;
5843 
5844   if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
5845       && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
5846     {
5847       cum->nregs -= int_nregs;
5848       cum->sse_nregs -= sse_nregs;
5849       cum->regno += int_nregs;
5850       cum->sse_regno += sse_nregs;
5851     }
5852   else
5853     {
5854       int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
5855       cum->words = (cum->words + align - 1) & ~(align - 1);
5856       cum->words += words;
5857     }
5858 }
5859 
5860 static void
5861 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
5862 			    HOST_WIDE_INT words)
5863 {
5864   /* Otherwise, this should be passed indirect.  */
5865   gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
5866 
5867   cum->words += words;
5868   if (cum->nregs > 0)
5869     {
5870       cum->nregs -= 1;
5871       cum->regno += 1;
5872     }
5873 }
5874 
5875 void
5876 function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode,
5877 		      tree type, int named)
5878 {
5879   HOST_WIDE_INT bytes, words;
5880 
5881   if (mode == BLKmode)
5882     bytes = int_size_in_bytes (type);
5883   else
5884     bytes = GET_MODE_SIZE (mode);
5885   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5886 
5887   if (type)
5888     mode = type_natural_mode (type, NULL);
5889 
5890   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
5891     function_arg_advance_ms_64 (cum, bytes, words);
5892   else if (TARGET_64BIT)
5893     function_arg_advance_64 (cum, mode, type, words, named);
5894   else
5895     function_arg_advance_32 (cum, mode, type, bytes, words);
5896 }
5897 
5898 /* Define where to put the arguments to a function.
5899    Value is zero to push the argument on the stack,
5900    or a hard register in which to store the argument.
5901 
5902    MODE is the argument's machine mode.
5903    TYPE is the data type of the argument (as a tree).
5904     This is null for libcalls where that information may
5905     not be available.
5906    CUM is a variable of type CUMULATIVE_ARGS which gives info about
5907     the preceding args and about the function being called.
5908    NAMED is nonzero if this argument is a named parameter
5909     (otherwise it is an extra parameter matching an ellipsis).  */
5910 
5911 static rtx
5912 function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
5913 		 enum machine_mode orig_mode, tree type,
5914 		 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
5915 {
5916   static bool warnedsse, warnedmmx;
5917 
5918   /* Avoid the AL settings for the Unix64 ABI.  */
5919   if (mode == VOIDmode)
5920     return constm1_rtx;
5921 
5922   switch (mode)
5923     {
5924     default:
5925       break;
5926 
5927     case BLKmode:
5928       if (bytes < 0)
5929 	break;
5930       /* FALLTHRU */
5931     case DImode:
5932     case SImode:
5933     case HImode:
5934     case QImode:
5935       if (words <= cum->nregs)
5936 	{
5937 	  int regno = cum->regno;
5938 
5939 	  /* Fastcall allocates the first two DWORD (SImode) or
5940             smaller arguments to ECX and EDX if the argument isn't
5941             an aggregate type.  */
5942 	  if (cum->fastcall)
5943 	    {
5944 	      if (mode == BLKmode
5945 		  || mode == DImode
5946 		  || (type && AGGREGATE_TYPE_P (type)))
5947 	        break;
5948 
5949 	      /* ECX, not EAX, is the first allocated register.  */
5950 	      if (regno == AX_REG)
5951 		regno = CX_REG;
5952 	    }
5953 	  return gen_rtx_REG (mode, regno);
5954 	}
5955       break;
5956 
5957     case DFmode:
5958       if (cum->float_in_sse < 2)
5959 	break;
5960     case SFmode:
5961       if (cum->float_in_sse < 1)
5962 	break;
5963       /* FALLTHRU */
5964     case TImode:
5965       /* In 32bit, we pass TImode in xmm registers.  */
5966     case V16QImode:
5967     case V8HImode:
5968     case V4SImode:
5969     case V2DImode:
5970     case V4SFmode:
5971     case V2DFmode:
5972       if (!type || !AGGREGATE_TYPE_P (type))
5973 	{
5974 	  if (!TARGET_SSE && !warnedsse && cum->warn_sse)
5975 	    {
5976 	      warnedsse = true;
5977 	      warning (0, "SSE vector argument without SSE enabled "
5978 		       "changes the ABI");
5979 	    }
5980 	  if (cum->sse_nregs)
5981 	    return gen_reg_or_parallel (mode, orig_mode,
5982 				        cum->sse_regno + FIRST_SSE_REG);
5983 	}
5984       break;
5985 
5986     case OImode:
5987       /* OImode shouldn't be used directly.  */
5988       gcc_unreachable ();
5989 
5990     case V8SFmode:
5991     case V8SImode:
5992     case V32QImode:
5993     case V16HImode:
5994     case V4DFmode:
5995     case V4DImode:
5996       if (!type || !AGGREGATE_TYPE_P (type))
5997 	{
5998 	  if (cum->sse_nregs)
5999 	    return gen_reg_or_parallel (mode, orig_mode,
6000 				        cum->sse_regno + FIRST_SSE_REG);
6001 	}
6002       break;
6003 
6004     case V8QImode:
6005     case V4HImode:
6006     case V2SImode:
6007     case V2SFmode:
6008     case V1TImode:
6009     case V1DImode:
6010       if (!type || !AGGREGATE_TYPE_P (type))
6011 	{
6012 	  if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6013 	    {
6014 	      warnedmmx = true;
6015 	      warning (0, "MMX vector argument without MMX enabled "
6016 		       "changes the ABI");
6017 	    }
6018 	  if (cum->mmx_nregs)
6019 	    return gen_reg_or_parallel (mode, orig_mode,
6020 				        cum->mmx_regno + FIRST_MMX_REG);
6021 	}
6022       break;
6023     }
6024 
6025   return NULL_RTX;
6026 }
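
/* Illustrative note (editorial, not part of the implementation): for the
   fastcall handling above, a hypothetical declaration such as

     int __attribute__((fastcall)) f (int a, int b, int c);

   is expected to receive A in %ecx, B in %edx and C on the stack, while a
   DImode, BLKmode or aggregate argument falls back to the stack, as the
   checks in function_arg_32 require.  The declaration is only a sketch of
   the intent, not something the compiler consumes.  */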
6027 
6028 static rtx
6029 function_arg_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6030 		 enum machine_mode orig_mode, tree type, int named)
6031 {
6032   /* Handle a hidden AL argument containing number of registers
6033      for varargs x86-64 functions.  */
6034   if (mode == VOIDmode)
6035     return GEN_INT (cum->maybe_vaarg
6036 		    ? (cum->sse_nregs < 0
6037 		       ? X86_64_SSE_REGPARM_MAX
6038 		       : cum->sse_regno)
6039 		    : -1);
6040 
6041   switch (mode)
6042     {
6043     default:
6044       break;
6045 
6046     case V8SFmode:
6047     case V8SImode:
6048     case V32QImode:
6049     case V16HImode:
6050     case V4DFmode:
6051     case V4DImode:
6052       /* Unnamed 256bit vector mode parameters are passed on the stack.  */
6053       if (!named)
6054 	return NULL;
6055       break;
6056     }
6057 
6058   return construct_container (mode, orig_mode, type, 0, cum->nregs,
6059 			      cum->sse_nregs,
6060 			      &x86_64_int_parameter_registers [cum->regno],
6061 			      cum->sse_regno);
6062 }
6063 
6064 static rtx
6065 function_arg_ms_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6066 		    enum machine_mode orig_mode, int named,
6067 		    HOST_WIDE_INT bytes)
6068 {
6069   unsigned int regno;
6070 
6071   /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
6072      We use the value -2 to specify that the current function call is MS ABI.  */
6073   if (mode == VOIDmode)
6074     return GEN_INT (-2);
6075 
6076   /* If we've run out of registers, it goes on the stack.  */
6077   if (cum->nregs == 0)
6078     return NULL_RTX;
6079 
6080   regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6081 
6082   /* Only floating point modes are passed in anything but integer regs.  */
6083   if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6084     {
6085       if (named)
6086 	regno = cum->regno + FIRST_SSE_REG;
6087       else
6088 	{
6089 	  rtx t1, t2;
6090 
6091 	  /* Unnamed floating parameters are passed in both the
6092 	     SSE and integer registers.  */
6093 	  t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6094 	  t2 = gen_rtx_REG (mode, regno);
6095 	  t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6096 	  t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6097 	  return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6098 	}
6099     }
6100   /* Handle aggregated types passed in register.  */
6101   if (orig_mode == BLKmode)
6102     {
6103       if (bytes > 0 && bytes <= 8)
6104         mode = (bytes > 4 ? DImode : SImode);
6105       if (mode == BLKmode)
6106         mode = DImode;
6107     }
6108 
6109   return gen_reg_or_parallel (mode, orig_mode, regno);
6110 }
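
/* Illustrative note (editorial): under the Microsoft x64 convention handled
   by function_arg_ms_64, a hypothetical variadic call such as

     printf ("%f", 3.14);

   passes the double both in the SSE register and in the integer register of
   its argument slot, which is what the PARALLEL built above for unnamed
   floating arguments expresses; named SFmode/DFmode arguments use only the
   SSE register.  This sketch assumes SSE is enabled (TARGET_SSE).  */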
6111 
6112 rtx
6113 function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
6114 	      tree type, int named)
6115 {
6116   enum machine_mode mode = omode;
6117   HOST_WIDE_INT bytes, words;
6118 
6119   if (mode == BLKmode)
6120     bytes = int_size_in_bytes (type);
6121   else
6122     bytes = GET_MODE_SIZE (mode);
6123   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6124 
6125   /* To simplify the code below, represent vector types with a vector mode
6126      even if MMX/SSE are not active.  */
6127   if (type && TREE_CODE (type) == VECTOR_TYPE)
6128     mode = type_natural_mode (type, cum);
6129 
6130   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6131     return function_arg_ms_64 (cum, mode, omode, named, bytes);
6132   else if (TARGET_64BIT)
6133     return function_arg_64 (cum, mode, omode, type, named);
6134   else
6135     return function_arg_32 (cum, mode, omode, type, bytes, words);
6136 }
6137 
6138 /* A C expression that indicates when an argument must be passed by
6139    reference.  If nonzero for an argument, a copy of that argument is
6140    made in memory and a pointer to the argument is passed instead of
6141    the argument itself.  The pointer is passed in whatever way is
6142    appropriate for passing a pointer to that type.  */
6143 
6144 static bool
6145 ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
6146 			enum machine_mode mode ATTRIBUTE_UNUSED,
6147 			const_tree type, bool named ATTRIBUTE_UNUSED)
6148 {
6149   /* See Windows x64 Software Convention.  */
6150   if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6151     {
6152       int msize = (int) GET_MODE_SIZE (mode);
6153       if (type)
6154 	{
6155 	  /* Arrays are passed by reference.  */
6156 	  if (TREE_CODE (type) == ARRAY_TYPE)
6157 	    return true;
6158 
6159 	  if (AGGREGATE_TYPE_P (type))
6160 	    {
6161 	      /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6162 	         are passed by reference.  */
6163 	      msize = int_size_in_bytes (type);
6164 	    }
6165 	}
6166 
6167       /* __m128 is passed by reference.  */
6168       switch (msize) {
6169       case 1: case 2: case 4: case 8:
6170         break;
6171       default:
6172         return true;
6173       }
6174     }
6175   else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6176     return 1;
6177 
6178   return 0;
6179 }
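
/* Illustrative note (editorial): with the Windows x64 rules applied above,
   hypothetical types such as

     struct by_val { void *p; };      /* 8 bytes: passed by value     */
     struct by_ref { char c[3]; };    /* 3 bytes: passed by reference */

   are passed by value only when their size is exactly 1, 2, 4 or 8 bytes;
   everything else, including __m128 and all arrays, is passed via a pointer
   to a caller-made copy.  This merely restates the size switch in
   ix86_pass_by_reference with concrete examples.  */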
6180 
6181 /* Return true when TYPE should be 128bit aligned for 32bit argument passing
6182    ABI.  */
6183 static bool
6184 contains_aligned_value_p (const_tree type)
6185 {
6186   enum machine_mode mode = TYPE_MODE (type);
6187   if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6188        || mode == TDmode
6189        || mode == TFmode
6190        || mode == TCmode)
6191       && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6192     return true;
6193   if (TYPE_ALIGN (type) < 128)
6194     return false;
6195 
6196   if (AGGREGATE_TYPE_P (type))
6197     {
6198       /* Walk the aggregates recursively.  */
6199       switch (TREE_CODE (type))
6200 	{
6201 	case RECORD_TYPE:
6202 	case UNION_TYPE:
6203 	case QUAL_UNION_TYPE:
6204 	  {
6205 	    tree field;
6206 
6207 	    /* Walk all the structure fields.  */
6208 	    for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
6209 	      {
6210 		if (TREE_CODE (field) == FIELD_DECL
6211 		    && contains_aligned_value_p (TREE_TYPE (field)))
6212 		  return true;
6213 	      }
6214 	    break;
6215 	  }
6216 
6217 	case ARRAY_TYPE:
6218 	  /* Just in case some languages pass arrays by value.  */
6219 	  if (contains_aligned_value_p (TREE_TYPE (type)))
6220 	    return true;
6221 	  break;
6222 
6223 	default:
6224 	  gcc_unreachable ();
6225 	}
6226     }
6227   return false;
6228 }
6229 
6230 /* Gives the alignment boundary, in bits, of an argument with the
6231    specified mode and type.  */
6232 
6233 int
6234 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
6235 {
6236   int align;
6237   if (type)
6238     {
6239       /* Since the canonical type is used for the call, convert to the
6240 	 canonical type if needed.  */
6241       if (!TYPE_STRUCTURAL_EQUALITY_P (type))
6242 	type = TYPE_CANONICAL (type);
6243       align = TYPE_ALIGN (type);
6244     }
6245   else
6246     align = GET_MODE_ALIGNMENT (mode);
6247   if (align < PARM_BOUNDARY)
6248     align = PARM_BOUNDARY;
6249   /* In 32bit, only _Decimal128 and __float128 are aligned to their
6250      natural boundaries.  */
6251   if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6252     {
6253       /* i386 ABI defines all arguments to be 4 byte aligned.  We have to
6254 	 make an exception for SSE modes since these require 128bit
6255 	 alignment.
6256 
6257 	 The handling here differs from field_alignment.  ICC aligns MMX
6258 	 arguments to 4 byte boundaries, while structure fields are aligned
6259 	 to 8 byte boundaries.  */
6260       if (!type)
6261 	{
6262 	  if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6263 	    align = PARM_BOUNDARY;
6264 	}
6265       else
6266 	{
6267 	  if (!contains_aligned_value_p (type))
6268 	    align = PARM_BOUNDARY;
6269 	}
6270     }
6271   if (align > BIGGEST_ALIGNMENT)
6272     align = BIGGEST_ALIGNMENT;
6273   return align;
6274 }
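
/* Illustrative note (editorial): with the 32-bit rules above, a plain int or
   double argument keeps the 4-byte PARM_BOUNDARY alignment, while an SSE
   vector argument of a hypothetical prototype such as

     void f (__m128 v);

   stays 16-byte aligned because contains_aligned_value_p recognizes its SSE
   mode (assuming SSE is enabled); _Decimal128 and __float128 likewise keep
   their natural 16-byte alignment.  These examples only restate the
   function above with concrete types.  */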
6275 
6276 /* Return true if N is a possible register number of function value.  */
6277 
6278 bool
6279 ix86_function_value_regno_p (int regno)
6280 {
6281   switch (regno)
6282     {
6283     case 0:
6284       return true;
6285 
6286     case FIRST_FLOAT_REG:
6287       /* TODO: The answer should depend on the current function's ABI,
6288        but builtins.c would need updating for that.  Therefore we use
6289        the default ABI.  */
6290       if (TARGET_64BIT && ix86_abi == MS_ABI)
6291 	return false;
6292       return TARGET_FLOAT_RETURNS_IN_80387;
6293 
6294     case FIRST_SSE_REG:
6295       return TARGET_SSE;
6296 
6297     case FIRST_MMX_REG:
6298       if (TARGET_MACHO || TARGET_64BIT)
6299 	return false;
6300       return TARGET_MMX;
6301     }
6302 
6303   return false;
6304 }
6305 
6306 /* Define how to find the value returned by a function.
6307    VALTYPE is the data type of the value (as a tree).
6308    If the precise function being called is known, FUNC is its FUNCTION_DECL;
6309    otherwise, FUNC is 0.  */
6310 
6311 static rtx
6312 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
6313 		   const_tree fntype, const_tree fn)
6314 {
6315   unsigned int regno;
6316 
6317   /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
6318      we normally prevent this case when mmx is not available.  However
6319      some ABIs may require the result to be returned like DImode.  */
6320   if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
6321     regno = TARGET_MMX ? FIRST_MMX_REG : 0;
6322 
6323   /* 16-byte vector modes in %xmm0.  See ix86_return_in_memory for where
6324      we prevent this case when sse is not available.  However some ABIs
6325      may require the result to be returned like integer TImode.  */
6326   else if (mode == TImode
6327 	   || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
6328     regno = TARGET_SSE ? FIRST_SSE_REG : 0;
6329 
6330   /* 32-byte vector modes in %ymm0.   */
6331   else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
6332     regno = TARGET_AVX ? FIRST_SSE_REG : 0;
6333 
6334   /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387).  */
6335   else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
6336     regno = FIRST_FLOAT_REG;
6337   else
6338     /* Most things go in %eax.  */
6339     regno = AX_REG;
6340 
6341   /* Override FP return register with %xmm0 for local functions when
6342      SSE math is enabled or for functions with sseregparm attribute.  */
6343   if ((fn || fntype) && (mode == SFmode || mode == DFmode))
6344     {
6345       int sse_level = ix86_function_sseregparm (fntype, fn, false);
6346       if ((sse_level >= 1 && mode == SFmode)
6347 	  || (sse_level == 2 && mode == DFmode))
6348 	regno = FIRST_SSE_REG;
6349     }
6350 
6351   /* OImode shouldn't be used directly.  */
6352   gcc_assert (mode != OImode);
6353 
6354   return gen_rtx_REG (orig_mode, regno);
6355 }
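
/* Illustrative note (editorial): for the 32-bit return conventions above, a
   hypothetical

     double f (void);

   normally returns its value in %st(0) when TARGET_FLOAT_RETURNS_IN_80387,
   while a local function compiled with SSE math, or one carrying the
   sseregparm attribute, may return it in %xmm0 instead, which is the
   override at the end of function_value_32.  */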
6356 
6357 static rtx
6358 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
6359 		   const_tree valtype)
6360 {
6361   rtx ret;
6362 
6363   /* Handle libcalls, which don't provide a type node.  */
6364   if (valtype == NULL)
6365     {
6366       switch (mode)
6367 	{
6368 	case SFmode:
6369 	case SCmode:
6370 	case DFmode:
6371 	case DCmode:
6372 	case TFmode:
6373 	case SDmode:
6374 	case DDmode:
6375 	case TDmode:
6376 	  return gen_rtx_REG (mode, FIRST_SSE_REG);
6377 	case XFmode:
6378 	case XCmode:
6379 	  return gen_rtx_REG (mode, FIRST_FLOAT_REG);
6380 	case TCmode:
6381 	  return NULL;
6382 	default:
6383 	  return gen_rtx_REG (mode, AX_REG);
6384 	}
6385     }
6386 
6387   ret = construct_container (mode, orig_mode, valtype, 1,
6388 			     X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
6389 			     x86_64_int_return_registers, 0);
6390 
6391   /* For zero-sized structures, construct_container returns NULL, but we
6392      need to keep the rest of the compiler happy by returning a meaningful value.  */
6393   if (!ret)
6394     ret = gen_rtx_REG (orig_mode, AX_REG);
6395 
6396   return ret;
6397 }
6398 
6399 static rtx
6400 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
6401 {
6402   unsigned int regno = AX_REG;
6403 
6404   if (TARGET_SSE)
6405     {
6406       switch (GET_MODE_SIZE (mode))
6407         {
6408         case 16:
6409           if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
6410 	     && !COMPLEX_MODE_P (mode))
6411 	    regno = FIRST_SSE_REG;
6412 	  break;
6413 	case 8:
6414 	case 4:
6415 	  if (mode == SFmode || mode == DFmode)
6416 	    regno = FIRST_SSE_REG;
6417 	  break;
6418 	default:
6419 	  break;
6420         }
6421     }
6422   return gen_rtx_REG (orig_mode, regno);
6423 }
6424 
6425 static rtx
6426 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
6427 		       enum machine_mode orig_mode, enum machine_mode mode)
6428 {
6429   const_tree fn, fntype;
6430 
6431   fn = NULL_TREE;
6432   if (fntype_or_decl && DECL_P (fntype_or_decl))
6433     fn = fntype_or_decl;
6434   fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
6435 
6436   if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
6437     return function_value_ms_64 (orig_mode, mode);
6438   else if (TARGET_64BIT)
6439     return function_value_64 (orig_mode, mode, valtype);
6440   else
6441     return function_value_32 (orig_mode, mode, fntype, fn);
6442 }
6443 
6444 static rtx
6445 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
6446 		     bool outgoing ATTRIBUTE_UNUSED)
6447 {
6448   enum machine_mode mode, orig_mode;
6449 
6450   orig_mode = TYPE_MODE (valtype);
6451   mode = type_natural_mode (valtype, NULL);
6452   return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
6453 }
6454 
6455 rtx
6456 ix86_libcall_value (enum machine_mode mode)
6457 {
6458   return ix86_function_value_1 (NULL, NULL, mode, mode);
6459 }
6460 
6461 /* Return true iff type is returned in memory.  */
6462 
6463 static int ATTRIBUTE_UNUSED
6464 return_in_memory_32 (const_tree type, enum machine_mode mode)
6465 {
6466   HOST_WIDE_INT size;
6467 
6468   if (mode == BLKmode)
6469     return 1;
6470 
6471   size = int_size_in_bytes (type);
6472 
6473   if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
6474     return 0;
6475 
6476   if (VECTOR_MODE_P (mode) || mode == TImode)
6477     {
6478       /* User-created vectors small enough to fit in EAX.  */
6479       if (size < 8)
6480 	return 0;
6481 
6482       /* MMX/3dNow values are returned in MM0,
6483 	 except when it doesn't exist.  */
6484       if (size == 8)
6485 	return (TARGET_MMX ? 0 : 1);
6486 
6487       /* SSE values are returned in XMM0, except when it doesn't exist.  */
6488       if (size == 16)
6489 	return (TARGET_SSE ? 0 : 1);
6490 
6491       /* AVX values are returned in YMM0, except when it doesn't exist.  */
6492       if (size == 32)
6493 	return TARGET_AVX ? 0 : 1;
6494     }
6495 
6496   if (mode == XFmode)
6497     return 0;
6498 
6499   if (size > 12)
6500     return 1;
6501 
6502   /* OImode shouldn't be used directly.  */
6503   gcc_assert (mode != OImode);
6504 
6505   return 0;
6506 }
6507 
6508 static int ATTRIBUTE_UNUSED
6509 return_in_memory_64 (const_tree type, enum machine_mode mode)
6510 {
6511   int needed_intregs, needed_sseregs;
6512   return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
6513 }
6514 
6515 static int ATTRIBUTE_UNUSED
6516 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
6517 {
6518   HOST_WIDE_INT size = int_size_in_bytes (type);
6519 
6520   /* __m128 is returned in xmm0.  */
6521   if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
6522       && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
6523     return 0;
6524 
6525   /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
6526   return (size != 1 && size != 2 && size != 4 && size != 8);
6527 }
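
/* Illustrative note (editorial): to make the size checks above concrete, a
   hypothetical

     struct pair { int a, b; };   /* 8 bytes */

   is returned in a register both under the MS ABI helper (8 is one of
   1/2/4/8) and under the 64-bit SysV classification, whereas a 24-byte
   structure is returned in memory by all three helpers.  */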
6528 
6529 static bool
6530 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
6531 {
6532 #ifdef SUBTARGET_RETURN_IN_MEMORY
6533   return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
6534 #else
6535   const enum machine_mode mode = type_natural_mode (type, NULL);
6536 
6537   if (TARGET_64BIT)
6538     {
6539       if (ix86_function_type_abi (fntype) == MS_ABI)
6540 	return return_in_memory_ms_64 (type, mode);
6541       else
6542 	return return_in_memory_64 (type, mode);
6543     }
6544   else
6545     return return_in_memory_32 (type, mode);
6546 #endif
6547 }
6548 
6549 /* Return true iff TYPE is returned in memory.  This version is used
6550    on Solaris 2.  It is similar to the generic ix86_return_in_memory,
6551    but differs notably in that when MMX is available, 8-byte vectors
6552    are returned in memory, rather than in MMX registers.  */
6553 
6554 bool
6555 ix86_solaris_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
6556 {
6557   int size;
6558   enum machine_mode mode = type_natural_mode (type, NULL);
6559 
6560   if (TARGET_64BIT)
6561     return return_in_memory_64 (type, mode);
6562 
6563   if (mode == BLKmode)
6564     return 1;
6565 
6566   size = int_size_in_bytes (type);
6567 
6568   if (VECTOR_MODE_P (mode))
6569     {
6570       /* Return in memory only if MMX registers *are* available.  This
6571 	 seems backwards, but it is consistent with the existing
6572 	 Solaris x86 ABI.  */
6573       if (size == 8)
6574 	return TARGET_MMX;
6575       if (size == 16)
6576 	return !TARGET_SSE;
6577     }
6578   else if (mode == TImode)
6579     return !TARGET_SSE;
6580   else if (mode == XFmode)
6581     return 0;
6582 
6583   return size > 12;
6584 }
6585 
6586 /* When returning SSE vector types, we have a choice of either
6587      (1) being ABI incompatible with a -march switch, or
6588      (2) generating an error.
6589    Given no good solution, I think the safest thing is one warning.
6590    The user won't be able to use -Werror, but....
6591 
6592    Choose the STRUCT_VALUE_RTX hook because that's (at present) only
6593    called in response to actually generating a caller or callee that
6594    uses such a type.  As opposed to TARGET_RETURN_IN_MEMORY, which is called
6595    via aggregate_value_p for general type probing from tree-ssa.  */
6596 
6597 static rtx
6598 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
6599 {
6600   static bool warnedsse, warnedmmx;
6601 
6602   if (!TARGET_64BIT && type)
6603     {
6604       /* Look at the return type of the function, not the function type.  */
6605       enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
6606 
6607       if (!TARGET_SSE && !warnedsse)
6608 	{
6609 	  if (mode == TImode
6610 	      || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
6611 	    {
6612 	      warnedsse = true;
6613 	      warning (0, "SSE vector return without SSE enabled "
6614 		       "changes the ABI");
6615 	    }
6616 	}
6617 
6618       if (!TARGET_MMX && !warnedmmx)
6619 	{
6620 	  if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
6621 	    {
6622 	      warnedmmx = true;
6623 	      warning (0, "MMX vector return without MMX enabled "
6624 		       "changes the ABI");
6625 	    }
6626 	}
6627     }
6628 
6629   return NULL;
6630 }
6631 
6632 
6633 /* Create the va_list data type.  */
6634 
6635 /* Returns the calling convention specific va_list data type.
6636    The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI.  */
6637 
6638 static tree
6639 ix86_build_builtin_va_list_abi (enum calling_abi abi)
6640 {
6641   tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
6642 
6643   /* For i386 we use plain pointer to argument area.  */
6644   if (!TARGET_64BIT || abi == MS_ABI)
6645     return build_pointer_type (char_type_node);
6646 
6647   record = (*lang_hooks.types.make_type) (RECORD_TYPE);
6648   type_decl = build_decl (BUILTINS_LOCATION,
6649 			  TYPE_DECL, get_identifier ("__va_list_tag"), record);
6650 
6651   f_gpr = build_decl (BUILTINS_LOCATION,
6652 		      FIELD_DECL, get_identifier ("gp_offset"),
6653 		      unsigned_type_node);
6654   f_fpr = build_decl (BUILTINS_LOCATION,
6655 		      FIELD_DECL, get_identifier ("fp_offset"),
6656 		      unsigned_type_node);
6657   f_ovf = build_decl (BUILTINS_LOCATION,
6658 		      FIELD_DECL, get_identifier ("overflow_arg_area"),
6659 		      ptr_type_node);
6660   f_sav = build_decl (BUILTINS_LOCATION,
6661 		      FIELD_DECL, get_identifier ("reg_save_area"),
6662 		      ptr_type_node);
6663 
6664   va_list_gpr_counter_field = f_gpr;
6665   va_list_fpr_counter_field = f_fpr;
6666 
6667   DECL_FIELD_CONTEXT (f_gpr) = record;
6668   DECL_FIELD_CONTEXT (f_fpr) = record;
6669   DECL_FIELD_CONTEXT (f_ovf) = record;
6670   DECL_FIELD_CONTEXT (f_sav) = record;
6671 
6672   TREE_CHAIN (record) = type_decl;
6673   TYPE_NAME (record) = type_decl;
6674   TYPE_FIELDS (record) = f_gpr;
6675   TREE_CHAIN (f_gpr) = f_fpr;
6676   TREE_CHAIN (f_fpr) = f_ovf;
6677   TREE_CHAIN (f_ovf) = f_sav;
6678 
6679   layout_type (record);
6680 
6681   /* The correct type is an array type of one element.  */
6682   return build_array_type (record, build_index_type (size_zero_node));
6683 }
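
/* Illustrative note (editorial): the record built above corresponds to the
   SysV x86-64 va_list element, roughly

     typedef struct __va_list_tag {
       unsigned int gp_offset;      /* byte offset of next GPR slot in
					reg_save_area                  */
       unsigned int fp_offset;      /* byte offset of next SSE slot    */
       void *overflow_arg_area;     /* next stack-passed argument      */
       void *reg_save_area;         /* saved register parameters       */
     } __va_list_tag;

   exposed to the user as a one-element array of this tag, exactly as the
   build_array_type call above returns.  */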
6684 
6685 /* Setup the builtin va_list data type and for 64-bit the additional
6686    calling convention specific va_list data types.  */
6687 
6688 static tree
6689 ix86_build_builtin_va_list (void)
6690 {
6691   tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
6692 
6693   /* Initialize abi specific va_list builtin types.  */
6694   if (TARGET_64BIT)
6695     {
6696       tree t;
6697       if (ix86_abi == MS_ABI)
6698         {
6699           t = ix86_build_builtin_va_list_abi (SYSV_ABI);
6700           if (TREE_CODE (t) != RECORD_TYPE)
6701             t = build_variant_type_copy (t);
6702           sysv_va_list_type_node = t;
6703         }
6704       else
6705         {
6706           t = ret;
6707           if (TREE_CODE (t) != RECORD_TYPE)
6708             t = build_variant_type_copy (t);
6709           sysv_va_list_type_node = t;
6710         }
6711       if (ix86_abi != MS_ABI)
6712         {
6713           t = ix86_build_builtin_va_list_abi (MS_ABI);
6714           if (TREE_CODE (t) != RECORD_TYPE)
6715             t = build_variant_type_copy (t);
6716           ms_va_list_type_node = t;
6717         }
6718       else
6719         {
6720           t = ret;
6721           if (TREE_CODE (t) != RECORD_TYPE)
6722             t = build_variant_type_copy (t);
6723           ms_va_list_type_node = t;
6724         }
6725     }
6726 
6727   return ret;
6728 }
6729 
6730 /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
6731 
6732 static void
6733 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
6734 {
6735   rtx save_area, mem;
6736   rtx label;
6737   rtx label_ref;
6738   rtx tmp_reg;
6739   rtx nsse_reg;
6740   alias_set_type set;
6741   int i;
6742 
6743   /* GPR size of varargs save area.  */
6744   if (cfun->va_list_gpr_size)
6745     ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
6746   else
6747     ix86_varargs_gpr_size = 0;
6748 
6749   /* FPR size of varargs save area.  We don't need it if we don't pass
6750      anything in SSE registers.  */
6751   if (cum->sse_nregs && cfun->va_list_fpr_size)
6752     ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
6753   else
6754     ix86_varargs_fpr_size = 0;
6755 
6756   if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
6757     return;
6758 
6759   save_area = frame_pointer_rtx;
6760   set = get_varargs_alias_set ();
6761 
6762   for (i = cum->regno;
6763        i < X86_64_REGPARM_MAX
6764        && i < cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
6765        i++)
6766     {
6767       mem = gen_rtx_MEM (Pmode,
6768 			 plus_constant (save_area, i * UNITS_PER_WORD));
6769       MEM_NOTRAP_P (mem) = 1;
6770       set_mem_alias_set (mem, set);
6771       emit_move_insn (mem, gen_rtx_REG (Pmode,
6772 					x86_64_int_parameter_registers[i]));
6773     }
6774 
6775   if (ix86_varargs_fpr_size)
6776     {
6777       /* The stack must be 16-byte aligned for the FP register save area.  */
6778       if (crtl->stack_alignment_needed < 128)
6779 	crtl->stack_alignment_needed = 128;
6780 
6781       /* Now emit code to save SSE registers.  The AX parameter contains the
6782 	 number of SSE parameter registers used to call this function.  We use
6783 	 the sse_prologue_save insn template, which produces a computed jump
6784 	 across the SSE saves.  We need some preparation work to get this working.  */
6785 
6786       label = gen_label_rtx ();
6787       label_ref = gen_rtx_LABEL_REF (Pmode, label);
6788 
6789       /* Compute the address to jump to:
6790          label - eax*4 + nnamed_sse_arguments*4, or
6791          label - eax*5 + nnamed_sse_arguments*5 for AVX.  */
6792       tmp_reg = gen_reg_rtx (Pmode);
6793       nsse_reg = gen_reg_rtx (Pmode);
6794       emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG)));
6795       emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
6796 			      gen_rtx_MULT (Pmode, nsse_reg,
6797 					    GEN_INT (4))));
6798 
6799       /* vmovaps is one byte longer than movaps.  */
6800       if (TARGET_AVX)
6801 	emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
6802 				gen_rtx_PLUS (Pmode, tmp_reg,
6803 					      nsse_reg)));
6804 
6805       if (cum->sse_regno)
6806 	emit_move_insn
6807 	  (nsse_reg,
6808 	   gen_rtx_CONST (DImode,
6809 			  gen_rtx_PLUS (DImode,
6810 					label_ref,
6811 					GEN_INT (cum->sse_regno
6812 						 * (TARGET_AVX ? 5 : 4)))));
6813       else
6814 	emit_move_insn (nsse_reg, label_ref);
6815       emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
6816 
6817       /* Compute the address of the memory block we save into.  We always use
6818 	 a pointer pointing 127 bytes past the first byte to store - this is
6819 	 needed to keep the instruction size limited to 4 bytes (5 bytes for
6820 	 AVX) with a one-byte displacement.  */
6821       tmp_reg = gen_reg_rtx (Pmode);
6822       emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
6823 			      plus_constant (save_area,
6824 					     ix86_varargs_gpr_size + 127)));
6825       mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
6826       MEM_NOTRAP_P (mem) = 1;
6827       set_mem_alias_set (mem, set);
6828       set_mem_align (mem, BITS_PER_WORD);
6829 
6830       /* And finally do the dirty job!  */
6831       emit_insn (gen_sse_prologue_save (mem, nsse_reg,
6832 					GEN_INT (cum->sse_regno), label));
6833     }
6834 }
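
/* Illustrative note (editorial): with the usual register counts
   (X86_64_REGPARM_MAX of 6 integer and X86_64_SSE_REGPARM_MAX of 8 SSE
   registers), the varargs save area laid out above occupies

     6 * UNITS_PER_WORD = 48 bytes for the GPRs, followed by
     8 * 16             = 128 bytes for the SSE registers,

   and the SSE part is omitted entirely when nothing is passed in SSE
   registers.  The arithmetic only makes ix86_varargs_gpr_size and
   ix86_varargs_fpr_size concrete.  */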
6835 
6836 static void
6837 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
6838 {
6839   alias_set_type set = get_varargs_alias_set ();
6840   int i;
6841 
6842   for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
6843     {
6844       rtx reg, mem;
6845 
6846       mem = gen_rtx_MEM (Pmode,
6847 			 plus_constant (virtual_incoming_args_rtx,
6848 					i * UNITS_PER_WORD));
6849       MEM_NOTRAP_P (mem) = 1;
6850       set_mem_alias_set (mem, set);
6851 
6852       reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
6853       emit_move_insn (mem, reg);
6854     }
6855 }
6856 
6857 static void
6858 ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6859 			     tree type, int *pretend_size ATTRIBUTE_UNUSED,
6860 			     int no_rtl)
6861 {
6862   CUMULATIVE_ARGS next_cum;
6863   tree fntype;
6864 
6865   /* This argument doesn't appear to be used anymore, which is good,
6866      because the old code here didn't suppress rtl generation.  */
6867   gcc_assert (!no_rtl);
6868 
6869   if (!TARGET_64BIT)
6870     return;
6871 
6872   fntype = TREE_TYPE (current_function_decl);
6873 
6874   /* For varargs, we do not want to skip the dummy va_dcl argument.
6875      For stdargs, we do want to skip the last named argument.  */
6876   next_cum = *cum;
6877   if (stdarg_p (fntype))
6878     function_arg_advance (&next_cum, mode, type, 1);
6879 
6880   if (cum->call_abi == MS_ABI)
6881     setup_incoming_varargs_ms_64 (&next_cum);
6882   else
6883     setup_incoming_varargs_64 (&next_cum);
6884 }
6885 
6886 /* Check whether TYPE is a char * style va_list.  */
6887 
6888 static bool
6889 is_va_list_char_pointer (tree type)
6890 {
6891   tree canonic;
6892 
6893   /* For 32-bit it is always true.  */
6894   if (!TARGET_64BIT)
6895     return true;
6896   canonic = ix86_canonical_va_list_type (type);
6897   return (canonic == ms_va_list_type_node
6898           || (ix86_abi == MS_ABI && canonic == va_list_type_node));
6899 }
6900 
6901 /* Implement va_start.  */
6902 
6903 static void
6904 ix86_va_start (tree valist, rtx nextarg)
6905 {
6906   HOST_WIDE_INT words, n_gpr, n_fpr;
6907   tree f_gpr, f_fpr, f_ovf, f_sav;
6908   tree gpr, fpr, ovf, sav, t;
6909   tree type;
6910 
6911   /* Only 64bit target needs something special.  */
6912   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
6913     {
6914       std_expand_builtin_va_start (valist, nextarg);
6915       return;
6916     }
6917 
6918   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
6919   f_fpr = TREE_CHAIN (f_gpr);
6920   f_ovf = TREE_CHAIN (f_fpr);
6921   f_sav = TREE_CHAIN (f_ovf);
6922 
6923   valist = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (valist)), valist);
6924   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), valist, f_gpr, NULL_TREE);
6925   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
6926   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
6927   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
6928 
6929   /* Count number of gp and fp argument registers used.  */
6930   words = crtl->args.info.words;
6931   n_gpr = crtl->args.info.regno;
6932   n_fpr = crtl->args.info.sse_regno;
6933 
6934   if (cfun->va_list_gpr_size)
6935     {
6936       type = TREE_TYPE (gpr);
6937       t = build2 (MODIFY_EXPR, type,
6938 		  gpr, build_int_cst (type, n_gpr * 8));
6939       TREE_SIDE_EFFECTS (t) = 1;
6940       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6941     }
6942 
6943   if (TARGET_SSE && cfun->va_list_fpr_size)
6944     {
6945       type = TREE_TYPE (fpr);
6946       t = build2 (MODIFY_EXPR, type, fpr,
6947 		  build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
6948       TREE_SIDE_EFFECTS (t) = 1;
6949       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6950     }
6951 
6952   /* Find the overflow area.  */
6953   type = TREE_TYPE (ovf);
6954   t = make_tree (type, crtl->args.internal_arg_pointer);
6955   if (words != 0)
6956     t = build2 (POINTER_PLUS_EXPR, type, t,
6957 	        size_int (words * UNITS_PER_WORD));
6958   t = build2 (MODIFY_EXPR, type, ovf, t);
6959   TREE_SIDE_EFFECTS (t) = 1;
6960   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6961 
6962   if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
6963     {
6964       /* Find the register save area.
6965 	 The function prologue saves it right above the stack frame.  */
6966       type = TREE_TYPE (sav);
6967       t = make_tree (type, frame_pointer_rtx);
6968       if (!ix86_varargs_gpr_size)
6969 	t = build2 (POINTER_PLUS_EXPR, type, t,
6970 		    size_int (-8 * X86_64_REGPARM_MAX));
6971       t = build2 (MODIFY_EXPR, type, sav, t);
6972       TREE_SIDE_EFFECTS (t) = 1;
6973       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6974     }
6975 }
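
/* Illustrative note (editorial): following the assignments in ix86_va_start,
   a function whose named arguments used two integer registers and one SSE
   register would start its va_list with roughly

     gp_offset = 2 * 8 = 16;
     fp_offset = 6 * 8 + 1 * 16 = 64;

   i.e. gp_offset counts 8-byte GPR slots, and fp_offset starts past the
   8 * X86_64_REGPARM_MAX bytes of integer slots in the register save area
   (48 bytes with the usual six integer argument registers).  */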
6976 
6977 /* Implement va_arg.  */
6978 
6979 static tree
6980 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
6981 		      gimple_seq *post_p)
6982 {
6983   static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
6984   tree f_gpr, f_fpr, f_ovf, f_sav;
6985   tree gpr, fpr, ovf, sav, t;
6986   int size, rsize;
6987   tree lab_false, lab_over = NULL_TREE;
6988   tree addr, t2;
6989   rtx container;
6990   int indirect_p = 0;
6991   tree ptrtype;
6992   enum machine_mode nat_mode;
6993   int arg_boundary;
6994 
6995   /* Only 64bit target needs something special.  */
6996   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
6997     return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
6998 
6999   f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7000   f_fpr = TREE_CHAIN (f_gpr);
7001   f_ovf = TREE_CHAIN (f_fpr);
7002   f_sav = TREE_CHAIN (f_ovf);
7003 
7004   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7005 		build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7006   valist = build_va_arg_indirect_ref (valist);
7007   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7008   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7009   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7010 
7011   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7012   if (indirect_p)
7013     type = build_pointer_type (type);
7014   size = int_size_in_bytes (type);
7015   rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7016 
7017   nat_mode = type_natural_mode (type, NULL);
7018   switch (nat_mode)
7019     {
7020     case V8SFmode:
7021     case V8SImode:
7022     case V32QImode:
7023     case V16HImode:
7024     case V4DFmode:
7025     case V4DImode:
7026       /* Unnamed 256bit vector mode parameters are passed on the stack.  */
7027       if (ix86_cfun_abi () == SYSV_ABI)
7028 	{
7029 	  container = NULL;
7030 	  break;
7031 	}
7032 
7033     default:
7034       container = construct_container (nat_mode, TYPE_MODE (type),
7035 				       type, 0, X86_64_REGPARM_MAX,
7036 				       X86_64_SSE_REGPARM_MAX, intreg,
7037 				       0);
7038       break;
7039     }
7040 
7041   /* Pull the value out of the saved registers.  */
7042 
7043   addr = create_tmp_var (ptr_type_node, "addr");
7044 
7045   if (container)
7046     {
7047       int needed_intregs, needed_sseregs;
7048       bool need_temp;
7049       tree int_addr, sse_addr;
7050 
7051       lab_false = create_artificial_label (UNKNOWN_LOCATION);
7052       lab_over = create_artificial_label (UNKNOWN_LOCATION);
7053 
7054       examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7055 
7056       need_temp = (!REG_P (container)
7057 		   && ((needed_intregs && TYPE_ALIGN (type) > 64)
7058 		       || TYPE_ALIGN (type) > 128));
7059 
7060       /* If we are passing a structure, verify that it forms a consecutive
7061          block in the register save area.  If not, we need to do moves.  */
7062       if (!need_temp && !REG_P (container))
7063 	{
7064 	  /* Verify that all registers are strictly consecutive.  */
7065 	  if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7066 	    {
7067 	      int i;
7068 
7069 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7070 		{
7071 		  rtx slot = XVECEXP (container, 0, i);
7072 		  if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7073 		      || INTVAL (XEXP (slot, 1)) != i * 16)
7074 		    need_temp = 1;
7075 		}
7076 	    }
7077 	  else
7078 	    {
7079 	      int i;
7080 
7081 	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7082 		{
7083 		  rtx slot = XVECEXP (container, 0, i);
7084 		  if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7085 		      || INTVAL (XEXP (slot, 1)) != i * 8)
7086 		    need_temp = 1;
7087 		}
7088 	    }
7089 	}
7090       if (!need_temp)
7091 	{
7092 	  int_addr = addr;
7093 	  sse_addr = addr;
7094 	}
7095       else
7096 	{
7097 	  int_addr = create_tmp_var (ptr_type_node, "int_addr");
7098 	  sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7099 	}
7100 
7101       /* First ensure that we fit completely in registers.  */
7102       if (needed_intregs)
7103 	{
7104 	  t = build_int_cst (TREE_TYPE (gpr),
7105 			     (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7106 	  t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7107 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7108 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7109 	  gimplify_and_add (t, pre_p);
7110 	}
7111       if (needed_sseregs)
7112 	{
7113 	  t = build_int_cst (TREE_TYPE (fpr),
7114 			     (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7115 			     + X86_64_REGPARM_MAX * 8);
7116 	  t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7117 	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7118 	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7119 	  gimplify_and_add (t, pre_p);
7120 	}
7121 
7122       /* Compute index to start of area used for integer regs.  */
7123       if (needed_intregs)
7124 	{
7125 	  /* int_addr = gpr + sav; */
7126 	  t = fold_convert (sizetype, gpr);
7127 	  t = build2 (POINTER_PLUS_EXPR, ptr_type_node, sav, t);
7128 	  gimplify_assign (int_addr, t, pre_p);
7129 	}
7130       if (needed_sseregs)
7131 	{
7132 	  /* sse_addr = fpr + sav; */
7133 	  t = fold_convert (sizetype, fpr);
7134 	  t = build2 (POINTER_PLUS_EXPR, ptr_type_node, sav, t);
7135 	  gimplify_assign (sse_addr, t, pre_p);
7136 	}
7137       if (need_temp)
7138 	{
7139 	  int i, prev_size = 0;
7140 	  tree temp = create_tmp_var (type, "va_arg_tmp");
7141 
7142 	  /* addr = &temp; */
7143 	  t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
7144 	  gimplify_assign (addr, t, pre_p);
7145 
7146 	  for (i = 0; i < XVECLEN (container, 0); i++)
7147 	    {
7148 	      rtx slot = XVECEXP (container, 0, i);
7149 	      rtx reg = XEXP (slot, 0);
7150 	      enum machine_mode mode = GET_MODE (reg);
7151 	      tree piece_type;
7152 	      tree addr_type;
7153 	      tree daddr_type;
7154 	      tree src_addr, src;
7155 	      int src_offset;
7156 	      tree dest_addr, dest;
7157 	      int cur_size = GET_MODE_SIZE (mode);
7158 
7159 	      gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
7160 	      prev_size = INTVAL (XEXP (slot, 1));
7161 	      if (prev_size + cur_size > size)
7162 		{
7163 		  cur_size = size - prev_size;
7164 		  mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
7165 		  if (mode == BLKmode)
7166 		    mode = QImode;
7167 		}
7168 	      piece_type = lang_hooks.types.type_for_mode (mode, 1);
7169 	      if (mode == GET_MODE (reg))
7170 		addr_type = build_pointer_type (piece_type);
7171 	      else
7172 		addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
7173 							 true);
7174 	      daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
7175 							true);
7176 
7177 	      if (SSE_REGNO_P (REGNO (reg)))
7178 		{
7179 		  src_addr = sse_addr;
7180 		  src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
7181 		}
7182 	      else
7183 		{
7184 		  src_addr = int_addr;
7185 		  src_offset = REGNO (reg) * 8;
7186 		}
7187 	      src_addr = fold_convert (addr_type, src_addr);
7188 	      src_addr = fold_build2 (POINTER_PLUS_EXPR, addr_type, src_addr,
7189 				      size_int (src_offset));
7190 
7191 	      dest_addr = fold_convert (daddr_type, addr);
7192 	      dest_addr = fold_build2 (POINTER_PLUS_EXPR, daddr_type, dest_addr,
7193 				       size_int (prev_size));
7194 	      if (cur_size == GET_MODE_SIZE (mode))
7195 		{
7196 		  src = build_va_arg_indirect_ref (src_addr);
7197 		  dest = build_va_arg_indirect_ref (dest_addr);
7198 
7199 		  gimplify_assign (dest, src, pre_p);
7200 		}
7201 	      else
7202 		{
7203 		  tree copy
7204 		    = build_call_expr (implicit_built_in_decls[BUILT_IN_MEMCPY],
7205 				       3, dest_addr, src_addr,
7206 				       size_int (cur_size));
7207 		  gimplify_and_add (copy, pre_p);
7208 		}
7209 	      prev_size += cur_size;
7210 	    }
7211 	}
7212 
7213       if (needed_intregs)
7214 	{
7215 	  t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
7216 		      build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
7217 	  gimplify_assign (gpr, t, pre_p);
7218 	}
7219 
7220       if (needed_sseregs)
7221 	{
7222 	  t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
7223 		      build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
7224 	  gimplify_assign (fpr, t, pre_p);
7225 	}
7226 
7227       gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
7228 
7229       gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
7230     }
7231 
7232   /* ... otherwise out of the overflow area.  */
7233 
7234   /* When we align parameter on stack for caller, if the parameter
7235      alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
7236      aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  We will match callee
7237      here with caller.  */
7238   arg_boundary = FUNCTION_ARG_BOUNDARY (VOIDmode, type);
7239   if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
7240     arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
7241 
7242   /* Care for on-stack alignment if needed.  */
7243   if (arg_boundary <= 64
7244       || integer_zerop (TYPE_SIZE (type)))
7245     t = ovf;
7246   else
7247     {
7248       HOST_WIDE_INT align = arg_boundary / 8;
7249       t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (ovf), ovf,
7250 		  size_int (align - 1));
7251       t = fold_convert (sizetype, t);
7252       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7253 		  size_int (-align));
7254       t = fold_convert (TREE_TYPE (ovf), t);
7255     }
7256   gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
7257   gimplify_assign (addr, t, pre_p);
7258 
7259   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (t), t,
7260 	      size_int (rsize * UNITS_PER_WORD));
7261   gimplify_assign (unshare_expr (ovf), t, pre_p);
7262 
7263   if (container)
7264     gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
7265 
7266   ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
7267   addr = fold_convert (ptrtype, addr);
7268 
7269   if (indirect_p)
7270     addr = build_va_arg_indirect_ref (addr);
7271   return build_va_arg_indirect_ref (addr);
7272 }
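
/* Illustrative note (editorial): for a simple integer argument the gimple
   built above behaves like the textbook SysV va_arg sequence, sketched as

     if (ap->gp_offset >= 6 * 8)	/* no GPR slot left */
       {
	 addr = ap->overflow_arg_area;
	 ap->overflow_arg_area = addr + 8;
       }
     else
       {
	 addr = ap->reg_save_area + ap->gp_offset;
	 ap->gp_offset += 8;
       }

   with 16-byte bookkeeping for SSE classes and extra alignment of the
   overflow area for types aligned beyond 64 bits.  The field names come
   from the va_list record above; the six-register assumption matches the
   usual X86_64_REGPARM_MAX.  This is a sketch, not additional code.  */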
7273 
7274 /* Return nonzero if OPNUM's MEM should be matched
7275    in movabs* patterns.  */
7276 
7277 int
7278 ix86_check_movabs (rtx insn, int opnum)
7279 {
7280   rtx set, mem;
7281 
7282   set = PATTERN (insn);
7283   if (GET_CODE (set) == PARALLEL)
7284     set = XVECEXP (set, 0, 0);
7285   gcc_assert (GET_CODE (set) == SET);
7286   mem = XEXP (set, opnum);
7287   while (GET_CODE (mem) == SUBREG)
7288     mem = SUBREG_REG (mem);
7289   gcc_assert (MEM_P (mem));
7290   return (volatile_ok || !MEM_VOLATILE_P (mem));
7291 }
7292 
7293 /* Initialize the table of extra 80387 mathematical constants.  */
7294 
7295 static void
7296 init_ext_80387_constants (void)
7297 {
7298   static const char * cst[5] =
7299   {
7300     "0.3010299956639811952256464283594894482",  /* 0: fldlg2  */
7301     "0.6931471805599453094286904741849753009",  /* 1: fldln2  */
7302     "1.4426950408889634073876517827983434472",  /* 2: fldl2e  */
7303     "3.3219280948873623478083405569094566090",  /* 3: fldl2t  */
7304     "3.1415926535897932385128089594061862044",  /* 4: fldpi   */
7305   };
7306   int i;
7307 
7308   for (i = 0; i < 5; i++)
7309     {
7310       real_from_string (&ext_80387_constants_table[i], cst[i]);
7311       /* Ensure each constant is rounded to XFmode precision.  */
7312       real_convert (&ext_80387_constants_table[i],
7313 		    XFmode, &ext_80387_constants_table[i]);
7314     }
7315 
7316   ext_80387_constants_init = 1;
7317 }
7318 
7319 /* Return true if the constant is something that can be loaded with
7320    a special instruction.  */
7321 
7322 int
7323 standard_80387_constant_p (rtx x)
7324 {
7325   enum machine_mode mode = GET_MODE (x);
7326 
7327   REAL_VALUE_TYPE r;
7328 
7329   if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
7330     return -1;
7331 
7332   if (x == CONST0_RTX (mode))
7333     return 1;
7334   if (x == CONST1_RTX (mode))
7335     return 2;
7336 
7337   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
7338 
7339   /* For XFmode constants, try to find a special 80387 instruction when
7340      optimizing for size or on those CPUs that benefit from them.  */
7341   if (mode == XFmode
7342       && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
7343     {
7344       int i;
7345 
7346       if (! ext_80387_constants_init)
7347 	init_ext_80387_constants ();
7348 
7349       for (i = 0; i < 5; i++)
7350         if (real_identical (&r, &ext_80387_constants_table[i]))
7351 	  return i + 3;
7352     }
7353 
7354   /* A load of the constant -0.0 or -1.0 will be split into an
7355      fldz;fchs or fld1;fchs sequence.  */
7356   if (real_isnegzero (&r))
7357     return 8;
7358   if (real_identical (&r, &dconstm1))
7359     return 9;
7360 
7361   return 0;
7362 }
7363 
7364 /* Return the opcode of the special instruction to be used to load
7365    the constant X.  */
7366 
7367 const char *
7368 standard_80387_constant_opcode (rtx x)
7369 {
7370   switch (standard_80387_constant_p (x))
7371     {
7372     case 1:
7373       return "fldz";
7374     case 2:
7375       return "fld1";
7376     case 3:
7377       return "fldlg2";
7378     case 4:
7379       return "fldln2";
7380     case 5:
7381       return "fldl2e";
7382     case 6:
7383       return "fldl2t";
7384     case 7:
7385       return "fldpi";
7386     case 8:
7387     case 9:
7388       return "#";
7389     default:
7390       gcc_unreachable ();
7391     }
7392 }
7393 
7394 /* Return the CONST_DOUBLE representing the 80387 constant that is
7395    loaded by the specified special instruction.  The argument IDX
7396    matches the return value from standard_80387_constant_p.  */
7397 
7398 rtx
7399 standard_80387_constant_rtx (int idx)
7400 {
7401   int i;
7402 
7403   if (! ext_80387_constants_init)
7404     init_ext_80387_constants ();
7405 
7406   switch (idx)
7407     {
7408     case 3:
7409     case 4:
7410     case 5:
7411     case 6:
7412     case 7:
7413       i = idx - 3;
7414       break;
7415 
7416     default:
7417       gcc_unreachable ();
7418     }
7419 
7420   return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
7421 				       XFmode);
7422 }
7423 
7424 /* Return 1 if X is all 0s and 2 if X is all 1s
7425    in a supported SSE vector mode.  */
7426 
7427 int
7428 standard_sse_constant_p (rtx x)
7429 {
7430   enum machine_mode mode = GET_MODE (x);
7431 
7432   if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
7433     return 1;
7434   if (vector_all_ones_operand (x, mode))
7435     switch (mode)
7436       {
7437       case V16QImode:
7438       case V8HImode:
7439       case V4SImode:
7440       case V2DImode:
7441 	if (TARGET_SSE2)
7442 	  return 2;
7443       default:
7444 	break;
7445       }
7446 
7447   return 0;
7448 }
7449 
7450 /* Return the opcode of the special instruction to be used to load
7451    the constant X.  */
7452 
7453 const char *
7454 standard_sse_constant_opcode (rtx insn, rtx x)
7455 {
7456   switch (standard_sse_constant_p (x))
7457     {
7458     case 1:
7459       switch (get_attr_mode (insn))
7460 	{
7461 	case MODE_V4SF:
7462 	  return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";
7463 	case MODE_V2DF:
7464 	  return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0";
7465 	case MODE_TI:
7466 	  return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0";
7467 	case MODE_V8SF:
7468 	  return "vxorps\t%x0, %x0, %x0";
7469 	case MODE_V4DF:
7470 	  return "vxorpd\t%x0, %x0, %x0";
7471 	case MODE_OI:
7472 	  return "vpxor\t%x0, %x0, %x0";
7473 	default:
7474 	  break;
7475 	}
7476     case 2:
7477       return TARGET_AVX ? "vpcmpeqd\t%0, %0, %0" : "pcmpeqd\t%0, %0";
7478     default:
7479       break;
7480     }
7481   gcc_unreachable ();
7482 }
7483 
7484 /* Returns 1 if OP contains a symbol reference.  */
7485 
7486 int
7487 symbolic_reference_mentioned_p (rtx op)
7488 {
7489   const char *fmt;
7490   int i;
7491 
7492   if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
7493     return 1;
7494 
7495   fmt = GET_RTX_FORMAT (GET_CODE (op));
7496   for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
7497     {
7498       if (fmt[i] == 'E')
7499 	{
7500 	  int j;
7501 
7502 	  for (j = XVECLEN (op, i) - 1; j >= 0; j--)
7503 	    if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
7504 	      return 1;
7505 	}
7506 
7507       else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
7508 	return 1;
7509     }
7510 
7511   return 0;
7512 }
7513 
7514 /* Return 1 if it is appropriate to emit `ret' instructions in the
7515    body of a function.  Do this only if the epilogue is simple, needing a
7516    couple of insns.  Prior to reloading, we can't tell how many registers
7517    must be saved, so return 0 then.  Return 0 if there is no frame
7518    marker to de-allocate.  */
7519 
7520 int
7521 ix86_can_use_return_insn_p (void)
7522 {
7523   struct ix86_frame frame;
7524 
7525   if (! reload_completed || frame_pointer_needed)
7526     return 0;
7527 
7528   /* Don't allow more than 32k pop, since that's all we can do
7529      with one instruction.  */
7530   if (crtl->args.pops_args
7531       && crtl->args.size >= 32768)
7532     return 0;
7533 
7534   ix86_compute_frame_layout (&frame);
7535   return frame.to_allocate == 0 && frame.padding0 == 0
7536          && (frame.nregs + frame.nsseregs) == 0;
7537 }
7538 
7539 /* Value should be nonzero if functions must have frame pointers.
7540    Zero means the frame pointer need not be set up (and parms may
7541    be accessed via the stack pointer) in functions that seem suitable.  */
7542 
7543 static bool
7544 ix86_frame_pointer_required (void)
7545 {
7546   /* If we accessed previous frames, then the generated code expects
7547      to be able to access the saved ebp value in our frame.  */
7548   if (cfun->machine->accesses_prev_frame)
7549     return true;
7550 
7551   /* Several x86 OSes need a frame pointer for other reasons,
7552      usually pertaining to setjmp.  */
7553   if (SUBTARGET_FRAME_POINTER_REQUIRED)
7554     return true;
7555 
7556   /* In override_options, TARGET_OMIT_LEAF_FRAME_POINTER turns off
7557      the frame pointer by default.  Turn it back on now if we've not
7558      got a leaf function.  */
7559   if (TARGET_OMIT_LEAF_FRAME_POINTER
7560       && (!current_function_is_leaf
7561 	  || ix86_current_function_calls_tls_descriptor))
7562     return true;
7563 
7564   if (crtl->profile)
7565     return true;
7566 
7567   return false;
7568 }
7569 
7570 /* Record that the current function accesses previous call frames.  */
7571 
7572 void
7573 ix86_setup_frame_addresses (void)
7574 {
7575   cfun->machine->accesses_prev_frame = 1;
7576 }
7577 
7578 #ifndef USE_HIDDEN_LINKONCE
7579 # if (defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)) || TARGET_MACHO
7580 #  define USE_HIDDEN_LINKONCE 1
7581 # else
7582 #  define USE_HIDDEN_LINKONCE 0
7583 # endif
7584 #endif
7585 
7586 static int pic_labels_used;
7587 
7588 /* Fills in the label name that should be used for a pc thunk for
7589    the given register.  */
7590 
7591 static void
7592 get_pc_thunk_name (char name[32], unsigned int regno)
7593 {
7594   gcc_assert (!TARGET_64BIT);
7595 
7596   if (USE_HIDDEN_LINKONCE)
7597     sprintf (name, "__i686.get_pc_thunk.%s", reg_names[regno]);
7598   else
7599     ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
7600 }
7601 
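/* For example, with USE_HIDDEN_LINKONCE and the register being %ebx this
   yields the well-known name "__i686.get_pc_thunk.bx"; otherwise an
   internal label derived from "LPR" and the register number is used.  */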
7602 
7603 /* This generates the pc thunks used for -fpic: each thunk loads its
7604    register with the return address of the caller and then returns.  */
7605 
7606 static void
7607 ix86_code_end (void)
7608 {
7609   rtx xops[2];
7610   int regno;
7611 
7612   for (regno = 0; regno < 8; ++regno)
7613     {
7614       char name[32];
7615       tree decl;
7616 
7617       if (! ((pic_labels_used >> regno) & 1))
7618 	continue;
7619 
7620       get_pc_thunk_name (name, regno);
7621 
7622       decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
7623 			 get_identifier (name),
7624 			 build_function_type (void_type_node, void_list_node));
7625       DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
7626 				       NULL_TREE, void_type_node);
7627       TREE_PUBLIC (decl) = 1;
7628       TREE_STATIC (decl) = 1;
7629 
7630 #if TARGET_MACHO
7631       if (TARGET_MACHO)
7632 	{
7633 	  switch_to_section (darwin_sections[text_coal_section]);
7634 	  fputs ("\t.weak_definition\t", asm_out_file);
7635 	  assemble_name (asm_out_file, name);
7636 	  fputs ("\n\t.private_extern\t", asm_out_file);
7637 	  assemble_name (asm_out_file, name);
7638 	  fputs ("\n", asm_out_file);
7639 	  ASM_OUTPUT_LABEL (asm_out_file, name);
7640 	  DECL_WEAK (decl) = 1;
7641 	}
7642       else
7643 #endif
7644       if (USE_HIDDEN_LINKONCE)
7645 	{
7646 	  DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
7647 
7648 	  (*targetm.asm_out.unique_section) (decl, 0);
7649 	  switch_to_section (get_named_section (decl, NULL, 0));
7650 
7651 	  (*targetm.asm_out.globalize_label) (asm_out_file, name);
7652 	  fputs ("\t.hidden\t", asm_out_file);
7653 	  assemble_name (asm_out_file, name);
7654 	  putc ('\n', asm_out_file);
7655 	  ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
7656 	}
7657       else
7658 	{
7659 	  switch_to_section (text_section);
7660 	  ASM_OUTPUT_LABEL (asm_out_file, name);
7661 	}
7662 
7663       DECL_INITIAL (decl) = make_node (BLOCK);
7664       current_function_decl = decl;
7665       init_function_start (decl);
7666       first_function_block_is_cold = false;
7667       /* Make sure unwind info is emitted for the thunk if needed.  */
7668       final_start_function (emit_barrier (), asm_out_file, 1);
7669 
7670       xops[0] = gen_rtx_REG (Pmode, regno);
7671       xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
7672       output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
7673       output_asm_insn ("ret", xops);
7674       final_end_function ();
7675       init_insn_lengths ();
7676       free_after_compilation (cfun);
7677       set_cfun (NULL);
7678       current_function_decl = NULL;
7679     }
7680 }
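
/* The body emitted above for each needed thunk is simply, e.g. for %ebx,

	__i686.get_pc_thunk.bx:
		movl	(%esp), %ebx
		ret

   i.e. it copies the caller's return address (the word at the top of the
   stack) into the requested register.  */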
7681 
7682 /* Emit code for the SET_GOT patterns.  */
7683 
7684 const char *
7685 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
7686 {
7687   rtx xops[3];
7688 
7689   xops[0] = dest;
7690 
7691   if (TARGET_VXWORKS_RTP && flag_pic)
7692     {
7693       /* Load (*VXWORKS_GOTT_BASE) into the PIC register.  */
7694       xops[2] = gen_rtx_MEM (Pmode,
7695 			     gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
7696       output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
7697 
7698       /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
7699 	 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
7700 	 an unadorned address.  */
7701       xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
7702       SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
7703       output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
7704       return "";
7705     }
7706 
7707   xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
7708 
7709   if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
7710     {
7711       xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
7712 
7713       if (!flag_pic)
7714 	output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
7715       else
7716 	{
7717 	  output_asm_insn ("call\t%a2", xops);
7718 #ifdef DWARF2_UNWIND_INFO
7719 	  /* The call to next label acts as a push.  */
7720 	  if (dwarf2out_do_frame ())
7721 	    {
7722 	      rtx insn;
7723 	      start_sequence ();
7724 	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
7725 					     gen_rtx_PLUS (Pmode,
7726 							   stack_pointer_rtx,
7727 							   GEN_INT (-4))));
7728 	      RTX_FRAME_RELATED_P (insn) = 1;
7729 	      dwarf2out_frame_debug (insn, true);
7730 	      end_sequence ();
7731 	    }
7732 #endif
7733 	}
7734 
7735 #if TARGET_MACHO
7736       /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
7737          is what will be referenced by the Mach-O PIC subsystem.  */
7738       if (!label)
7739 	ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
7740 #endif
7741 
7742       (*targetm.asm_out.internal_label) (asm_out_file, "L",
7743 				 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
7744 
7745       if (flag_pic)
7746 	{
7747 	  output_asm_insn ("pop%z0\t%0", xops);
7748 #ifdef DWARF2_UNWIND_INFO
7749 	  /* The pop is a pop and clobbers dest, but doesn't restore it
7750 	     for unwind info purposes.  */
7751 	  if (dwarf2out_do_frame ())
7752 	    {
7753 	      rtx insn;
7754 	      start_sequence ();
7755 	      insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
7756 	      dwarf2out_frame_debug (insn, true);
7757 	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
7758 					     gen_rtx_PLUS (Pmode,
7759 							   stack_pointer_rtx,
7760 							   GEN_INT (4))));
7761 	      RTX_FRAME_RELATED_P (insn) = 1;
7762 	      dwarf2out_frame_debug (insn, true);
7763 	      end_sequence ();
7764 	    }
7765 #endif
7766 	}
7767     }
7768   else
7769     {
7770       char name[32];
7771       get_pc_thunk_name (name, REGNO (dest));
7772       pic_labels_used |= 1 << REGNO (dest);
7773 
7774 #ifdef DWARF2_UNWIND_INFO
7775       /* Ensure all queued register saves are flushed before the
7776 	 call.  */
7777       if (dwarf2out_do_frame ())
7778 	dwarf2out_flush_queued_reg_saves ();
7779 #endif
7780       xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
7781       xops[2] = gen_rtx_MEM (QImode, xops[2]);
7782       output_asm_insn ("call\t%X2", xops);
7783       /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
7784          is what will be referenced by the Mach-O PIC subsystem.  */
7785 #if TARGET_MACHO
7786       if (!label)
7787 	ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
7788       else
7789         targetm.asm_out.internal_label (asm_out_file, "L",
7790 					   CODE_LABEL_NUMBER (label));
7791 #endif
7792     }
7793 
7794   if (TARGET_MACHO)
7795     return "";
7796 
7797   if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
7798     output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
7799   else
7800     output_asm_insn ("add%z0\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
7801 
7802   return "";
7803 }
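
/* Roughly, for a 32-bit PIC register load into %ebx (ignoring the VxWorks
   and Mach-O special cases above), the emitted sequence is either

	call	.L2
   .L2:	popl	%ebx
	addl	$_GLOBAL_OFFSET_TABLE_+[.-.L2], %ebx

   without deep branch prediction, or, with it,

	call	__i686.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   reusing the pc thunks emitted by ix86_code_end.  */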
7804 
7805 /* Generate a "push" pattern for input ARG.  */
7806 
7807 static rtx
7808 gen_push (rtx arg)
7809 {
7810   if (ix86_cfa_state->reg == stack_pointer_rtx)
7811     ix86_cfa_state->offset += UNITS_PER_WORD;
7812 
7813   return gen_rtx_SET (VOIDmode,
7814 		      gen_rtx_MEM (Pmode,
7815 				   gen_rtx_PRE_DEC (Pmode,
7816 						    stack_pointer_rtx)),
7817 		      arg);
7818 }
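
/* Schematically, the insn built here is the pre-decrement store

	(set (mem (pre_dec (reg sp))) ARG)

   in the word mode Pmode, which the backend's push patterns recognize; the
   CFA offset is bumped by one word to account for the push.  */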
7819 
7820 /* Return >= 0 if there is an unused call-clobbered register available
7821    for the entire function.  */
7822 
7823 static unsigned int
7824 ix86_select_alt_pic_regnum (void)
7825 {
7826   if (current_function_is_leaf && !crtl->profile
7827       && !ix86_current_function_calls_tls_descriptor)
7828     {
7829       int i, drap;
7830       /* Can't use the same register for both PIC and DRAP.  */
7831       if (crtl->drap_reg)
7832 	drap = REGNO (crtl->drap_reg);
7833       else
7834 	drap = -1;
7835       for (i = 2; i >= 0; --i)
7836         if (i != drap && !df_regs_ever_live_p (i))
7837 	  return i;
7838     }
7839 
7840   return INVALID_REGNUM;
7841 }
7842 
7843 /* Return 1 if we need to save REGNO.  */
7844 static int
7845 ix86_save_reg (unsigned int regno, int maybe_eh_return)
7846 {
7847   if (pic_offset_table_rtx
7848       && regno == REAL_PIC_OFFSET_TABLE_REGNUM
7849       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
7850 	  || crtl->profile
7851 	  || crtl->calls_eh_return
7852 	  || crtl->uses_const_pool))
7853     {
7854       if (ix86_select_alt_pic_regnum () != INVALID_REGNUM)
7855 	return 0;
7856       return 1;
7857     }
7858 
7859   if (crtl->calls_eh_return && maybe_eh_return)
7860     {
7861       unsigned i;
7862       for (i = 0; ; i++)
7863 	{
7864 	  unsigned test = EH_RETURN_DATA_REGNO (i);
7865 	  if (test == INVALID_REGNUM)
7866 	    break;
7867 	  if (test == regno)
7868 	    return 1;
7869 	}
7870     }
7871 
7872   if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
7873     return 1;
7874 
7875   return (df_regs_ever_live_p (regno)
7876 	  && !call_used_regs[regno]
7877 	  && !fixed_regs[regno]
7878 	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
7879 }
7880 
7881 /* Return number of saved general purpose registers.  */
7882 
7883 static int
7884 ix86_nsaved_regs (void)
7885 {
7886   int nregs = 0;
7887   int regno;
7888 
7889   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
7890     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
7891       nregs ++;
7892   return nregs;
7893 }
7894 
7895 /* Return number of saved SSE registers.  */
7896 
7897 static int
7898 ix86_nsaved_sseregs (void)
7899 {
7900   int nregs = 0;
7901   int regno;
7902 
7903   if (ix86_cfun_abi () != MS_ABI)
7904     return 0;
7905   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
7906     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
7907       nregs ++;
7908   return nregs;
7909 }
7910 
7911 /* Given FROM and TO register numbers, say whether this elimination is
7912    allowed.  If stack alignment is needed, we can only replace argument
7913    pointer with hard frame pointer, or replace frame pointer with stack
7914    pointer.  Otherwise, frame pointer elimination is automatically
7915    handled and all other eliminations are valid.  */
7916 
7917 static bool
7918 ix86_can_eliminate (const int from, const int to)
7919 {
7920   if (stack_realign_fp)
7921     return ((from == ARG_POINTER_REGNUM
7922 	     && to == HARD_FRAME_POINTER_REGNUM)
7923 	    || (from == FRAME_POINTER_REGNUM
7924 		&& to == STACK_POINTER_REGNUM));
7925   else
7926     return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
7927 }
7928 
7929 /* Return the offset between two registers, one to be eliminated, and the other
7930    its replacement, at the start of a routine.  */
7931 
7932 HOST_WIDE_INT
7933 ix86_initial_elimination_offset (int from, int to)
7934 {
7935   struct ix86_frame frame;
7936   ix86_compute_frame_layout (&frame);
7937 
7938   if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
7939     return frame.hard_frame_pointer_offset;
7940   else if (from == FRAME_POINTER_REGNUM
7941 	   && to == HARD_FRAME_POINTER_REGNUM)
7942     return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
7943   else
7944     {
7945       gcc_assert (to == STACK_POINTER_REGNUM);
7946 
7947       if (from == ARG_POINTER_REGNUM)
7948 	return frame.stack_pointer_offset;
7949 
7950       gcc_assert (from == FRAME_POINTER_REGNUM);
7951       return frame.stack_pointer_offset - frame.frame_pointer_offset;
7952     }
7953 }
7954 
7955 /* In a dynamically-aligned function, we can't know the offset from
7956    stack pointer to frame pointer, so we must ensure that setjmp
7957    eliminates fp against the hard fp (%ebp) rather than trying to
7958    index from %esp up to the top of the frame across a gap that is
7959    of unknown (at compile-time) size.  */
7960 static rtx
7961 ix86_builtin_setjmp_frame_value (void)
7962 {
7963   return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
7964 }
7965 
7966 /* Fill structure ix86_frame about frame of currently computed function.  */
7967 
7968 static void
7969 ix86_compute_frame_layout (struct ix86_frame *frame)
7970 {
7971   unsigned int stack_alignment_needed;
7972   HOST_WIDE_INT offset;
7973   unsigned int preferred_alignment;
7974   HOST_WIDE_INT size = get_frame_size ();
7975 
7976   frame->nregs = ix86_nsaved_regs ();
7977   frame->nsseregs = ix86_nsaved_sseregs ();
7978 
7979   stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
7980   preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
7981 
7982   /* The MS ABI seems to require stack alignment to always be 16, except
7983      inside function prologues.  */
7984   if (ix86_cfun_abi () == MS_ABI && preferred_alignment < 16)
7985     {
7986       preferred_alignment = 16;
7987       stack_alignment_needed = 16;
7988       crtl->preferred_stack_boundary = 128;
7989       crtl->stack_alignment_needed = 128;
7990     }
7991 
7992   gcc_assert (!size || stack_alignment_needed);
7993   gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
7994   gcc_assert (preferred_alignment <= stack_alignment_needed);
7995 
7996   /* During reload iterations the number of saved registers can change.
7997      Recompute the value as needed.  Do not recompute when the number of
7998      registers did not change, as reload calls this function multiple times
7999      and does not expect the decision to change within a single iteration.  */
8000   if (!optimize_function_for_size_p (cfun)
8001       && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8002     {
8003       int count = frame->nregs;
8004 
8005       cfun->machine->use_fast_prologue_epilogue_nregs = count;
8006       /* The fast prologue uses moves instead of pushes to save registers.  This
8007          is significantly longer, but it also executes faster, as modern hardware
8008          can execute the moves in parallel but can't do that for push/pop.
8009 
8010 	 Be careful about choosing which prologue to emit:  when a function takes
8011 	 many instructions to execute we may use the slow version, and likewise
8012 	 when the function is known to be outside any hot spot (this is known
8013 	 with feedback only).  Weight the size of the function by the number of
8014 	 registers to save, as it is cheap to use one or two push instructions
8015 	 but very slow to use many of them.  */
8016       if (count)
8017 	count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8018       if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
8019 	  || (flag_branch_probabilities
8020 	      && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
8021         cfun->machine->use_fast_prologue_epilogue = false;
8022       else
8023         cfun->machine->use_fast_prologue_epilogue
8024 	   = !expensive_function_p (count);
8025     }
8026   if (TARGET_PROLOGUE_USING_MOVE
8027       && cfun->machine->use_fast_prologue_epilogue)
8028     frame->save_regs_using_mov = true;
8029   else
8030     frame->save_regs_using_mov = false;
8031 
8032   /* Skip return address.  */
8033   offset = UNITS_PER_WORD;
8034 
8035   /* Skip pushed static chain.  */
8036   if (ix86_static_chain_on_stack)
8037     offset += UNITS_PER_WORD;
8038 
8039   /* Skip saved base pointer.  */
8040   if (frame_pointer_needed)
8041     offset += UNITS_PER_WORD;
8042 
8043   frame->hard_frame_pointer_offset = offset;
8044 
8045   /* Round the offset up to the required alignment, because the realigned
8046      frame starts here.  */
8047   if (stack_realign_fp)
8048     offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
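  /* The expression above is the usual round-up idiom: for instance, with
     stack_alignment_needed == 16 an offset of 20 becomes (20 + 15) & -16,
     i.e. 32.  */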
8049 
8050   /* Register save area */
8051   offset += frame->nregs * UNITS_PER_WORD;
8052 
8053   /* Align SSE reg save area.  */
8054   if (frame->nsseregs)
8055     frame->padding0 = ((offset + 16 - 1) & -16) - offset;
8056   else
8057     frame->padding0 = 0;
8058 
8059   /* SSE register save area.  */
8060   offset += frame->padding0 + frame->nsseregs * 16;
8061 
8062   /* Va-arg area */
8063   frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8064   offset += frame->va_arg_size;
8065 
8066   /* Align start of frame for local function.  */
8067   frame->padding1 = ((offset + stack_alignment_needed - 1)
8068 		     & -stack_alignment_needed) - offset;
8069 
8070   offset += frame->padding1;
8071 
8072   /* Frame pointer points here.  */
8073   frame->frame_pointer_offset = offset;
8074 
8075   offset += size;
8076 
8077   /* Add the outgoing arguments area.  It can be skipped if we eliminated
8078      all the function calls as dead code.
8079      Skipping is however impossible when the function calls alloca, as the
8080      alloca expander assumes that the last crtl->outgoing_args_size bytes
8081      of the stack frame are unused.  */
8082   if (ACCUMULATE_OUTGOING_ARGS
8083       && (!current_function_is_leaf || cfun->calls_alloca
8084 	  || ix86_current_function_calls_tls_descriptor))
8085     {
8086       offset += crtl->outgoing_args_size;
8087       frame->outgoing_arguments_size = crtl->outgoing_args_size;
8088     }
8089   else
8090     frame->outgoing_arguments_size = 0;
8091 
8092   /* Align stack boundary.  Only needed if we're calling another function
8093      or using alloca.  */
8094   if (!current_function_is_leaf || cfun->calls_alloca
8095       || ix86_current_function_calls_tls_descriptor)
8096     frame->padding2 = ((offset + preferred_alignment - 1)
8097 		       & -preferred_alignment) - offset;
8098   else
8099     frame->padding2 = 0;
8100 
8101   offset += frame->padding2;
8102 
8103   /* We've reached end of stack frame.  */
8104   frame->stack_pointer_offset = offset;
8105 
8106   /* Size prologue needs to allocate.  */
8107   frame->to_allocate =
8108     (size + frame->padding1 + frame->padding2
8109      + frame->outgoing_arguments_size + frame->va_arg_size);
8110 
8111   if ((!frame->to_allocate && frame->nregs <= 1)
8112       || (TARGET_64BIT && frame->to_allocate >= (HOST_WIDE_INT) 0x80000000))
8113     frame->save_regs_using_mov = false;
8114 
8115   if (!TARGET_64BIT_MS_ABI && TARGET_RED_ZONE
8116       && current_function_sp_is_unchanging
8117       && current_function_is_leaf
8118       && !ix86_current_function_calls_tls_descriptor)
8119     {
8120       frame->red_zone_size = frame->to_allocate;
8121       if (frame->save_regs_using_mov)
8122 	frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8123       if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8124 	frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8125     }
8126   else
8127     frame->red_zone_size = 0;
8128   frame->to_allocate -= frame->red_zone_size;
8129   frame->stack_pointer_offset -= frame->red_zone_size;
8130 }
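
/* A rough picture of the layout whose offsets were just computed, going
   from the incoming stack pointer towards lower addresses:

	return address
	[pushed static chain]
	[saved frame pointer]		<- hard_frame_pointer_offset
	[realignment padding]
	GP register save area
	padding0
	SSE register save area
	va_arg register save area
	padding1			<- frame_pointer_offset
	local variables
	outgoing arguments area
	padding2			<- stack_pointer_offset

   Each offset names the boundary just below the tagged entry; to_allocate
   covers the span from the va_arg area through padding2, minus whatever
   ends up inside the red zone.  */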
8131 
8132 /* Emit code to save registers in the prologue.  */
8133 
8134 static void
8135 ix86_emit_save_regs (void)
8136 {
8137   unsigned int regno;
8138   rtx insn;
8139 
8140   for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
8141     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8142       {
8143 	insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
8144 	RTX_FRAME_RELATED_P (insn) = 1;
8145       }
8146 }
8147 
8148 /* Emit code to save registers using MOV insns.  First register
8149    is stored at POINTER + OFFSET.  */
8150 static void
8151 ix86_emit_save_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
8152 {
8153   unsigned int regno;
8154   rtx insn;
8155 
8156   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8157     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8158       {
8159 	insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
8160 					       Pmode, offset),
8161 			       gen_rtx_REG (Pmode, regno));
8162 	RTX_FRAME_RELATED_P (insn) = 1;
8163 	offset += UNITS_PER_WORD;
8164       }
8165 }
8166 
8167 /* Emit code to save SSE registers using MOV insns.  First register
8168    is stored at POINTER + OFFSET.  */
8169 static void
8170 ix86_emit_save_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
8171 {
8172   unsigned int regno;
8173   rtx insn;
8174   rtx mem;
8175 
8176   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8177     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8178       {
8179 	mem = adjust_address (gen_rtx_MEM (TImode, pointer), TImode, offset);
8180 	set_mem_align (mem, 128);
8181 	insn = emit_move_insn (mem, gen_rtx_REG (TImode, regno));
8182 	RTX_FRAME_RELATED_P (insn) = 1;
8183 	offset += 16;
8184       }
8185 }
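
/* Since each slot is marked as 128-bit aligned above, the TImode moves can
   be emitted as aligned SSE stores (movdqa/movaps) rather than their
   unaligned counterparts.  */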
8186 
8187 static GTY(()) rtx queued_cfa_restores;
8188 
8189 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
8190    manipulation insn.  Don't add the note if the previously
8191    saved value will be left untouched within the stack red-zone until return,
8192    as unwinders can find the same value in the register and
8193    on the stack.  */
8194 
8195 static void
8196 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT red_offset)
8197 {
8198   if (TARGET_RED_ZONE
8199       && !TARGET_64BIT_MS_ABI
8200       && red_offset + RED_ZONE_SIZE >= 0
8201       && crtl->args.pops_args < 65536)
8202     return;
8203 
8204   if (insn)
8205     {
8206       add_reg_note (insn, REG_CFA_RESTORE, reg);
8207       RTX_FRAME_RELATED_P (insn) = 1;
8208     }
8209   else
8210     queued_cfa_restores
8211       = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
8212 }
8213 
8214 /* Add queued REG_CFA_RESTORE notes if any to INSN.  */
8215 
8216 static void
8217 ix86_add_queued_cfa_restore_notes (rtx insn)
8218 {
8219   rtx last;
8220   if (!queued_cfa_restores)
8221     return;
8222   for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
8223     ;
8224   XEXP (last, 1) = REG_NOTES (insn);
8225   REG_NOTES (insn) = queued_cfa_restores;
8226   queued_cfa_restores = NULL_RTX;
8227   RTX_FRAME_RELATED_P (insn) = 1;
8228 }
8229 
8230 /* Expand prologue or epilogue stack adjustment.
8231    The pattern exists to put a dependency on all ebp-based memory accesses.
8232    STYLE should be negative if instructions should be marked as frame related,
8233    zero if %r11 register is live and cannot be freely used and positive
8234    otherwise.  */
8235 
8236 static void
8237 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
8238 			   int style, bool set_cfa)
8239 {
8240   rtx insn;
8241 
8242   if (! TARGET_64BIT)
8243     insn = emit_insn (gen_pro_epilogue_adjust_stack_1 (dest, src, offset));
8244   else if (x86_64_immediate_operand (offset, DImode))
8245     insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64 (dest, src, offset));
8246   else
8247     {
8248       rtx r11;
8249       /* r11 is used by indirect sibcall return as well, set before the
8250 	 epilogue and used after the epilogue.  ATM indirect sibcall
8251 	 shouldn't be used together with huge frame sizes in one
8252 	 function because of the frame_size check in sibcall.c.  */
8253       gcc_assert (style);
8254       r11 = gen_rtx_REG (DImode, R11_REG);
8255       insn = emit_insn (gen_rtx_SET (DImode, r11, offset));
8256       if (style < 0)
8257 	RTX_FRAME_RELATED_P (insn) = 1;
8258       insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64_2 (dest, src, r11,
8259 							       offset));
8260     }
8261 
8262   if (style >= 0)
8263     ix86_add_queued_cfa_restore_notes (insn);
8264 
8265   if (set_cfa)
8266     {
8267       rtx r;
8268 
8269       gcc_assert (ix86_cfa_state->reg == src);
8270       ix86_cfa_state->offset += INTVAL (offset);
8271       ix86_cfa_state->reg = dest;
8272 
8273       r = gen_rtx_PLUS (Pmode, src, offset);
8274       r = gen_rtx_SET (VOIDmode, dest, r);
8275       add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
8276       RTX_FRAME_RELATED_P (insn) = 1;
8277     }
8278   else if (style < 0)
8279     RTX_FRAME_RELATED_P (insn) = 1;
8280 }
8281 
8282 /* Find an available register to be used as the dynamic realign argument
8283    pointer register.  Such a register will be written in the prologue and
8284    used at the beginning of the body, so it must not be
8285 	1. a parameter passing register.
8286 	2. the GOT pointer.
8287    We reuse the static-chain register if it is available.  Otherwise, we
8288    use DI for i386 and R13 for x86-64.  We chose R13 since it has a
8289    shorter encoding.
8290 
8291    Return: the regno of the chosen register.  */
8292 
8293 static unsigned int
8294 find_drap_reg (void)
8295 {
8296   tree decl = cfun->decl;
8297 
8298   if (TARGET_64BIT)
8299     {
8300       /* Use R13 for a nested function or a function that needs a static
8301 	 chain.  Since a function with a tail call may use any caller-saved
8302 	 register in the epilogue, DRAP must not use a caller-saved
8303 	 register in that case.  */
8304       if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
8305 	return R13_REG;
8306 
8307       return R10_REG;
8308     }
8309   else
8310     {
8311       /* Use DI for a nested function or a function that needs a static
8312 	 chain.  Since a function with a tail call may use any caller-saved
8313 	 register in the epilogue, DRAP must not use a caller-saved
8314 	 register in that case.  */
8315       if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
8316 	return DI_REG;
8317 
8318       /* Reuse static chain register if it isn't used for parameter
8319          passing.  */
8320       if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2
8321 	  && !lookup_attribute ("fastcall",
8322     				TYPE_ATTRIBUTES (TREE_TYPE (decl))))
8323 	return CX_REG;
8324       else
8325 	return DI_REG;
8326     }
8327 }
8328 
8329 /* Return minimum incoming stack alignment.  */
8330 
8331 static unsigned int
8332 ix86_minimum_incoming_stack_boundary (bool sibcall)
8333 {
8334   unsigned int incoming_stack_boundary;
8335 
8336   /* Prefer the one specified on the command line.  */
8337   if (ix86_user_incoming_stack_boundary)
8338     incoming_stack_boundary = ix86_user_incoming_stack_boundary;
8339   /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
8340      if -mstackrealign is used, this isn't a sibcall check, and the
8341      estimated stack alignment is 128 bits.  */
8342   else if (!sibcall
8343 	   && !TARGET_64BIT
8344 	   && ix86_force_align_arg_pointer
8345 	   && crtl->stack_alignment_estimated == 128)
8346     incoming_stack_boundary = MIN_STACK_BOUNDARY;
8347   else
8348     incoming_stack_boundary = ix86_default_incoming_stack_boundary;
8349 
8350   /* Incoming stack alignment can be changed on individual functions
8351      via force_align_arg_pointer attribute.  We use the smallest
8352      incoming stack boundary.  */
8353   if (incoming_stack_boundary > MIN_STACK_BOUNDARY
8354       && lookup_attribute (ix86_force_align_arg_pointer_string,
8355 			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
8356     incoming_stack_boundary = MIN_STACK_BOUNDARY;
8357 
8358   /* The incoming stack frame has to be aligned at least at
8359      parm_stack_boundary.  */
8360   if (incoming_stack_boundary < crtl->parm_stack_boundary)
8361     incoming_stack_boundary = crtl->parm_stack_boundary;
8362 
8363   /* The stack at the entry of main is aligned by the runtime.  We use the
8364      smallest incoming stack boundary.  */
8365   if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
8366       && DECL_NAME (current_function_decl)
8367       && MAIN_NAME_P (DECL_NAME (current_function_decl))
8368       && DECL_FILE_SCOPE_P (current_function_decl))
8369     incoming_stack_boundary = MAIN_STACK_BOUNDARY;
8370 
8371   return incoming_stack_boundary;
8372 }
8373 
8374 /* Update incoming stack boundary and estimated stack alignment.  */
8375 
8376 static void
8377 ix86_update_stack_boundary (void)
8378 {
8379   ix86_incoming_stack_boundary
8380     = ix86_minimum_incoming_stack_boundary (false);
8381 
8382   /* An x86_64 varargs function needs 16-byte stack alignment for the
8383      register save area.  */
8384   if (TARGET_64BIT
8385       && cfun->stdarg
8386       && crtl->stack_alignment_estimated < 128)
8387     crtl->stack_alignment_estimated = 128;
8388 }
8389 
8390 /* Handle the TARGET_GET_DRAP_RTX hook.  Return NULL if no DRAP is
8391    needed or an rtx for DRAP otherwise.  */
8392 
8393 static rtx
8394 ix86_get_drap_rtx (void)
8395 {
8396   if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
8397     crtl->need_drap = true;
8398 
8399   if (stack_realign_drap)
8400     {
8401       /* Assign DRAP to vDRAP and return vDRAP.  */
8402       unsigned int regno = find_drap_reg ();
8403       rtx drap_vreg;
8404       rtx arg_ptr;
8405       rtx seq, insn;
8406 
8407       arg_ptr = gen_rtx_REG (Pmode, regno);
8408       crtl->drap_reg = arg_ptr;
8409 
8410       start_sequence ();
8411       drap_vreg = copy_to_reg (arg_ptr);
8412       seq = get_insns ();
8413       end_sequence ();
8414 
8415       insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
8416       if (!optimize)
8417 	{
8418 	  add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
8419 	  RTX_FRAME_RELATED_P (insn) = 1;
8420 	}
8421       return drap_vreg;
8422     }
8423   else
8424     return NULL;
8425 }
8426 
8427 /* Handle the TARGET_INTERNAL_ARG_POINTER hook.  */
8428 
8429 static rtx
8430 ix86_internal_arg_pointer (void)
8431 {
8432   return virtual_incoming_args_rtx;
8433 }
8434 
8435 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
8436    to be generated in correct form.  */
8437 static void
8438 ix86_finalize_stack_realign_flags (void)
8439 {
8440   /* Check whether stack realignment is really needed after reload, and
8441      store the result in cfun.  */
8442   unsigned int incoming_stack_boundary
8443     = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
8444        ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
8445   unsigned int stack_realign = (incoming_stack_boundary
8446 				< (current_function_is_leaf
8447 				   ? crtl->max_used_stack_slot_alignment
8448 				   : crtl->stack_alignment_needed));
8449 
8450   if (crtl->stack_realign_finalized)
8451     {
8452       /* After stack_realign_needed is finalized, we can no longer
8453 	 change it.  */
8454       gcc_assert (crtl->stack_realign_needed == stack_realign);
8455     }
8456   else
8457     {
8458       crtl->stack_realign_needed = stack_realign;
8459       crtl->stack_realign_finalized = true;
8460     }
8461 }
8462 
8463 /* Expand the prologue into a bunch of separate insns.  */
8464 
8465 void
8466 ix86_expand_prologue (void)
8467 {
8468   rtx insn;
8469   bool pic_reg_used;
8470   struct ix86_frame frame;
8471   HOST_WIDE_INT allocate;
8472   int gen_frame_pointer = frame_pointer_needed;
8473 
8474   ix86_finalize_stack_realign_flags ();
8475 
8476   /* DRAP should not coexist with stack_realign_fp */
8477   gcc_assert (!(crtl->drap_reg && stack_realign_fp));
8478 
8479   /* Initialize CFA state for before the prologue.  */
8480   ix86_cfa_state->reg = stack_pointer_rtx;
8481   ix86_cfa_state->offset = INCOMING_FRAME_SP_OFFSET;
8482 
8483   ix86_compute_frame_layout (&frame);
8484 
8485   if (ix86_function_ms_hook_prologue (current_function_decl))
8486     {
8487       rtx push, mov;
8488 
8489       /* Make sure the function starts with
8490 	 8b ff     movl.s %edi,%edi
8491 	 55        push   %ebp
8492 	 8b ec     movl.s %esp,%ebp
8493 
8494 	 This matches the hookable function prologue in Win32 API
8495 	 functions in Microsoft Windows XP Service Pack 2 and newer.
8496 	 Wine uses this to enable Windows apps to hook the Win32 API
8497 	 functions provided by Wine.  */
8498       insn = emit_insn (gen_vswapmov (gen_rtx_REG (SImode, DI_REG),
8499 				      gen_rtx_REG (SImode, DI_REG)));
8500       push = emit_insn (gen_push (hard_frame_pointer_rtx));
8501       mov = emit_insn (gen_vswapmov (hard_frame_pointer_rtx,
8502 				     stack_pointer_rtx));
8503 
8504       if (frame_pointer_needed && !(crtl->drap_reg
8505 				    && crtl->stack_realign_needed))
8506 	{
8507 	  /* The push %ebp and movl.s %esp, %ebp already set up
8508 	     the frame pointer.  No need to do this again. */
8509 	  gen_frame_pointer = 0;
8510 	  RTX_FRAME_RELATED_P (push) = 1;
8511 	  RTX_FRAME_RELATED_P (mov) = 1;
8512 	  if (ix86_cfa_state->reg == stack_pointer_rtx)
8513 	    ix86_cfa_state->reg = hard_frame_pointer_rtx;
8514 	}
8515       else
8516 	/* If the frame pointer is not needed, pop %ebp again. This
8517 	   could be optimized for cases where ebp needs to be backed up
8518 	   for some other reason.  If stack realignment is needed, pop
8519 	   the base pointer again, align the stack, and later regenerate
8520 	   the frame pointer setup.  The frame pointer generated by the
8521 	   hook prologue is not aligned, so it can't be used.  */
8522 	insn = emit_insn ((*ix86_gen_pop1) (hard_frame_pointer_rtx));
8523     }
8524 
8525   /* The first insn of a function that accepts its static chain on the
8526      stack is to push the register that would be filled in by a direct
8527      call.  This insn will be skipped by the trampoline.  */
8528   if (ix86_static_chain_on_stack)
8529     {
8530       rtx t;
8531 
8532       insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
8533       emit_insn (gen_blockage ());
8534 
8535       /* We don't want to interpret this push insn as a register save,
8536 	 only as a stack adjustment.  The real copy of the register as
8537 	 a save will be done later, if needed.  */
8538       t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
8539       t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
8540       add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
8541       RTX_FRAME_RELATED_P (insn) = 1;
8542     }
8543 
8544   /* Emit prologue code to adjust stack alignment and set up DRAP, in case
8545      DRAP is needed and stack realignment is really needed after reload.  */
8546   if (crtl->drap_reg && crtl->stack_realign_needed)
8547     {
8548       rtx x, y;
8549       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
8550       int param_ptr_offset = UNITS_PER_WORD;
8551 
8552       if (ix86_static_chain_on_stack)
8553 	param_ptr_offset += UNITS_PER_WORD;
8554       if (!call_used_regs[REGNO (crtl->drap_reg)])
8555 	param_ptr_offset += UNITS_PER_WORD;
8556 
8557       gcc_assert (stack_realign_drap);
8558 
8559       /* Grab the argument pointer.  */
8560       x = plus_constant (stack_pointer_rtx, param_ptr_offset);
8561       y = crtl->drap_reg;
8562 
8563       /* Only need to push the parameter pointer reg if it is a
8564 	 caller-saved reg.  */
8565       if (!call_used_regs[REGNO (crtl->drap_reg)])
8566 	{
8567 	  /* Push arg pointer reg */
8568 	  insn = emit_insn (gen_push (y));
8569 	  RTX_FRAME_RELATED_P (insn) = 1;
8570 	}
8571 
8572       insn = emit_insn (gen_rtx_SET (VOIDmode, y, x));
8573       RTX_FRAME_RELATED_P (insn) = 1;
8574       ix86_cfa_state->reg = crtl->drap_reg;
8575 
8576       /* Align the stack.  */
8577       insn = emit_insn ((*ix86_gen_andsp) (stack_pointer_rtx,
8578 					   stack_pointer_rtx,
8579 					   GEN_INT (-align_bytes)));
8580       RTX_FRAME_RELATED_P (insn) = 1;
8581 
8582       /* Replicate the return address on the stack so that return
8583 	 address can be reached via (argp - 1) slot.  This is needed
8584 	 to implement macro RETURN_ADDR_RTX and intrinsic function
8585 	 expand_builtin_return_addr etc.  */
8586       x = crtl->drap_reg;
8587       x = gen_frame_mem (Pmode,
8588                          plus_constant (x, -UNITS_PER_WORD));
8589       insn = emit_insn (gen_push (x));
8590       RTX_FRAME_RELATED_P (insn) = 1;
8591     }
8592 
8593   /* Note: AT&T enter does NOT have reversed args.  Enter is probably
8594      slower on all targets.  Also sdb doesn't like it.  */
8595 
8596   if (gen_frame_pointer)
8597     {
8598       insn = emit_insn (gen_push (hard_frame_pointer_rtx));
8599       RTX_FRAME_RELATED_P (insn) = 1;
8600 
8601       insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
8602       RTX_FRAME_RELATED_P (insn) = 1;
8603 
8604       if (ix86_cfa_state->reg == stack_pointer_rtx)
8605         ix86_cfa_state->reg = hard_frame_pointer_rtx;
8606     }
8607 
8608   if (stack_realign_fp)
8609     {
8610       int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
8611       gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
8612 
8613       /* Align the stack.  */
8614       insn = emit_insn ((*ix86_gen_andsp) (stack_pointer_rtx,
8615 					   stack_pointer_rtx,
8616 					   GEN_INT (-align_bytes)));
8617       RTX_FRAME_RELATED_P (insn) = 1;
8618     }
8619 
8620   allocate = frame.to_allocate + frame.nsseregs * 16 + frame.padding0;
8621 
8622   if (!frame.save_regs_using_mov)
8623     ix86_emit_save_regs ();
8624   else
8625     allocate += frame.nregs * UNITS_PER_WORD;
8626 
8627   /* When using the red zone we may start register saving before allocating
8628      the stack frame, saving one cycle of the prologue.  However, we avoid
8629      doing this if we are going to have to probe the stack, since
8630      at least on x86_64 the stack probe can turn into a call that clobbers
8631      a red zone location.  */
8632   if (!TARGET_64BIT_MS_ABI && TARGET_RED_ZONE && frame.save_regs_using_mov
8633       && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT))
8634     ix86_emit_save_regs_using_mov ((frame_pointer_needed
8635 				     && !crtl->stack_realign_needed)
8636                                    ? hard_frame_pointer_rtx
8637 				   : stack_pointer_rtx,
8638 				   -frame.nregs * UNITS_PER_WORD);
8639 
8640   if (allocate == 0)
8641     ;
8642   else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
8643     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
8644 			       GEN_INT (-allocate), -1,
8645 			       ix86_cfa_state->reg == stack_pointer_rtx);
8646   else
8647     {
8648       rtx eax = gen_rtx_REG (Pmode, AX_REG);
8649       rtx r10 = NULL;
8650       bool eax_live = false;
8651       bool r10_live = false;
8652       rtx t;
8653 
8654       if (TARGET_64BIT)
8655         r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
8656       if (!TARGET_64BIT_MS_ABI)
8657         eax_live = ix86_eax_live_at_start_p ();
8658 
8659       if (eax_live)
8660 	{
8661 	  emit_insn (gen_push (eax));
8662 	  allocate -= UNITS_PER_WORD;
8663 	}
8664       if (r10_live)
8665        {
8666          r10 = gen_rtx_REG (Pmode, R10_REG);
8667          emit_insn (gen_push (r10));
8668          allocate -= UNITS_PER_WORD;
8669        }
8670 
8671       emit_move_insn (eax, GEN_INT (allocate));
8672 
8673       if (TARGET_64BIT)
8674 	insn = gen_allocate_stack_worker_64 (eax, eax);
8675       else
8676 	insn = gen_allocate_stack_worker_32 (eax, eax);
8677       insn = emit_insn (insn);
8678 
8679       if (ix86_cfa_state->reg == stack_pointer_rtx)
8680 	{
8681 	  ix86_cfa_state->offset += allocate;
8682 	  t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (-allocate));
8683 	  t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
8684 	  add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
8685 	  RTX_FRAME_RELATED_P (insn) = 1;
8686 	}
8687 
8688       if (eax_live && r10_live)
8689 	{
8690 	  if (frame_pointer_needed)
8691 	    {
8692 	      t = plus_constant (hard_frame_pointer_rtx,
8693 				 allocate
8694 				 - frame.to_allocate
8695 				 - frame.nregs * UNITS_PER_WORD);
8696 	      emit_move_insn (r10, gen_rtx_MEM (Pmode, t));
8697 	      t = plus_constant (hard_frame_pointer_rtx,
8698 				 allocate + UNITS_PER_WORD
8699 				 - frame.to_allocate
8700 				 - frame.nregs * UNITS_PER_WORD);
8701 	      emit_move_insn (eax, gen_rtx_MEM (Pmode, t));
8702 	    }
8703           else
8704 	    {
8705 	      t = plus_constant (stack_pointer_rtx, allocate);
8706 	      emit_move_insn (r10, gen_rtx_MEM (Pmode, t));
8707 	      t = plus_constant (stack_pointer_rtx, allocate + UNITS_PER_WORD);
8708 	      emit_move_insn (eax, gen_rtx_MEM (Pmode, t));
8709 	    }
8710 	}
8711       else if (eax_live || r10_live)
8712 	{
8713 	  if (frame_pointer_needed)
8714 	    t = plus_constant (hard_frame_pointer_rtx,
8715 			       allocate
8716 			       - frame.to_allocate
8717 			       - frame.nregs * UNITS_PER_WORD);
8718 	  else
8719 	    t = plus_constant (stack_pointer_rtx, allocate);
8720 	  emit_move_insn ((eax_live ? eax : r10), gen_rtx_MEM (Pmode, t));
8721 	}
8722     }
8723 
8724   if (frame.save_regs_using_mov
8725       && !(!TARGET_64BIT_MS_ABI && TARGET_RED_ZONE
8726          && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)))
8727     {
8728       if (!frame_pointer_needed
8729 	  || !(frame.to_allocate + frame.padding0)
8730 	  || crtl->stack_realign_needed)
8731         ix86_emit_save_regs_using_mov (stack_pointer_rtx,
8732 				       frame.to_allocate
8733 				       + frame.nsseregs * 16 + frame.padding0);
8734       else
8735         ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
8736 				       -frame.nregs * UNITS_PER_WORD);
8737     }
8738   if (!frame_pointer_needed
8739       || !(frame.to_allocate + frame.padding0)
8740       || crtl->stack_realign_needed)
8741     ix86_emit_save_sse_regs_using_mov (stack_pointer_rtx,
8742 				       frame.to_allocate);
8743   else
8744     ix86_emit_save_sse_regs_using_mov (hard_frame_pointer_rtx,
8745 				       - frame.nregs * UNITS_PER_WORD
8746 				       - frame.nsseregs * 16
8747 				       - frame.padding0);
8748 
8749   pic_reg_used = false;
8750   if (pic_offset_table_rtx
8751       && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8752 	  || crtl->profile))
8753     {
8754       unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
8755 
8756       if (alt_pic_reg_used != INVALID_REGNUM)
8757 	SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
8758 
8759       pic_reg_used = true;
8760     }
8761 
8762   if (pic_reg_used)
8763     {
8764       if (TARGET_64BIT)
8765 	{
8766 	  if (ix86_cmodel == CM_LARGE_PIC)
8767 	    {
8768               rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
8769 	      rtx label = gen_label_rtx ();
8770 	      emit_label (label);
8771 	      LABEL_PRESERVE_P (label) = 1;
8772 	      gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
8773 	      insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
8774 	      insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8775 	      insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
8776 					    pic_offset_table_rtx, tmp_reg));
8777 	    }
8778 	  else
8779             insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8780 	}
8781       else
8782         insn = emit_insn (gen_set_got (pic_offset_table_rtx));
8783     }
8784 
8785   /* In the pic_reg_used case, make sure that the got load isn't deleted
8786      when mcount needs it.  Blockage to avoid call movement across mcount
8787      call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
8788      note.  */
8789   if (crtl->profile && pic_reg_used)
8790     emit_insn (gen_prologue_use (pic_offset_table_rtx));
8791 
8792   if (crtl->drap_reg && !crtl->stack_realign_needed)
8793     {
8794       /* vDRAP is set up, but after reload it turns out stack realignment
8795          isn't necessary; here we emit prologue code to set up DRAP
8796          without the stack realignment adjustment.  */
8797       rtx x;
8798       int drap_bp_offset = UNITS_PER_WORD * 2;
8799 
8800       if (ix86_static_chain_on_stack)
8801 	drap_bp_offset += UNITS_PER_WORD;
8802       x = plus_constant (hard_frame_pointer_rtx, drap_bp_offset);
8803       insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, x));
8804     }
8805 
8806   /* Prevent instructions from being scheduled into register save push
8807      sequence when access to the redzone area is done through frame pointer.
8808      The offset between the frame pointer and the stack pointer is calculated
8809      relative to the value of the stack pointer at the end of the function
8810      prologue, and moving instructions that access redzone area via frame
8811      pointer inside push sequence violates this assumption.  */
8812   if (frame_pointer_needed && frame.red_zone_size)
8813     emit_insn (gen_memory_blockage ());
8814 
8815   /* Emit cld instruction if stringops are used in the function.  */
8816   if (TARGET_CLD && ix86_current_function_needs_cld)
8817     emit_insn (gen_cld ());
8818 }
8819 
8820 /* Emit code to restore REG using a POP insn.  */
8821 
8822 static void
8823 ix86_emit_restore_reg_using_pop (rtx reg, HOST_WIDE_INT red_offset)
8824 {
8825   rtx insn = emit_insn (ix86_gen_pop1 (reg));
8826 
8827   if (ix86_cfa_state->reg == crtl->drap_reg
8828       && REGNO (reg) == REGNO (crtl->drap_reg))
8829     {
8830       /* Previously we'd represented the CFA as an expression
8831 	 like *(%ebp - 8).  We've just popped that value from
8832 	 the stack, which means we need to reset the CFA to
8833 	 the drap register.  This will remain until we restore
8834 	 the stack pointer.  */
8835       add_reg_note (insn, REG_CFA_DEF_CFA, reg);
8836       RTX_FRAME_RELATED_P (insn) = 1;
8837       return;
8838     }
8839 
8840   if (ix86_cfa_state->reg == stack_pointer_rtx)
8841     {
8842       ix86_cfa_state->offset -= UNITS_PER_WORD;
8843       add_reg_note (insn, REG_CFA_ADJUST_CFA,
8844 		    copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
8845       RTX_FRAME_RELATED_P (insn) = 1;
8846     }
8847 
8848   /* When the frame pointer is the CFA, and we pop it, we are
8849      swapping back to the stack pointer as the CFA.  This happens
8850      for stack frames that don't allocate other data, so we assume
8851      the stack pointer is now pointing at the return address, i.e.
8852      the function entry state, which makes the offset be 1 word.  */
8853   else if (ix86_cfa_state->reg == hard_frame_pointer_rtx
8854 	   && reg == hard_frame_pointer_rtx)
8855     {
8856       ix86_cfa_state->reg = stack_pointer_rtx;
8857       ix86_cfa_state->offset -= UNITS_PER_WORD;
8858 
8859       add_reg_note (insn, REG_CFA_DEF_CFA,
8860 		    gen_rtx_PLUS (Pmode, stack_pointer_rtx,
8861 				  GEN_INT (ix86_cfa_state->offset)));
8862       RTX_FRAME_RELATED_P (insn) = 1;
8863     }
8864 
8865   ix86_add_cfa_restore_note (insn, reg, red_offset);
8866 }
8867 
8868 /* Emit code to restore saved registers using POP insns.  */
8869 
8870 static void
8871 ix86_emit_restore_regs_using_pop (HOST_WIDE_INT red_offset)
8872 {
8873   int regno;
8874 
8875   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8876     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
8877       {
8878 	ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno),
8879 					 red_offset);
8880 	red_offset += UNITS_PER_WORD;
8881       }
8882 }
8883 
8884 /* Emit code and notes for the LEAVE instruction.  */
8885 
8886 static void
8887 ix86_emit_leave (HOST_WIDE_INT red_offset)
8888 {
8889   rtx insn = emit_insn (ix86_gen_leave ());
8890 
8891   ix86_add_queued_cfa_restore_notes (insn);
8892 
8893   if (ix86_cfa_state->reg == hard_frame_pointer_rtx)
8894     {
8895       ix86_cfa_state->reg = stack_pointer_rtx;
8896       ix86_cfa_state->offset -= UNITS_PER_WORD;
8897 
8898       add_reg_note (insn, REG_CFA_ADJUST_CFA,
8899 		    copy_rtx (XVECEXP (PATTERN (insn), 0, 0)));
8900       RTX_FRAME_RELATED_P (insn) = 1;
8901       ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, red_offset);
8902     }
8903 }
8904 
8905 /* Emit code to restore saved registers using MOV insns.  First register
8906    is restored from POINTER + OFFSET.  */
8907 static void
8908 ix86_emit_restore_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
8909 				  HOST_WIDE_INT red_offset,
8910 				  int maybe_eh_return)
8911 {
8912   unsigned int regno;
8913   rtx base_address = gen_rtx_MEM (Pmode, pointer);
8914   rtx insn;
8915 
8916   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8917     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
8918       {
8919 	rtx reg = gen_rtx_REG (Pmode, regno);
8920 
8921 	/* Ensure that adjust_address won't be forced to produce pointer
8922 	   out of range allowed by x86-64 instruction set.  */
8923 	if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
8924 	  {
8925 	    rtx r11;
8926 
8927 	    r11 = gen_rtx_REG (DImode, R11_REG);
8928 	    emit_move_insn (r11, GEN_INT (offset));
8929 	    emit_insn (gen_adddi3 (r11, r11, pointer));
8930 	    base_address = gen_rtx_MEM (Pmode, r11);
8931 	    offset = 0;
8932 	  }
8933 	insn = emit_move_insn (reg,
8934 			       adjust_address (base_address, Pmode, offset));
8935 	offset += UNITS_PER_WORD;
8936 
8937         if (ix86_cfa_state->reg == crtl->drap_reg
8938 	    && regno == REGNO (crtl->drap_reg))
8939 	  {
8940 	    /* Previously we'd represented the CFA as an expression
8941 	       like *(%ebp - 8).  We've just popped that value from
8942 	       the stack, which means we need to reset the CFA to
8943 	       the drap register.  This will remain until we restore
8944 	       the stack pointer.  */
8945 	    add_reg_note (insn, REG_CFA_DEF_CFA, reg);
8946 	    RTX_FRAME_RELATED_P (insn) = 1;
8947 	  }
8948 	else
8949 	  ix86_add_cfa_restore_note (NULL_RTX, reg, red_offset);
8950 
8951 	red_offset += UNITS_PER_WORD;
8952       }
8953 }
8954 
8955 /* Emit code to restore saved SSE registers using MOV insns.  First register
8956    is restored from POINTER + OFFSET.  */
8957 static void
8958 ix86_emit_restore_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
8959 				      HOST_WIDE_INT red_offset,
8960 				      int maybe_eh_return)
8961 {
8962   int regno;
8963   rtx base_address = gen_rtx_MEM (TImode, pointer);
8964   rtx mem;
8965 
8966   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8967     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
8968       {
8969 	rtx reg = gen_rtx_REG (TImode, regno);
8970 
8971 	/* Ensure that adjust_address won't be forced to produce pointer
8972 	   out of range allowed by x86-64 instruction set.  */
8973 	if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
8974 	  {
8975 	    rtx r11;
8976 
8977 	    r11 = gen_rtx_REG (DImode, R11_REG);
8978 	    emit_move_insn (r11, GEN_INT (offset));
8979 	    emit_insn (gen_adddi3 (r11, r11, pointer));
8980 	    base_address = gen_rtx_MEM (TImode, r11);
8981 	    offset = 0;
8982 	  }
8983 	mem = adjust_address (base_address, TImode, offset);
8984 	set_mem_align (mem, 128);
8985 	emit_move_insn (reg, mem);
8986 	offset += 16;
8987 
8988 	ix86_add_cfa_restore_note (NULL_RTX, reg, red_offset);
8989 
8990 	red_offset += 16;
8991       }
8992 }
8993 
8994 /* Restore function stack, frame, and registers.  */
8995 
8996 void
8997 ix86_expand_epilogue (int style)
8998 {
8999   int sp_valid;
9000   struct ix86_frame frame;
9001   HOST_WIDE_INT offset, red_offset;
9002   struct machine_cfa_state cfa_state_save = *ix86_cfa_state;
9003   bool using_drap;
9004 
9005   ix86_finalize_stack_realign_flags ();
9006 
9007   /* When the stack is realigned, SP must be valid.  */
9008   sp_valid = (!frame_pointer_needed
9009 	      || current_function_sp_is_unchanging
9010 	      || stack_realign_fp);
9011 
9012   ix86_compute_frame_layout (&frame);
9013 
9014   /* See the comment about red zone and frame
9015      pointer usage in ix86_expand_prologue.  */
9016   if (frame_pointer_needed && frame.red_zone_size)
9017     emit_insn (gen_memory_blockage ());
9018 
9019   using_drap = crtl->drap_reg && crtl->stack_realign_needed;
9020   gcc_assert (!using_drap || ix86_cfa_state->reg == crtl->drap_reg);
9021 
9022   /* Calculate start of saved registers relative to ebp.  Special care
9023      must be taken for the normal return case of a function using
9024      eh_return: the eax and edx registers are marked as saved, but not
9025      restored along this path.  */
9026   offset = frame.nregs;
9027   if (crtl->calls_eh_return && style != 2)
9028     offset -= 2;
9029   offset *= -UNITS_PER_WORD;
9030   offset -= frame.nsseregs * 16 + frame.padding0;
9031 
9032   /* Calculate start of saved registers relative to esp on entry of the
9033      function.  When realigning stack, this needs to be the most negative
9034      value possible at runtime.  */
9035   red_offset = offset;
9036   if (using_drap)
9037     red_offset -= crtl->stack_alignment_needed / BITS_PER_UNIT
9038 		  + UNITS_PER_WORD;
9039   else if (stack_realign_fp)
9040     red_offset -= crtl->stack_alignment_needed / BITS_PER_UNIT
9041 		  - UNITS_PER_WORD;
9042   if (ix86_static_chain_on_stack)
9043     red_offset -= UNITS_PER_WORD;
9044   if (frame_pointer_needed)
9045     red_offset -= UNITS_PER_WORD;
9046 
9047   /* If we're only restoring one register and sp is not valid then
9048      use a move instruction to restore the register, since it's
9049      less work than reloading sp and popping the register.
9050 
9051      The default code results in a stack adjustment using an add/lea
9052      instruction, while this code results in a LEAVE instruction (or discrete
9053      equivalent), so it is profitable in some other cases as well, especially
9054      when there are no registers to restore.  We also use this code when
9055      TARGET_USE_LEAVE and there is exactly one register to pop.  This
9056      heuristic may need some tuning in future.  */
9057   if ((!sp_valid && (frame.nregs + frame.nsseregs) <= 1)
9058       || (TARGET_EPILOGUE_USING_MOVE
9059 	  && cfun->machine->use_fast_prologue_epilogue
9060 	  && ((frame.nregs + frame.nsseregs) > 1
9061 	      || (frame.to_allocate + frame.padding0) != 0))
9062       || (frame_pointer_needed && !(frame.nregs + frame.nsseregs)
9063 	  && (frame.to_allocate + frame.padding0) != 0)
9064       || (frame_pointer_needed && TARGET_USE_LEAVE
9065 	  && cfun->machine->use_fast_prologue_epilogue
9066 	  && (frame.nregs + frame.nsseregs) == 1)
9067       || crtl->calls_eh_return)
9068     {
9069       /* Restore registers.  We can use ebp or esp to address the memory
9070 	 locations.  If both are available, default to ebp, since offsets
9071 	 are known to be small.  The only exception is esp pointing directly
9072 	 to the end of the block of saved registers, where we may simplify
9073 	 the addressing mode.
9074 
9075 	 If we are realigning the stack with bp and sp, the register restores
9076 	 can't be addressed by bp; sp must be used instead.  */
9077 
9078       if (!frame_pointer_needed
9079 	  || (sp_valid && !(frame.to_allocate + frame.padding0))
9080 	  || stack_realign_fp)
9081 	{
9082 	  ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
9083 						frame.to_allocate, red_offset,
9084 						style == 2);
9085 	  ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
9086 					    frame.to_allocate
9087 					    + frame.nsseregs * 16
9088 					    + frame.padding0,
9089 					    red_offset
9090 					    + frame.nsseregs * 16
9091 					    + frame.padding0, style == 2);
9092 	}
9093       else
9094         {
9095 	  ix86_emit_restore_sse_regs_using_mov (hard_frame_pointer_rtx,
9096 						offset, red_offset,
9097 						style == 2);
9098 	  ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
9099 					    offset
9100 					    + frame.nsseregs * 16
9101 					    + frame.padding0,
9102 					    red_offset
9103 					    + frame.nsseregs * 16
9104 					    + frame.padding0, style == 2);
9105         }
9106 
9107       red_offset -= offset;
9108 
9109       /* eh_return epilogues need %ecx added to the stack pointer.  */
9110       if (style == 2)
9111 	{
9112 	  rtx tmp, sa = EH_RETURN_STACKADJ_RTX;
9113 
9114 	  /* Stack align doesn't work with eh_return.  */
9115 	  gcc_assert (!crtl->stack_realign_needed);
9116 	  /* Neither do regparm nested functions.  */
9117 	  gcc_assert (!ix86_static_chain_on_stack);
9118 
9119 	  if (frame_pointer_needed)
9120 	    {
9121 	      tmp = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
9122 	      tmp = plus_constant (tmp, UNITS_PER_WORD);
9123 	      tmp = emit_insn (gen_rtx_SET (VOIDmode, sa, tmp));
9124 
9125 	      tmp = gen_rtx_MEM (Pmode, hard_frame_pointer_rtx);
9126 	      tmp = emit_move_insn (hard_frame_pointer_rtx, tmp);
9127 
9128 	      /* Note that we use SA as a temporary CFA, as the return
9129 		 address is at the proper place relative to it.  We
9130 		 pretend this happens at the FP restore insn because
9131 		 prior to this insn the FP would be stored at the wrong
9132 		 offset relative to SA, and after this insn we have no
9133 		 other reasonable register to use for the CFA.  We don't
9134 		 bother resetting the CFA to the SP for the duration of
9135 		 the return insn.  */
9136 	      add_reg_note (tmp, REG_CFA_DEF_CFA,
9137 			    plus_constant (sa, UNITS_PER_WORD));
9138 	      ix86_add_queued_cfa_restore_notes (tmp);
9139 	      add_reg_note (tmp, REG_CFA_RESTORE, hard_frame_pointer_rtx);
9140 	      RTX_FRAME_RELATED_P (tmp) = 1;
9141 	      ix86_cfa_state->reg = sa;
9142 	      ix86_cfa_state->offset = UNITS_PER_WORD;
9143 
9144 	      pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
9145 					 const0_rtx, style, false);
9146 	    }
9147 	  else
9148 	    {
9149 	      tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
9150 	      tmp = plus_constant (tmp, (frame.to_allocate
9151                                          + frame.nregs * UNITS_PER_WORD
9152 					 + frame.nsseregs * 16
9153 					 + frame.padding0));
9154 	      tmp = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
9155 	      ix86_add_queued_cfa_restore_notes (tmp);
9156 
9157 	      gcc_assert (ix86_cfa_state->reg == stack_pointer_rtx);
9158 	      if (ix86_cfa_state->offset != UNITS_PER_WORD)
9159 		{
9160 		  ix86_cfa_state->offset = UNITS_PER_WORD;
9161 		  add_reg_note (tmp, REG_CFA_DEF_CFA,
9162 				plus_constant (stack_pointer_rtx,
9163 					       UNITS_PER_WORD));
9164 		  RTX_FRAME_RELATED_P (tmp) = 1;
9165 		}
9166 	    }
9167 	}
9168       else if (!frame_pointer_needed)
9169 	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
9170 				   GEN_INT (frame.to_allocate
9171 					    + frame.nregs * UNITS_PER_WORD
9172 					    + frame.nsseregs * 16
9173 					    + frame.padding0),
9174 				   style, !using_drap);
9175       /* On CPUs other than the original i386, mov & pop is faster than "leave".  */
9176       else if (TARGET_USE_LEAVE || optimize_function_for_size_p (cfun)
9177 	       || !cfun->machine->use_fast_prologue_epilogue)
9178 	ix86_emit_leave (red_offset);
9179       else
9180 	{
9181 	  pro_epilogue_adjust_stack (stack_pointer_rtx,
9182 				     hard_frame_pointer_rtx,
9183 				     const0_rtx, style, !using_drap);
9184 
9185 	  ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx, red_offset);
9186 	}
9187     }
9188   else
9189     {
9190       /* The first step is to deallocate the stack frame so that we can
9191 	 pop the registers.
9192 
9193 	 If we realign the stack with the frame pointer, the stack pointer
9194          cannot be recovered via lea $offset(%bp), %sp, because there is
9195          a padding area between bp and sp for the realignment.
9196          "add $to_allocate, %sp" must be used instead.  */
9197       if (!sp_valid)
9198 	{
9199 	  gcc_assert (frame_pointer_needed);
9200           gcc_assert (!stack_realign_fp);
9201 	  pro_epilogue_adjust_stack (stack_pointer_rtx,
9202 				     hard_frame_pointer_rtx,
9203 				     GEN_INT (offset), style, false);
9204           ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
9205 						0, red_offset,
9206 						style == 2);
9207 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
9208 				     GEN_INT (frame.nsseregs * 16
9209 					      + frame.padding0),
9210 				     style, false);
9211 	}
9212       else if (frame.to_allocate || frame.padding0 || frame.nsseregs)
9213 	{
9214           ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
9215 						frame.to_allocate, red_offset,
9216 						style == 2);
9217 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
9218 				     GEN_INT (frame.to_allocate
9219 				     	      + frame.nsseregs * 16
9220 					      + frame.padding0), style,
9221 				     !using_drap && !frame_pointer_needed);
9222 	}
9223 
9224       ix86_emit_restore_regs_using_pop (red_offset + frame.nsseregs * 16
9225 					+ frame.padding0);
9226       red_offset -= offset;
9227 
9228       if (frame_pointer_needed)
9229 	{
9230 	  /* Leave results in shorter dependency chains on CPUs that are
9231 	     able to grok it fast.  */
9232 	  if (TARGET_USE_LEAVE)
9233 	    ix86_emit_leave (red_offset);
9234 	  else
9235             {
9236               /* When stack realignment really happens, the stack
9237                  pointer must be recovered from the hard frame pointer
9238                  if we are not using leave.  */
9239               if (stack_realign_fp)
9240 		pro_epilogue_adjust_stack (stack_pointer_rtx,
9241 					   hard_frame_pointer_rtx,
9242 					   const0_rtx, style, !using_drap);
9243 	      ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx,
9244 					       red_offset);
9245             }
9246 	}
9247     }
9248 
9249   if (using_drap)
9250     {
9251       int param_ptr_offset = UNITS_PER_WORD;
9252       rtx insn;
9253 
9254       gcc_assert (stack_realign_drap);
9255 
9256       if (ix86_static_chain_on_stack)
9257 	param_ptr_offset += UNITS_PER_WORD;
9258       if (!call_used_regs[REGNO (crtl->drap_reg)])
9259 	param_ptr_offset += UNITS_PER_WORD;
9260 
9261       insn = emit_insn (gen_rtx_SET
9262 			(VOIDmode, stack_pointer_rtx,
9263 			 gen_rtx_PLUS (Pmode,
9264 				       crtl->drap_reg,
9265 				       GEN_INT (-param_ptr_offset))));
9266 
9267       ix86_cfa_state->reg = stack_pointer_rtx;
9268       ix86_cfa_state->offset = param_ptr_offset;
9269 
9270       add_reg_note (insn, REG_CFA_DEF_CFA,
9271 		    gen_rtx_PLUS (Pmode, ix86_cfa_state->reg,
9272 				  GEN_INT (ix86_cfa_state->offset)));
9273       RTX_FRAME_RELATED_P (insn) = 1;
9274 
9275       if (!call_used_regs[REGNO (crtl->drap_reg)])
9276 	ix86_emit_restore_reg_using_pop (crtl->drap_reg, -UNITS_PER_WORD);
9277     }
9278 
9279   /* Remove the saved static chain from the stack.  The use of ECX is
9280      merely as a scratch register, not as the actual static chain.  */
9281   if (ix86_static_chain_on_stack)
9282     {
9283       rtx r, insn;
9284 
9285       gcc_assert (ix86_cfa_state->reg == stack_pointer_rtx);
9286       ix86_cfa_state->offset += UNITS_PER_WORD;
9287 
9288       r = gen_rtx_REG (Pmode, CX_REG);
9289       insn = emit_insn (ix86_gen_pop1 (r));
9290 
9291       r = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
9292       r = gen_rtx_SET (VOIDmode, stack_pointer_rtx, r);
9293       add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9294       RTX_FRAME_RELATED_P (insn) = 1;
9295     }
9296 
9297   /* Sibcall epilogues don't want a return instruction.  */
9298   if (style == 0)
9299     {
9300       *ix86_cfa_state = cfa_state_save;
9301       return;
9302     }
9303 
9304   if (crtl->args.pops_args && crtl->args.size)
9305     {
9306       rtx popc = GEN_INT (crtl->args.pops_args);
9307 
9308       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
9309 	 address, do an explicit add, and jump indirectly to the caller.  */
9310 
9311       if (crtl->args.pops_args >= 65536)
9312 	{
9313 	  rtx ecx = gen_rtx_REG (SImode, CX_REG);
9314 	  rtx insn;
9315 
9316 	  /* There is no "pascal" calling convention in any 64bit ABI.  */
9317 	  gcc_assert (!TARGET_64BIT);
9318 
9319 	  insn = emit_insn (gen_popsi1 (ecx));
9320 	  ix86_cfa_state->offset -= UNITS_PER_WORD;
9321 
9322 	  add_reg_note (insn, REG_CFA_ADJUST_CFA,
9323 			copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
9324 	  add_reg_note (insn, REG_CFA_REGISTER,
9325 			gen_rtx_SET (VOIDmode, ecx, pc_rtx));
9326 	  RTX_FRAME_RELATED_P (insn) = 1;
9327 
9328 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
9329 				     popc, -1, true);
9330 	  emit_jump_insn (gen_return_indirect_internal (ecx));
9331 	}
9332       else
9333 	emit_jump_insn (gen_return_pop_internal (popc));
9334     }
9335   else
9336     emit_jump_insn (gen_return_internal ());
9337 
9338   /* Restore the state back to the state from the prologue,
9339      so that it's correct for the next epilogue.  */
9340   *ix86_cfa_state = cfa_state_save;
9341 }
9342 
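/* Illustrative sketch only (hypothetical argument sizes, not emitted by
   the code above): for a 32-bit stdcall-style function where
   crtl->args.pops_args is, say, 8, the epilogue normally ends in a single

       ret $8

   (gen_return_pop_internal).  Only when more than 64K bytes of arguments
   must be popped does it fall back to a sequence like

       popl %ecx          # pop the return address
       addl $N, %esp      # explicitly discard the N argument bytes
       jmp  *%ecx         # return indirectly to the caller

   and a function that pops no arguments ends in a plain ret
   (gen_return_internal).  */
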
9343 /* Reset from the function's potential modifications.  */
9344 
9345 static void
9346 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
9347 			       HOST_WIDE_INT size ATTRIBUTE_UNUSED)
9348 {
9349   if (pic_offset_table_rtx)
9350     SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
9351 #if TARGET_MACHO
9352   /* Mach-O doesn't support labels at the end of objects, so if
9353      it looks like we might want one, insert a NOP.  */
9354   {
9355     rtx insn = get_last_insn ();
9356     while (insn
9357 	   && NOTE_P (insn)
9358 	   && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
9359       insn = PREV_INSN (insn);
9360     if (insn
9361 	&& (LABEL_P (insn)
9362 	    || (NOTE_P (insn)
9363 		&& NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
9364       fputs ("\tnop\n", file);
9365   }
9366 #endif
9367 
9368 }
9369 
9370 /* Extract the parts of an RTL expression that is a valid memory address
9371    for an instruction.  Return 0 if the structure of the address is
9372    grossly off.  Return -1 if the address contains ASHIFT, so it is not
9373    strictly valid, but is still used for computing the length of lea.  */
9374 
9375 int
9376 ix86_decompose_address (rtx addr, struct ix86_address *out)
9377 {
9378   rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
9379   rtx base_reg, index_reg;
9380   HOST_WIDE_INT scale = 1;
9381   rtx scale_rtx = NULL_RTX;
9382   int retval = 1;
9383   enum ix86_address_seg seg = SEG_DEFAULT;
9384 
9385   if (REG_P (addr) || GET_CODE (addr) == SUBREG)
9386     base = addr;
9387   else if (GET_CODE (addr) == PLUS)
9388     {
9389       rtx addends[4], op;
9390       int n = 0, i;
9391 
9392       op = addr;
9393       do
9394 	{
9395 	  if (n >= 4)
9396 	    return 0;
9397 	  addends[n++] = XEXP (op, 1);
9398 	  op = XEXP (op, 0);
9399 	}
9400       while (GET_CODE (op) == PLUS);
9401       if (n >= 4)
9402 	return 0;
9403       addends[n] = op;
9404 
9405       for (i = n; i >= 0; --i)
9406 	{
9407 	  op = addends[i];
9408 	  switch (GET_CODE (op))
9409 	    {
9410 	    case MULT:
9411 	      if (index)
9412 		return 0;
9413 	      index = XEXP (op, 0);
9414 	      scale_rtx = XEXP (op, 1);
9415 	      break;
9416 
9417 	    case UNSPEC:
9418 	      if (XINT (op, 1) == UNSPEC_TP
9419 	          && TARGET_TLS_DIRECT_SEG_REFS
9420 	          && seg == SEG_DEFAULT)
9421 		seg = TARGET_64BIT ? SEG_FS : SEG_GS;
9422 	      else
9423 		return 0;
9424 	      break;
9425 
9426 	    case REG:
9427 	    case SUBREG:
9428 	      if (!base)
9429 		base = op;
9430 	      else if (!index)
9431 		index = op;
9432 	      else
9433 		return 0;
9434 	      break;
9435 
9436 	    case CONST:
9437 	    case CONST_INT:
9438 	    case SYMBOL_REF:
9439 	    case LABEL_REF:
9440 	      if (disp)
9441 		return 0;
9442 	      disp = op;
9443 	      break;
9444 
9445 	    default:
9446 	      return 0;
9447 	    }
9448 	}
9449     }
9450   else if (GET_CODE (addr) == MULT)
9451     {
9452       index = XEXP (addr, 0);		/* index*scale */
9453       scale_rtx = XEXP (addr, 1);
9454     }
9455   else if (GET_CODE (addr) == ASHIFT)
9456     {
9457       rtx tmp;
9458 
9459       /* We're called for lea too, which implements ashift on occasion.  */
9460       index = XEXP (addr, 0);
9461       tmp = XEXP (addr, 1);
9462       if (!CONST_INT_P (tmp))
9463 	return 0;
9464       scale = INTVAL (tmp);
9465       if ((unsigned HOST_WIDE_INT) scale > 3)
9466 	return 0;
9467       scale = 1 << scale;
9468       retval = -1;
9469     }
9470   else
9471     disp = addr;			/* displacement */
9472 
9473   /* Extract the integral value of scale.  */
9474   if (scale_rtx)
9475     {
9476       if (!CONST_INT_P (scale_rtx))
9477 	return 0;
9478       scale = INTVAL (scale_rtx);
9479     }
9480 
9481   base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
9482   index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
9483 
9484   /* Avoid useless 0 displacement.  */
9485   if (disp == const0_rtx && (base || index))
9486     disp = NULL_RTX;
9487 
9488   /* Allow arg pointer and stack pointer as index if there is no scaling.  */
9489   if (base_reg && index_reg && scale == 1
9490       && (index_reg == arg_pointer_rtx
9491 	  || index_reg == frame_pointer_rtx
9492 	  || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
9493     {
9494       rtx tmp;
9495       tmp = base, base = index, index = tmp;
9496       tmp = base_reg, base_reg = index_reg, index_reg = tmp;
9497     }
9498 
9499   /* Special case: %ebp cannot be encoded as a base without a displacement.
9500      Similarly %r13.  */
9501   if (!disp
9502       && base_reg
9503       && (base_reg == hard_frame_pointer_rtx
9504 	  || base_reg == frame_pointer_rtx
9505 	  || base_reg == arg_pointer_rtx
9506 	  || (REG_P (base_reg)
9507 	      && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
9508 		  || REGNO (base_reg) == R13_REG))))
9509     disp = const0_rtx;
9510 
9511   /* Special case: on K6, [%esi] forces the instruction to be vector
9512      decoded.  Avoid this by transforming it to [%esi+0].
9513      Reload calls address legitimization without cfun defined, so we need
9514      to test cfun for being non-NULL.  */
9515   if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
9516       && base_reg && !index_reg && !disp
9517       && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
9518     disp = const0_rtx;
9519 
9520   /* Special case: encode reg+reg instead of reg*2.  */
9521   if (!base && index && scale == 2)
9522     base = index, base_reg = index_reg, scale = 1;
9523 
9524   /* Special case: scaling cannot be encoded without base or displacement.  */
9525   if (!base && !disp && index && scale != 1)
9526     disp = const0_rtx;
9527 
9528   out->base = base;
9529   out->index = index;
9530   out->disp = disp;
9531   out->scale = scale;
9532   out->seg = seg;
9533 
9534   return retval;
9535 }
9536 
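/* A worked example (illustrative only, with hypothetical registers):
   the canonical RTX for the address 12(%ecx,%edx,4), i.e.

       (plus (plus (mult (reg %edx) (const_int 4))
                   (reg %ecx))
             (const_int 12))

   decomposes into out->base = %ecx, out->index = %edx, out->scale = 4,
   out->disp = (const_int 12), out->seg = SEG_DEFAULT, and returns 1.
   A bare (ashift (reg) (const_int 2)) address, as seen when computing
   lea lengths, is treated as index*4 but makes the function return -1.  */
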
9537 /* Return the cost of the memory address x.
9538    For i386, it is better to use a complex address than let gcc copy
9539    the address into a reg and make a new pseudo.  But not if the address
9540    requires two regs - that would mean more pseudos with longer
9541    lifetimes.  */
9542 static int
9543 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
9544 {
9545   struct ix86_address parts;
9546   int cost = 1;
9547   int ok = ix86_decompose_address (x, &parts);
9548 
9549   gcc_assert (ok);
9550 
9551   if (parts.base && GET_CODE (parts.base) == SUBREG)
9552     parts.base = SUBREG_REG (parts.base);
9553   if (parts.index && GET_CODE (parts.index) == SUBREG)
9554     parts.index = SUBREG_REG (parts.index);
9555 
9556   /* Attempt to minimize number of registers in the address.  */
9557   if ((parts.base
9558        && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
9559       || (parts.index
9560 	  && (!REG_P (parts.index)
9561 	      || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
9562     cost++;
9563 
9564   if (parts.base
9565       && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
9566       && parts.index
9567       && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
9568       && parts.base != parts.index)
9569     cost++;
9570 
9571   /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
9572      since its predecode logic can't detect the length of instructions
9573      and decoding degenerates to vector decoded.  Increase the cost of such
9574      addresses here.  The penalty is at least 2 cycles.  It may be worthwhile
9575      to split such addresses or even refuse such addresses at all.
9576 
9577      The following addressing modes are affected:
9578       [base+scale*index]
9579       [scale*index+disp]
9580       [base+index]
9581 
9582      The first and last cases may be avoidable by explicitly coding the zero
9583      into the memory address, but I don't have an AMD-K6 machine handy to
9584      check this theory.  */
9585 
9586   if (TARGET_K6
9587       && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
9588 	  || (parts.disp && !parts.base && parts.index && parts.scale != 1)
9589 	  || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
9590     cost += 10;
9591 
9592   return cost;
9593 }
9594 
9595 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
9596    this is used to form addresses to local data when -fPIC is in
9597    use.  */
9598 
9599 static bool
9600 darwin_local_data_pic (rtx disp)
9601 {
9602   return (GET_CODE (disp) == UNSPEC
9603 	  && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
9604 }
9605 
9606 /* Determine if a given RTX is a valid constant.  We already know this
9607    satisfies CONSTANT_P.  */
9608 
9609 bool
9610 legitimate_constant_p (rtx x)
9611 {
9612   switch (GET_CODE (x))
9613     {
9614     case CONST:
9615       x = XEXP (x, 0);
9616 
9617       if (GET_CODE (x) == PLUS)
9618 	{
9619 	  if (!CONST_INT_P (XEXP (x, 1)))
9620 	    return false;
9621 	  x = XEXP (x, 0);
9622 	}
9623 
9624       if (TARGET_MACHO && darwin_local_data_pic (x))
9625 	return true;
9626 
9627       /* Only some unspecs are valid as "constants".  */
9628       if (GET_CODE (x) == UNSPEC)
9629 	switch (XINT (x, 1))
9630 	  {
9631 	  case UNSPEC_GOT:
9632 	  case UNSPEC_GOTOFF:
9633 	  case UNSPEC_PLTOFF:
9634 	    return TARGET_64BIT;
9635 	  case UNSPEC_TPOFF:
9636 	  case UNSPEC_NTPOFF:
9637 	    x = XVECEXP (x, 0, 0);
9638 	    return (GET_CODE (x) == SYMBOL_REF
9639 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
9640 	  case UNSPEC_DTPOFF:
9641 	    x = XVECEXP (x, 0, 0);
9642 	    return (GET_CODE (x) == SYMBOL_REF
9643 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
9644 	  default:
9645 	    return false;
9646 	  }
9647 
9648       /* We must have drilled down to a symbol.  */
9649       if (GET_CODE (x) == LABEL_REF)
9650 	return true;
9651       if (GET_CODE (x) != SYMBOL_REF)
9652 	return false;
9653       /* FALLTHRU */
9654 
9655     case SYMBOL_REF:
9656       /* TLS symbols are never valid.  */
9657       if (SYMBOL_REF_TLS_MODEL (x))
9658 	return false;
9659 
9660       /* DLLIMPORT symbols are never valid.  */
9661       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
9662 	  && SYMBOL_REF_DLLIMPORT_P (x))
9663 	return false;
9664       break;
9665 
9666     case CONST_DOUBLE:
9667       if (GET_MODE (x) == TImode
9668 	  && x != CONST0_RTX (TImode)
9669           && !TARGET_64BIT)
9670 	return false;
9671       break;
9672 
9673     case CONST_VECTOR:
9674       if (!standard_sse_constant_p (x))
9675 	return false;
9676 
9677     default:
9678       break;
9679     }
9680 
9681   /* Otherwise we handle everything else in the move patterns.  */
9682   return true;
9683 }
9684 
9685 /* Determine if it's legal to put X into the constant pool.  This
9686    is not possible for the address of thread-local symbols, which
9687    is checked above.  */
9688 
9689 static bool
9690 ix86_cannot_force_const_mem (rtx x)
9691 {
9692   /* We can always put integral constants and vectors in memory.  */
9693   switch (GET_CODE (x))
9694     {
9695     case CONST_INT:
9696     case CONST_DOUBLE:
9697     case CONST_VECTOR:
9698       return false;
9699 
9700     default:
9701       break;
9702     }
9703   return !legitimate_constant_p (x);
9704 }
9705 
9706 
9707 /* Nonzero if the constant value X is a legitimate general operand
9708    when generating PIC code.  It is given that flag_pic is on and
9709    that X satisfies CONSTANT_P or is a CONST_DOUBLE.  */
9710 
9711 bool
9712 legitimate_pic_operand_p (rtx x)
9713 {
9714   rtx inner;
9715 
9716   switch (GET_CODE (x))
9717     {
9718     case CONST:
9719       inner = XEXP (x, 0);
9720       if (GET_CODE (inner) == PLUS
9721 	  && CONST_INT_P (XEXP (inner, 1)))
9722 	inner = XEXP (inner, 0);
9723 
9724       /* Only some unspecs are valid as "constants".  */
9725       if (GET_CODE (inner) == UNSPEC)
9726 	switch (XINT (inner, 1))
9727 	  {
9728 	  case UNSPEC_GOT:
9729 	  case UNSPEC_GOTOFF:
9730 	  case UNSPEC_PLTOFF:
9731 	    return TARGET_64BIT;
9732 	  case UNSPEC_TPOFF:
9733 	    x = XVECEXP (inner, 0, 0);
9734 	    return (GET_CODE (x) == SYMBOL_REF
9735 		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
9736 	  case UNSPEC_MACHOPIC_OFFSET:
9737 	    return legitimate_pic_address_disp_p (x);
9738 	  default:
9739 	    return false;
9740 	  }
9741       /* FALLTHRU */
9742 
9743     case SYMBOL_REF:
9744     case LABEL_REF:
9745       return legitimate_pic_address_disp_p (x);
9746 
9747     default:
9748       return true;
9749     }
9750 }
9751 
9752 /* Determine if a given CONST RTX is a valid memory displacement
9753    in PIC mode.  */
9754 
9755 int
9756 legitimate_pic_address_disp_p (rtx disp)
9757 {
9758   bool saw_plus;
9759 
9760   /* In 64bit mode we can allow direct addresses of symbols and labels
9761      when they are not dynamic symbols.  */
9762   if (TARGET_64BIT)
9763     {
9764       rtx op0 = disp, op1;
9765 
9766       switch (GET_CODE (disp))
9767 	{
9768 	case LABEL_REF:
9769 	  return true;
9770 
9771 	case CONST:
9772 	  if (GET_CODE (XEXP (disp, 0)) != PLUS)
9773 	    break;
9774 	  op0 = XEXP (XEXP (disp, 0), 0);
9775 	  op1 = XEXP (XEXP (disp, 0), 1);
9776 	  if (!CONST_INT_P (op1)
9777 	      || INTVAL (op1) >= 16*1024*1024
9778 	      || INTVAL (op1) < -16*1024*1024)
9779             break;
9780 	  if (GET_CODE (op0) == LABEL_REF)
9781 	    return true;
9782 	  if (GET_CODE (op0) != SYMBOL_REF)
9783 	    break;
9784 	  /* FALLTHRU */
9785 
9786 	case SYMBOL_REF:
9787 	  /* TLS references should always be enclosed in UNSPEC.  */
9788 	  if (SYMBOL_REF_TLS_MODEL (op0))
9789 	    return false;
9790 	  if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
9791 	      && ix86_cmodel != CM_LARGE_PIC)
9792 	    return true;
9793 	  break;
9794 
9795 	default:
9796 	  break;
9797 	}
9798     }
9799   if (GET_CODE (disp) != CONST)
9800     return 0;
9801   disp = XEXP (disp, 0);
9802 
9803   if (TARGET_64BIT)
9804     {
9805       /* It is unsafe to allow PLUS expressions here; this would limit the
9806          allowed distance of GOT tables.  We should not need these anyway.  */
9807       if (GET_CODE (disp) != UNSPEC
9808 	  || (XINT (disp, 1) != UNSPEC_GOTPCREL
9809 	      && XINT (disp, 1) != UNSPEC_GOTOFF
9810 	      && XINT (disp, 1) != UNSPEC_PLTOFF))
9811 	return 0;
9812 
9813       if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
9814 	  && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
9815 	return 0;
9816       return 1;
9817     }
9818 
9819   saw_plus = false;
9820   if (GET_CODE (disp) == PLUS)
9821     {
9822       if (!CONST_INT_P (XEXP (disp, 1)))
9823 	return 0;
9824       disp = XEXP (disp, 0);
9825       saw_plus = true;
9826     }
9827 
9828   if (TARGET_MACHO && darwin_local_data_pic (disp))
9829     return 1;
9830 
9831   if (GET_CODE (disp) != UNSPEC)
9832     return 0;
9833 
9834   switch (XINT (disp, 1))
9835     {
9836     case UNSPEC_GOT:
9837       if (saw_plus)
9838 	return false;
9839       /* We need to check for both symbols and labels because VxWorks loads
9840 	 text labels with @GOT rather than @GOTOFF.  See gotoff_operand for
9841 	 details.  */
9842       return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
9843 	      || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
9844     case UNSPEC_GOTOFF:
9845       /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
9846 	 While the ABI also specifies a 32bit relocation, we don't produce
9847 	 it in the small PIC model at all.  */
9848       if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
9849 	   || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
9850 	  && !TARGET_64BIT)
9851         return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
9852       return false;
9853     case UNSPEC_GOTTPOFF:
9854     case UNSPEC_GOTNTPOFF:
9855     case UNSPEC_INDNTPOFF:
9856       if (saw_plus)
9857 	return false;
9858       disp = XVECEXP (disp, 0, 0);
9859       return (GET_CODE (disp) == SYMBOL_REF
9860 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
9861     case UNSPEC_NTPOFF:
9862       disp = XVECEXP (disp, 0, 0);
9863       return (GET_CODE (disp) == SYMBOL_REF
9864 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
9865     case UNSPEC_DTPOFF:
9866       disp = XVECEXP (disp, 0, 0);
9867       return (GET_CODE (disp) == SYMBOL_REF
9868 	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
9869     }
9870 
9871   return 0;
9872 }
9873 
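/* For illustration (a sketch with a hypothetical symbol, not taken from
   the code above): in 32-bit PIC code a typical displacement accepted
   here is

       (const (unspec [(symbol_ref "x")] UNSPEC_GOTOFF))

   optionally with a constant offset added inside the CONST, which is
   exactly what legitimize_pic_address below produces for local data.
   A bare (symbol_ref "x") is rejected in 32-bit mode because it is not
   wrapped in a CONST around one of the recognized unspecs.  */
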
9874 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS.  Returns a value to
9875    replace the input X, or the original X if no replacement is called for.
9876    The output parameter *WIN is 1 if the calling macro should goto WIN,
9877    0 if it should not.  */
9878 
9879 bool
9880 ix86_legitimize_reload_address (rtx x,
9881 				enum machine_mode mode ATTRIBUTE_UNUSED,
9882 				int opnum, int type,
9883 				int ind_levels ATTRIBUTE_UNUSED)
9884 {
9885   /* Reload can generate:
9886 
9887      (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
9888 		       (reg:DI 97))
9889 	      (reg:DI 2 cx))
9890 
9891      This RTX is rejected from ix86_legitimate_address_p due to
9892      non-strictness of base register 97.  Following this rejection,
9893      reload pushes all three components into separate registers,
9894      creating an invalid memory address RTX.
9895 
9896      The following code reloads only the invalid part of the
9897      memory address RTX.  */
9898 
9899   if (GET_CODE (x) == PLUS
9900       && REG_P (XEXP (x, 1))
9901       && GET_CODE (XEXP (x, 0)) == PLUS
9902       && REG_P (XEXP (XEXP (x, 0), 1)))
9903     {
9904       rtx base, index;
9905       bool something_reloaded = false;
9906 
9907       base = XEXP (XEXP (x, 0), 1);
9908       if (!REG_OK_FOR_BASE_STRICT_P (base))
9909 	{
9910 	  push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
9911 		       BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
9912 		       opnum, (enum reload_type)type);
9913 	  something_reloaded = true;
9914 	}
9915 
9916       index = XEXP (x, 1);
9917       if (!REG_OK_FOR_INDEX_STRICT_P (index))
9918 	{
9919 	  push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
9920 		       INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
9921 		       opnum, (enum reload_type)type);
9922 	  something_reloaded = true;
9923 	}
9924 
9925       gcc_assert (something_reloaded);
9926       return true;
9927     }
9928 
9929   return false;
9930 }
9931 
9932 /* Recognizes RTL expressions that are valid memory addresses for an
9933    instruction.  The MODE argument is the machine mode for the MEM
9934    expression that wants to use this address.
9935 
9936    It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
9937    convert common non-canonical forms to canonical form so that they will
9938    be recognized.  */
9939 
9940 static bool
9941 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
9942 		           rtx addr, bool strict)
9943 {
9944   struct ix86_address parts;
9945   rtx base, index, disp;
9946   HOST_WIDE_INT scale;
9947 
9948   if (ix86_decompose_address (addr, &parts) <= 0)
9949     /* Decomposition failed.  */
9950     return false;
9951 
9952   base = parts.base;
9953   index = parts.index;
9954   disp = parts.disp;
9955   scale = parts.scale;
9956 
9957   /* Validate base register.
9958 
9959      Don't allow SUBREG's that span more than a word here.  It can lead to spill
9960      failures when the base is one word out of a two word structure, which is
9961      represented internally as a DImode int.  */
9962 
9963   if (base)
9964     {
9965       rtx reg;
9966 
9967       if (REG_P (base))
9968   	reg = base;
9969       else if (GET_CODE (base) == SUBREG
9970 	       && REG_P (SUBREG_REG (base))
9971 	       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (base)))
9972 		  <= UNITS_PER_WORD)
9973   	reg = SUBREG_REG (base);
9974       else
9975 	/* Base is not a register.  */
9976 	return false;
9977 
9978       if (GET_MODE (base) != Pmode)
9979 	/* Base is not in Pmode.  */
9980 	return false;
9981 
9982       if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
9983 	  || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
9984 	/* Base is not valid.  */
9985 	return false;
9986     }
9987 
9988   /* Validate index register.
9989 
9990      Don't allow SUBREG's that span more than a word here -- same as above.  */
9991 
9992   if (index)
9993     {
9994       rtx reg;
9995 
9996       if (REG_P (index))
9997   	reg = index;
9998       else if (GET_CODE (index) == SUBREG
9999 	       && REG_P (SUBREG_REG (index))
10000 	       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (index)))
10001 		  <= UNITS_PER_WORD)
10002   	reg = SUBREG_REG (index);
10003       else
10004 	/* Index is not a register.  */
10005 	return false;
10006 
10007       if (GET_MODE (index) != Pmode)
10008 	/* Index is not in Pmode.  */
10009 	return false;
10010 
10011       if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
10012 	  || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
10013 	/* Index is not valid.  */
10014 	return false;
10015     }
10016 
10017   /* Validate scale factor.  */
10018   if (scale != 1)
10019     {
10020       if (!index)
10021 	/* Scale without index.  */
10022 	return false;
10023 
10024       if (scale != 2 && scale != 4 && scale != 8)
10025 	/* Scale is not a valid multiplier.  */
10026 	return false;
10027     }
10028 
10029   /* Validate displacement.  */
10030   if (disp)
10031     {
10032       if (GET_CODE (disp) == CONST
10033 	  && GET_CODE (XEXP (disp, 0)) == UNSPEC
10034 	  && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
10035 	switch (XINT (XEXP (disp, 0), 1))
10036 	  {
10037 	  /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
10038 	     used.  While the ABI also specifies 32bit relocations, we don't
10039 	     produce them at all and use IP relative addressing instead.  */
10040 	  case UNSPEC_GOT:
10041 	  case UNSPEC_GOTOFF:
10042 	    gcc_assert (flag_pic);
10043 	    if (!TARGET_64BIT)
10044 	      goto is_legitimate_pic;
10045 
10046 	    /* 64bit address unspec.  */
10047 	    return false;
10048 
10049 	  case UNSPEC_GOTPCREL:
10050 	    gcc_assert (flag_pic);
10051 	    goto is_legitimate_pic;
10052 
10053 	  case UNSPEC_GOTTPOFF:
10054 	  case UNSPEC_GOTNTPOFF:
10055 	  case UNSPEC_INDNTPOFF:
10056 	  case UNSPEC_NTPOFF:
10057 	  case UNSPEC_DTPOFF:
10058 	    break;
10059 
10060 	  default:
10061 	    /* Invalid address unspec.  */
10062 	    return false;
10063 	  }
10064 
10065       else if (SYMBOLIC_CONST (disp)
10066 	       && (flag_pic
10067 		   || (TARGET_MACHO
10068 #if TARGET_MACHO
10069 		       && MACHOPIC_INDIRECT
10070 		       && !machopic_operand_p (disp)
10071 #endif
10072 	       )))
10073 	{
10074 
10075 	is_legitimate_pic:
10076 	  if (TARGET_64BIT && (index || base))
10077 	    {
10078 	      /* foo@dtpoff(%rX) is ok.  */
10079 	      if (GET_CODE (disp) != CONST
10080 		  || GET_CODE (XEXP (disp, 0)) != PLUS
10081 		  || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
10082 		  || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
10083 		  || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
10084 		      && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
10085 		/* Non-constant pic memory reference.  */
10086 		return false;
10087 	    }
10088 	  else if (! legitimate_pic_address_disp_p (disp))
10089 	    /* Displacement is an invalid pic construct.  */
10090 	    return false;
10091 
10092           /* This code used to verify that a symbolic pic displacement
10093 	     includes the pic_offset_table_rtx register.
10094 
10095 	     While this is a good idea, unfortunately these constructs may
10096 	     be created by the "adds using lea" optimization for incorrect
10097 	     code like:
10098 
10099 	     int a;
10100 	     int foo(int i)
10101 	       {
10102 	         return *(&a+i);
10103 	       }
10104 
10105 	     This code is nonsensical, but results in addressing the
10106 	     GOT table with a pic_offset_table_rtx base.  We can't
10107 	     just refuse it easily, since it gets matched by the
10108 	     "addsi3" pattern, which later gets split to lea when the
10109 	     output register differs from the input.  While this
10110 	     could be handled by a separate addsi pattern for this case
10111 	     that never results in lea, disabling this test seems to be
10112 	     the easier and correct fix for the crash.  */
10113 	}
10114       else if (GET_CODE (disp) != LABEL_REF
10115 	       && !CONST_INT_P (disp)
10116 	       && (GET_CODE (disp) != CONST
10117 		   || !legitimate_constant_p (disp))
10118 	       && (GET_CODE (disp) != SYMBOL_REF
10119 		   || !legitimate_constant_p (disp)))
10120 	/* Displacement is not constant.  */
10121 	return false;
10122       else if (TARGET_64BIT
10123 	       && !x86_64_immediate_operand (disp, VOIDmode))
10124 	/* Displacement is out of range.  */
10125 	return false;
10126     }
10127 
10128   /* Everything looks valid.  */
10129   return true;
10130 }
10131 
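/* Illustrative examples (hypothetical registers, not part of the checks
   above): (plus (reg %ebx) (mult (reg %esi) (const_int 4))) and
   (plus (reg %ebp) (const_int -8)) are accepted, whereas a scale of 3,
   an index register that is not in Pmode, or a symbolic displacement in
   PIC mode that fails legitimate_pic_address_disp_p are all rejected.
   Any hard register allowed for base/index by the strictness level in
   effect works equally well.  */
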
10132 /* Determine if a given RTX is a valid constant address.  */
10133 
10134 bool
10135 constant_address_p (rtx x)
10136 {
10137   return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
10138 }
10139 
10140 /* Return a unique alias set for the GOT.  */
10141 
10142 static alias_set_type
10143 ix86_GOT_alias_set (void)
10144 {
10145   static alias_set_type set = -1;
10146   if (set == -1)
10147     set = new_alias_set ();
10148   return set;
10149 }
10150 
10151 /* Return a legitimate reference for ORIG (an address) using the
10152    register REG.  If REG is 0, a new pseudo is generated.
10153 
10154    There are two types of references that must be handled:
10155 
10156    1. Global data references must load the address from the GOT, via
10157       the PIC reg.  An insn is emitted to do this load, and the reg is
10158       returned.
10159 
10160    2. Static data references, constant pool addresses, and code labels
10161       compute the address as an offset from the GOT, whose base is in
10162       the PIC reg.  Static data objects have SYMBOL_FLAG_LOCAL set to
10163       differentiate them from global data objects.  The returned
10164       address is the PIC reg + an unspec constant.
10165 
10166    TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
10167    reg also appears in the address.  */
10168 
10169 static rtx
10170 legitimize_pic_address (rtx orig, rtx reg)
10171 {
10172   rtx addr = orig;
10173   rtx new_rtx = orig;
10174   rtx base;
10175 
10176 #if TARGET_MACHO
10177   if (TARGET_MACHO && !TARGET_64BIT)
10178     {
10179       if (reg == 0)
10180 	reg = gen_reg_rtx (Pmode);
10181       /* Use the generic Mach-O PIC machinery.  */
10182       return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
10183     }
10184 #endif
10185 
10186   if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
10187     new_rtx = addr;
10188   else if (TARGET_64BIT
10189 	   && ix86_cmodel != CM_SMALL_PIC
10190 	   && gotoff_operand (addr, Pmode))
10191     {
10192       rtx tmpreg;
10193       /* This symbol may be referenced via a displacement from the PIC
10194 	 base address (@GOTOFF).  */
10195 
10196       if (reload_in_progress)
10197 	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
10198       if (GET_CODE (addr) == CONST)
10199 	addr = XEXP (addr, 0);
10200       if (GET_CODE (addr) == PLUS)
10201 	  {
10202             new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
10203 				      UNSPEC_GOTOFF);
10204 	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
10205 	  }
10206 	else
10207           new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
10208       new_rtx = gen_rtx_CONST (Pmode, new_rtx);
10209       if (!reg)
10210         tmpreg = gen_reg_rtx (Pmode);
10211       else
10212 	tmpreg = reg;
10213       emit_move_insn (tmpreg, new_rtx);
10214 
10215       if (reg != 0)
10216 	{
10217 	  new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
10218 					 tmpreg, 1, OPTAB_DIRECT);
10219 	  new_rtx = reg;
10220 	}
10221       else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
10222     }
10223   else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
10224     {
10225       /* This symbol may be referenced via a displacement from the PIC
10226 	 base address (@GOTOFF).  */
10227 
10228       if (reload_in_progress)
10229 	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
10230       if (GET_CODE (addr) == CONST)
10231 	addr = XEXP (addr, 0);
10232       if (GET_CODE (addr) == PLUS)
10233 	  {
10234             new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
10235 				      UNSPEC_GOTOFF);
10236 	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
10237 	  }
10238 	else
10239           new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
10240       new_rtx = gen_rtx_CONST (Pmode, new_rtx);
10241       new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
10242 
10243       if (reg != 0)
10244 	{
10245 	  emit_move_insn (reg, new_rtx);
10246 	  new_rtx = reg;
10247 	}
10248     }
10249   else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
10250 	   /* We can't use @GOTOFF for text labels on VxWorks;
10251 	      see gotoff_operand.  */
10252 	   || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
10253     {
10254       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
10255         {
10256           if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
10257             return legitimize_dllimport_symbol (addr, true);
10258           if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
10259               && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
10260               && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
10261             {
10262               rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
10263               return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
10264             }
10265         }
10266 
10267       if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
10268 	{
10269 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
10270 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
10271 	  new_rtx = gen_const_mem (Pmode, new_rtx);
10272 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
10273 
10274 	  if (reg == 0)
10275 	    reg = gen_reg_rtx (Pmode);
10276 	  /* Use gen_movsi directly, otherwise the address is loaded
10277 	     into a register for CSE.  We don't want to CSE this address;
10278 	     instead we CSE addresses from the GOT table, so skip this.  */
10279 	  emit_insn (gen_movsi (reg, new_rtx));
10280 	  new_rtx = reg;
10281 	}
10282       else
10283 	{
10284 	  /* This symbol must be referenced via a load from the
10285 	     Global Offset Table (@GOT).  */
10286 
10287 	  if (reload_in_progress)
10288 	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
10289 	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
10290 	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
10291 	  if (TARGET_64BIT)
10292 	    new_rtx = force_reg (Pmode, new_rtx);
10293 	  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
10294 	  new_rtx = gen_const_mem (Pmode, new_rtx);
10295 	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
10296 
10297 	  if (reg == 0)
10298 	    reg = gen_reg_rtx (Pmode);
10299 	  emit_move_insn (reg, new_rtx);
10300 	  new_rtx = reg;
10301 	}
10302     }
10303   else
10304     {
10305       if (CONST_INT_P (addr)
10306 	  && !x86_64_immediate_operand (addr, VOIDmode))
10307 	{
10308 	  if (reg)
10309 	    {
10310 	      emit_move_insn (reg, addr);
10311 	      new_rtx = reg;
10312 	    }
10313 	  else
10314 	    new_rtx = force_reg (Pmode, addr);
10315 	}
10316       else if (GET_CODE (addr) == CONST)
10317 	{
10318 	  addr = XEXP (addr, 0);
10319 
10320 	  /* We must match stuff we generated before.  Assume the only
10321 	     unspecs that can get here are ours.  Not that we could do
10322 	     anything with them anyway....  */
10323 	  if (GET_CODE (addr) == UNSPEC
10324 	      || (GET_CODE (addr) == PLUS
10325 		  && GET_CODE (XEXP (addr, 0)) == UNSPEC))
10326 	    return orig;
10327 	  gcc_assert (GET_CODE (addr) == PLUS);
10328 	}
10329       if (GET_CODE (addr) == PLUS)
10330 	{
10331 	  rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
10332 
10333 	  /* Check first to see if this is a constant offset from a @GOTOFF
10334 	     symbol reference.  */
10335 	  if (gotoff_operand (op0, Pmode)
10336 	      && CONST_INT_P (op1))
10337 	    {
10338 	      if (!TARGET_64BIT)
10339 		{
10340 		  if (reload_in_progress)
10341 		    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
10342 		  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
10343 					    UNSPEC_GOTOFF);
10344 		  new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
10345 		  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
10346 		  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
10347 
10348 		  if (reg != 0)
10349 		    {
10350 		      emit_move_insn (reg, new_rtx);
10351 		      new_rtx = reg;
10352 		    }
10353 		}
10354 	      else
10355 		{
10356 		  if (INTVAL (op1) < -16*1024*1024
10357 		      || INTVAL (op1) >= 16*1024*1024)
10358 		    {
10359 		      if (!x86_64_immediate_operand (op1, Pmode))
10360 			op1 = force_reg (Pmode, op1);
10361 		      new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
10362 		    }
10363 		}
10364 	    }
10365 	  else
10366 	    {
10367 	      base = legitimize_pic_address (XEXP (addr, 0), reg);
10368 	      new_rtx  = legitimize_pic_address (XEXP (addr, 1),
10369 						 base == reg ? NULL_RTX : reg);
10370 
10371 	      if (CONST_INT_P (new_rtx))
10372 		new_rtx = plus_constant (base, INTVAL (new_rtx));
10373 	      else
10374 		{
10375 		  if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
10376 		    {
10377 		      base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
10378 		      new_rtx = XEXP (new_rtx, 1);
10379 		    }
10380 		  new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
10381 		}
10382 	    }
10383 	}
10384     }
10385   return new_rtx;
10386 }
10387 
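/* A sketch of the usual 32-bit transforms performed above (symbol names
   hypothetical): a local symbol "x" becomes

       (plus (reg pic_offset_table_rtx)
             (const (unspec [(symbol_ref "x")] UNSPEC_GOTOFF)))

   i.e. x@GOTOFF relative to the PIC register (conventionally %ebx),
   while a global symbol "y" becomes a load

       (mem (plus (reg pic_offset_table_rtx)
                  (const (unspec [(symbol_ref "y")] UNSPEC_GOT))))

   i.e. y@GOT, with the GOT alias set attached.  In 64-bit small PIC
   code the global case uses UNSPEC_GOTPCREL (%rip-relative) instead.  */
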
10388 /* Load the thread pointer.  If TO_REG is true, force it into a register.  */
10389 
10390 static rtx
10391 get_thread_pointer (int to_reg)
10392 {
10393   rtx tp, reg, insn;
10394 
10395   tp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
10396   if (!to_reg)
10397     return tp;
10398 
10399   reg = gen_reg_rtx (Pmode);
10400   insn = gen_rtx_SET (VOIDmode, reg, tp);
10401   insn = emit_insn (insn);
10402 
10403   return reg;
10404 }
10405 
10406 /* A subroutine of ix86_legitimize_address and ix86_expand_move.  FOR_MOV is
10407    false if we expect this to be used for a memory address and true if
10408    we expect to load the address into a register.  */
10409 
10410 static rtx
10411 legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
10412 {
10413   rtx dest, base, off, pic, tp;
10414   int type;
10415 
10416   switch (model)
10417     {
10418     case TLS_MODEL_GLOBAL_DYNAMIC:
10419       dest = gen_reg_rtx (Pmode);
10420       tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
10421 
10422       if (TARGET_64BIT && ! TARGET_GNU2_TLS)
10423 	{
10424 	  rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
10425 
10426 	  start_sequence ();
10427 	  emit_call_insn (gen_tls_global_dynamic_64 (rax, x));
10428 	  insns = get_insns ();
10429 	  end_sequence ();
10430 
10431 	  RTL_CONST_CALL_P (insns) = 1;
10432 	  emit_libcall_block (insns, dest, rax, x);
10433 	}
10434       else if (TARGET_64BIT && TARGET_GNU2_TLS)
10435 	emit_insn (gen_tls_global_dynamic_64 (dest, x));
10436       else
10437 	emit_insn (gen_tls_global_dynamic_32 (dest, x));
10438 
10439       if (TARGET_GNU2_TLS)
10440 	{
10441 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
10442 
10443 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
10444 	}
10445       break;
10446 
10447     case TLS_MODEL_LOCAL_DYNAMIC:
10448       base = gen_reg_rtx (Pmode);
10449       tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
10450 
10451       if (TARGET_64BIT && ! TARGET_GNU2_TLS)
10452 	{
10453 	  rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, note;
10454 
10455 	  start_sequence ();
10456 	  emit_call_insn (gen_tls_local_dynamic_base_64 (rax));
10457 	  insns = get_insns ();
10458 	  end_sequence ();
10459 
10460 	  note = gen_rtx_EXPR_LIST (VOIDmode, const0_rtx, NULL);
10461 	  note = gen_rtx_EXPR_LIST (VOIDmode, ix86_tls_get_addr (), note);
10462 	  RTL_CONST_CALL_P (insns) = 1;
10463 	  emit_libcall_block (insns, base, rax, note);
10464 	}
10465       else if (TARGET_64BIT && TARGET_GNU2_TLS)
10466 	emit_insn (gen_tls_local_dynamic_base_64 (base));
10467       else
10468 	emit_insn (gen_tls_local_dynamic_base_32 (base));
10469 
10470       if (TARGET_GNU2_TLS)
10471 	{
10472 	  rtx x = ix86_tls_module_base ();
10473 
10474 	  set_unique_reg_note (get_last_insn (), REG_EQUIV,
10475 			       gen_rtx_MINUS (Pmode, x, tp));
10476 	}
10477 
10478       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
10479       off = gen_rtx_CONST (Pmode, off);
10480 
10481       dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
10482 
10483       if (TARGET_GNU2_TLS)
10484 	{
10485 	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
10486 
10487 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
10488 	}
10489 
10490       break;
10491 
10492     case TLS_MODEL_INITIAL_EXEC:
10493       if (TARGET_64BIT)
10494 	{
10495 	  if (TARGET_SUN_TLS)
10496 	    {
10497 	      /* The Sun linker took the AMD64 TLS spec literally
10498 		 and can only handle %rax as destination of the
10499 		 initial executable code sequence.  */
10500 
10501 	      dest = gen_reg_rtx (Pmode);
10502 	      emit_insn (gen_tls_initial_exec_64_sun (dest, x));
10503 	      return dest;
10504 	    }
10505 
10506 	  pic = NULL;
10507 	  type = UNSPEC_GOTNTPOFF;
10508 	}
10509       else if (flag_pic)
10510 	{
10511 	  if (reload_in_progress)
10512 	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
10513 	  pic = pic_offset_table_rtx;
10514 	  type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
10515 	}
10516       else if (!TARGET_ANY_GNU_TLS)
10517 	{
10518 	  pic = gen_reg_rtx (Pmode);
10519 	  emit_insn (gen_set_got (pic));
10520 	  type = UNSPEC_GOTTPOFF;
10521 	}
10522       else
10523 	{
10524 	  pic = NULL;
10525 	  type = UNSPEC_INDNTPOFF;
10526 	}
10527 
10528       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
10529       off = gen_rtx_CONST (Pmode, off);
10530       if (pic)
10531 	off = gen_rtx_PLUS (Pmode, pic, off);
10532       off = gen_const_mem (Pmode, off);
10533       set_mem_alias_set (off, ix86_GOT_alias_set ());
10534 
10535       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
10536 	{
10537           base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
10538 	  off = force_reg (Pmode, off);
10539 	  return gen_rtx_PLUS (Pmode, base, off);
10540 	}
10541       else
10542 	{
10543 	  base = get_thread_pointer (true);
10544 	  dest = gen_reg_rtx (Pmode);
10545 	  emit_insn (gen_subsi3 (dest, base, off));
10546 	}
10547       break;
10548 
10549     case TLS_MODEL_LOCAL_EXEC:
10550       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
10551 			    (TARGET_64BIT || TARGET_ANY_GNU_TLS)
10552 			    ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
10553       off = gen_rtx_CONST (Pmode, off);
10554 
10555       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
10556 	{
10557 	  base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
10558 	  return gen_rtx_PLUS (Pmode, base, off);
10559 	}
10560       else
10561 	{
10562 	  base = get_thread_pointer (true);
10563 	  dest = gen_reg_rtx (Pmode);
10564 	  emit_insn (gen_subsi3 (dest, base, off));
10565 	}
10566       break;
10567 
10568     default:
10569       gcc_unreachable ();
10570     }
10571 
10572   return dest;
10573 }
10574 
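/* A sketch of the local-exec result produced above (symbol name
   hypothetical): under TARGET_64BIT or TARGET_ANY_GNU_TLS a reference
   to the thread-local variable "t" becomes

       (plus <thread pointer>
             (const (unspec [(symbol_ref "t")] UNSPEC_NTPOFF)))

   where <thread pointer> is the UNSPEC_TP expression or a register
   holding it, addressable directly through the TLS segment register.
   The non-GNU-TLS 32-bit path instead subtracts a UNSPEC_TPOFF offset
   from the thread pointer with gen_subsi3.  */
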
10575 /* Create or return the unique __imp_DECL dllimport symbol corresponding
10576    to symbol DECL.  */
10577 
10578 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
10579   htab_t dllimport_map;
10580 
10581 static tree
10582 get_dllimport_decl (tree decl)
10583 {
10584   struct tree_map *h, in;
10585   void **loc;
10586   const char *name;
10587   const char *prefix;
10588   size_t namelen, prefixlen;
10589   char *imp_name;
10590   tree to;
10591   rtx rtl;
10592 
10593   if (!dllimport_map)
10594     dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
10595 
10596   in.hash = htab_hash_pointer (decl);
10597   in.base.from = decl;
10598   loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
10599   h = (struct tree_map *) *loc;
10600   if (h)
10601     return h->to;
10602 
10603   *loc = h = GGC_NEW (struct tree_map);
10604   h->hash = in.hash;
10605   h->base.from = decl;
10606   h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
10607 			   VAR_DECL, NULL, ptr_type_node);
10608   DECL_ARTIFICIAL (to) = 1;
10609   DECL_IGNORED_P (to) = 1;
10610   DECL_EXTERNAL (to) = 1;
10611   TREE_READONLY (to) = 1;
10612 
10613   name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
10614   name = targetm.strip_name_encoding (name);
10615   prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
10616     ? "*__imp_" : "*__imp__";
10617   namelen = strlen (name);
10618   prefixlen = strlen (prefix);
10619   imp_name = (char *) alloca (namelen + prefixlen + 1);
10620   memcpy (imp_name, prefix, prefixlen);
10621   memcpy (imp_name + prefixlen, name, namelen + 1);
10622 
10623   name = ggc_alloc_string (imp_name, namelen + prefixlen);
10624   rtl = gen_rtx_SYMBOL_REF (Pmode, name);
10625   SET_SYMBOL_REF_DECL (rtl, to);
10626   SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
10627 
10628   rtl = gen_const_mem (Pmode, rtl);
10629   set_mem_alias_set (rtl, ix86_GOT_alias_set ());
10630 
10631   SET_DECL_RTL (to, rtl);
10632   SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
10633 
10634   return to;
10635 }
10636 
10637 /* Expand SYMBOL into its corresponding dllimport symbol.  WANT_REG is
10638    true if we require the result to be a register.  */
10639 
10640 static rtx
10641 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
10642 {
10643   tree imp_decl;
10644   rtx x;
10645 
10646   gcc_assert (SYMBOL_REF_DECL (symbol));
10647   imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
10648 
10649   x = DECL_RTL (imp_decl);
10650   if (want_reg)
10651     x = force_reg (Pmode, x);
10652   return x;
10653 }
10654 
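/* Illustrative usage (symbol name hypothetical): for a dllimport'ed
   variable "bar", get_dllimport_decl above builds the artificial symbol
   "__imp__bar" (or "__imp_bar" when the name carries the fastcall prefix
   or there is no user label prefix), and legitimize_dllimport_symbol then
   returns a (mem (symbol_ref "__imp__bar")) -- the value's address is
   fetched indirectly through the import table rather than used directly.  */
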
10655 /* Try machine-dependent ways of modifying an illegitimate address
10656    to be legitimate.  If we find one, return the new, valid address.
10657    This macro is used in only one place: `memory_address' in explow.c.
10658 
10659    OLDX is the address as it was before break_out_memory_refs was called.
10660    In some cases it is useful to look at this to decide what needs to be done.
10661 
10662    It is always safe for this macro to do nothing.  It exists to recognize
10663    opportunities to optimize the output.
10664 
10665    For the 80386, we handle X+REG by loading X into a register R and
10666    using R+REG.  R will go in a general reg and indexing will be used.
10667    However, if REG is a broken-out memory address or multiplication,
10668    nothing needs to be done because REG can certainly go in a general reg.
10669 
10670    When -fpic is used, special handling is needed for symbolic references.
10671    See comments by legitimize_pic_address in i386.c for details.  */
10672 
10673 static rtx
10674 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
10675 			 enum machine_mode mode)
10676 {
10677   int changed = 0;
10678   unsigned log;
10679 
10680   log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
10681   if (log)
10682     return legitimize_tls_address (x, (enum tls_model) log, false);
10683   if (GET_CODE (x) == CONST
10684       && GET_CODE (XEXP (x, 0)) == PLUS
10685       && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
10686       && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
10687     {
10688       rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
10689 				      (enum tls_model) log, false);
10690       return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
10691     }
10692 
10693   if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
10694     {
10695       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
10696 	return legitimize_dllimport_symbol (x, true);
10697       if (GET_CODE (x) == CONST
10698 	  && GET_CODE (XEXP (x, 0)) == PLUS
10699 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
10700 	  && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
10701 	{
10702 	  rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
10703 	  return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
10704 	}
10705     }
10706 
10707   if (flag_pic && SYMBOLIC_CONST (x))
10708     return legitimize_pic_address (x, 0);
10709 
10710   /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
10711   if (GET_CODE (x) == ASHIFT
10712       && CONST_INT_P (XEXP (x, 1))
10713       && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
10714     {
10715       changed = 1;
10716       log = INTVAL (XEXP (x, 1));
10717       x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
10718 			GEN_INT (1 << log));
10719     }
10720 
10721   if (GET_CODE (x) == PLUS)
10722     {
10723       /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
10724 
10725       if (GET_CODE (XEXP (x, 0)) == ASHIFT
10726 	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10727 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
10728 	{
10729 	  changed = 1;
10730 	  log = INTVAL (XEXP (XEXP (x, 0), 1));
10731 	  XEXP (x, 0) = gen_rtx_MULT (Pmode,
10732 				      force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
10733 				      GEN_INT (1 << log));
10734 	}
10735 
10736       if (GET_CODE (XEXP (x, 1)) == ASHIFT
10737 	  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
10738 	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
10739 	{
10740 	  changed = 1;
10741 	  log = INTVAL (XEXP (XEXP (x, 1), 1));
10742 	  XEXP (x, 1) = gen_rtx_MULT (Pmode,
10743 				      force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
10744 				      GEN_INT (1 << log));
10745 	}
10746 
10747       /* Put multiply first if it isn't already.  */
10748       if (GET_CODE (XEXP (x, 1)) == MULT)
10749 	{
10750 	  rtx tmp = XEXP (x, 0);
10751 	  XEXP (x, 0) = XEXP (x, 1);
10752 	  XEXP (x, 1) = tmp;
10753 	  changed = 1;
10754 	}
10755 
10756       /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
10757 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  This can be
10758 	 created by virtual register instantiation, register elimination, and
10759 	 similar optimizations.  */
10760       if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
10761 	{
10762 	  changed = 1;
10763 	  x = gen_rtx_PLUS (Pmode,
10764 			    gen_rtx_PLUS (Pmode, XEXP (x, 0),
10765 					  XEXP (XEXP (x, 1), 0)),
10766 			    XEXP (XEXP (x, 1), 1));
10767 	}
10768 
10769       /* Canonicalize
10770 	 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
10771 	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  */
10772       else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
10773 	       && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10774 	       && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
10775 	       && CONSTANT_P (XEXP (x, 1)))
10776 	{
10777 	  rtx constant;
10778 	  rtx other = NULL_RTX;
10779 
10780 	  if (CONST_INT_P (XEXP (x, 1)))
10781 	    {
10782 	      constant = XEXP (x, 1);
10783 	      other = XEXP (XEXP (XEXP (x, 0), 1), 1);
10784 	    }
10785 	  else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
10786 	    {
10787 	      constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
10788 	      other = XEXP (x, 1);
10789 	    }
10790 	  else
10791 	    constant = 0;
10792 
10793 	  if (constant)
10794 	    {
10795 	      changed = 1;
10796 	      x = gen_rtx_PLUS (Pmode,
10797 				gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
10798 					      XEXP (XEXP (XEXP (x, 0), 1), 0)),
10799 				plus_constant (other, INTVAL (constant)));
10800 	    }
10801 	}
10802 
10803       if (changed && ix86_legitimate_address_p (mode, x, FALSE))
10804 	return x;
10805 
10806       if (GET_CODE (XEXP (x, 0)) == MULT)
10807 	{
10808 	  changed = 1;
10809 	  XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
10810 	}
10811 
10812       if (GET_CODE (XEXP (x, 1)) == MULT)
10813 	{
10814 	  changed = 1;
10815 	  XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
10816 	}
10817 
10818       if (changed
10819 	  && REG_P (XEXP (x, 1))
10820 	  && REG_P (XEXP (x, 0)))
10821 	return x;
10822 
10823       if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
10824 	{
10825 	  changed = 1;
10826 	  x = legitimize_pic_address (x, 0);
10827 	}
10828 
10829       if (changed && ix86_legitimate_address_p (mode, x, FALSE))
10830 	return x;
10831 
10832       if (REG_P (XEXP (x, 0)))
10833 	{
10834 	  rtx temp = gen_reg_rtx (Pmode);
10835 	  rtx val  = force_operand (XEXP (x, 1), temp);
10836 	  if (val != temp)
10837 	    emit_move_insn (temp, val);
10838 
10839 	  XEXP (x, 1) = temp;
10840 	  return x;
10841 	}
10842 
10843       else if (REG_P (XEXP (x, 1)))
10844 	{
10845 	  rtx temp = gen_reg_rtx (Pmode);
10846 	  rtx val  = force_operand (XEXP (x, 0), temp);
10847 	  if (val != temp)
10848 	    emit_move_insn (temp, val);
10849 
10850 	  XEXP (x, 0) = temp;
10851 	  return x;
10852 	}
10853     }
10854 
10855   return x;
10856 }
10857 
10858 /* Print an integer constant expression in assembler syntax.  Addition
10859    and subtraction are the only arithmetic that may appear in these
10860    expressions.  FILE is the stdio stream to write to, X is the rtx, and
10861    CODE is the operand print code from the output string.  */
10862 
10863 static void
10864 output_pic_addr_const (FILE *file, rtx x, int code)
10865 {
10866   char buf[256];
10867 
10868   switch (GET_CODE (x))
10869     {
10870     case PC:
10871       gcc_assert (flag_pic);
10872       putc ('.', file);
10873       break;
10874 
10875     case SYMBOL_REF:
10876       if (! TARGET_MACHO || TARGET_64BIT)
10877 	output_addr_const (file, x);
10878       else
10879 	{
10880 	  const char *name = XSTR (x, 0);
10881 
10882 	  /* Mark the decl as referenced so that cgraph will
10883 	     output the function.  */
10884 	  if (SYMBOL_REF_DECL (x))
10885 	    mark_decl_referenced (SYMBOL_REF_DECL (x));
10886 
10887 #if TARGET_MACHO
10888 	  if (MACHOPIC_INDIRECT
10889 	      && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
10890 	    name = machopic_indirection_name (x, /*stub_p=*/true);
10891 #endif
10892 	  assemble_name (file, name);
10893 	}
10894       if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
10895 	  && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
10896 	fputs ("@PLT", file);
10897       break;
10898 
10899     case LABEL_REF:
10900       x = XEXP (x, 0);
10901       /* FALLTHRU */
10902     case CODE_LABEL:
10903       ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
10904       assemble_name (asm_out_file, buf);
10905       break;
10906 
10907     case CONST_INT:
10908       fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10909       break;
10910 
10911     case CONST:
10912       /* This used to output parentheses around the expression,
10913 	 but that does not work on the 386 (either ATT or BSD assembler).  */
10914       output_pic_addr_const (file, XEXP (x, 0), code);
10915       break;
10916 
10917     case CONST_DOUBLE:
10918       if (GET_MODE (x) == VOIDmode)
10919 	{
10920 	  /* We can use %d if the number is <32 bits and positive.  */
10921 	  if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
10922 	    fprintf (file, "0x%lx%08lx",
10923 		     (unsigned long) CONST_DOUBLE_HIGH (x),
10924 		     (unsigned long) CONST_DOUBLE_LOW (x));
10925 	  else
10926 	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
10927 	}
10928       else
10929 	/* We can't handle floating point constants;
10930 	   PRINT_OPERAND must handle them.  */
10931 	output_operand_lossage ("floating constant misused");
10932       break;
10933 
10934     case PLUS:
10935       /* Some assemblers need integer constants to appear first.  */
10936       if (CONST_INT_P (XEXP (x, 0)))
10937 	{
10938 	  output_pic_addr_const (file, XEXP (x, 0), code);
10939 	  putc ('+', file);
10940 	  output_pic_addr_const (file, XEXP (x, 1), code);
10941 	}
10942       else
10943 	{
10944 	  gcc_assert (CONST_INT_P (XEXP (x, 1)));
10945 	  output_pic_addr_const (file, XEXP (x, 1), code);
10946 	  putc ('+', file);
10947 	  output_pic_addr_const (file, XEXP (x, 0), code);
10948 	}
10949       break;
10950 
10951     case MINUS:
10952       if (!TARGET_MACHO)
10953 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
10954       output_pic_addr_const (file, XEXP (x, 0), code);
10955       putc ('-', file);
10956       output_pic_addr_const (file, XEXP (x, 1), code);
10957       if (!TARGET_MACHO)
10958 	putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
10959       break;
10960 
10961      case UNSPEC:
10962        gcc_assert (XVECLEN (x, 0) == 1);
10963        output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
10964        switch (XINT (x, 1))
10965 	{
10966 	case UNSPEC_GOT:
10967 	  fputs ("@GOT", file);
10968 	  break;
10969 	case UNSPEC_GOTOFF:
10970 	  fputs ("@GOTOFF", file);
10971 	  break;
10972 	case UNSPEC_PLTOFF:
10973 	  fputs ("@PLTOFF", file);
10974 	  break;
10975 	case UNSPEC_GOTPCREL:
10976 	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
10977 		 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
10978 	  break;
10979 	case UNSPEC_GOTTPOFF:
10980 	  /* FIXME: This might be @TPOFF in Sun ld too.  */
10981 	  fputs ("@gottpoff", file);
10982 	  break;
10983 	case UNSPEC_TPOFF:
10984 	  fputs ("@tpoff", file);
10985 	  break;
10986 	case UNSPEC_NTPOFF:
10987 	  if (TARGET_64BIT)
10988 	    fputs ("@tpoff", file);
10989 	  else
10990 	    fputs ("@ntpoff", file);
10991 	  break;
10992 	case UNSPEC_DTPOFF:
10993 	  fputs ("@dtpoff", file);
10994 	  break;
10995 	case UNSPEC_GOTNTPOFF:
10996 	  if (TARGET_64BIT)
10997 	    fputs (ASSEMBLER_DIALECT == ASM_ATT ?
10998 		   "@gottpoff(%rip)": "@gottpoff[rip]", file);
10999 	  else
11000 	    fputs ("@gotntpoff", file);
11001 	  break;
11002 	case UNSPEC_INDNTPOFF:
11003 	  fputs ("@indntpoff", file);
11004 	  break;
11005 #if TARGET_MACHO
11006 	case UNSPEC_MACHOPIC_OFFSET:
11007 	  putc ('-', file);
11008 	  machopic_output_function_base_name (file);
11009 	  break;
11010 #endif
11011 	default:
11012 	  output_operand_lossage ("invalid UNSPEC as operand");
11013 	  break;
11014 	}
11015        break;
11016 
11017     default:
11018       output_operand_lossage ("invalid expression as operand");
11019     }
11020 }
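/* Illustration (an assumed example): given (unspec [symbol_ref "foo"]
   UNSPEC_GOTOFF), the routine above should emit "foo@GOTOFF"; with
   operand code 'P' and a non-local symbol it should append "@PLT",
   giving "foo@PLT".  */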
11021 
11022 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
11023    We need to emit DTP-relative relocations.  */
11024 
11025 static void ATTRIBUTE_UNUSED
11026 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
11027 {
11028   fputs (ASM_LONG, file);
11029   output_addr_const (file, x);
11030   fputs ("@dtpoff", file);
11031   switch (size)
11032     {
11033     case 4:
11034       break;
11035     case 8:
11036       fputs (", 0", file);
11037       break;
11038     default:
11039       gcc_unreachable ();
11040    }
11041 }
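/* Illustration (an assumed example): a 4-byte entry referring to symbol
   "x" should produce ASM_LONG followed by "x@dtpoff", while the 8-byte
   case appends ", 0" so the upper half of the slot is zero.  */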
11042 
11043 /* Return true if X is a representation of the PIC register.  This copes
11044    with calls from ix86_find_base_term, where the register might have
11045    been replaced by a cselib value.  */
11046 
11047 static bool
11048 ix86_pic_register_p (rtx x)
11049 {
11050   if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
11051     return (pic_offset_table_rtx
11052 	    && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
11053   else
11054     return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
11055 }
11056 
11057 /* In the name of slightly smaller debug output, and to cater to
11058    general assembler lossage, recognize PIC+GOTOFF and turn it back
11059    into a direct symbol reference.
11060 
11061    On Darwin, this is necessary to avoid a crash, because Darwin
11062    has a different PIC label for each routine but the DWARF debugging
11063    information is not associated with any particular routine, so it's
11064    necessary to remove references to the PIC label from RTL stored by
11065    the DWARF output code.  */
11066 
11067 static rtx
11068 ix86_delegitimize_address (rtx x)
11069 {
11070   rtx orig_x = delegitimize_mem_from_attrs (x);
11071   /* addend is NULL or some rtx if x is something+GOTOFF where
11072      something doesn't include the PIC register.  */
11073   rtx addend = NULL_RTX;
11074   /* reg_addend is NULL or a multiple of some register.  */
11075   rtx reg_addend = NULL_RTX;
11076   /* const_addend is NULL or a const_int.  */
11077   rtx const_addend = NULL_RTX;
11078   /* This is the result, or NULL.  */
11079   rtx result = NULL_RTX;
11080 
11081   x = orig_x;
11082 
11083   if (MEM_P (x))
11084     x = XEXP (x, 0);
11085 
11086   if (TARGET_64BIT)
11087     {
11088       if (GET_CODE (x) != CONST
11089 	  || GET_CODE (XEXP (x, 0)) != UNSPEC
11090 	  || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
11091 	  || !MEM_P (orig_x))
11092 	return orig_x;
11093       x = XVECEXP (XEXP (x, 0), 0, 0);
11094       if (GET_MODE (orig_x) != Pmode)
11095 	{
11096 	  x = simplify_gen_subreg (GET_MODE (orig_x), x, Pmode, 0);
11097 	  if (x == NULL_RTX)
11098 	    return orig_x;
11099 	}
11100       return x;
11101     }
11102 
11103   if (GET_CODE (x) != PLUS
11104       || GET_CODE (XEXP (x, 1)) != CONST)
11105     return orig_x;
11106 
11107   if (ix86_pic_register_p (XEXP (x, 0)))
11108     /* %ebx + GOT/GOTOFF */
11109     ;
11110   else if (GET_CODE (XEXP (x, 0)) == PLUS)
11111     {
11112       /* %ebx + %reg * scale + GOT/GOTOFF */
11113       reg_addend = XEXP (x, 0);
11114       if (ix86_pic_register_p (XEXP (reg_addend, 0)))
11115 	reg_addend = XEXP (reg_addend, 1);
11116       else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
11117 	reg_addend = XEXP (reg_addend, 0);
11118       else
11119 	{
11120 	  reg_addend = NULL_RTX;
11121 	  addend = XEXP (x, 0);
11122 	}
11123     }
11124   else
11125     addend = XEXP (x, 0);
11126 
11127   x = XEXP (XEXP (x, 1), 0);
11128   if (GET_CODE (x) == PLUS
11129       && CONST_INT_P (XEXP (x, 1)))
11130     {
11131       const_addend = XEXP (x, 1);
11132       x = XEXP (x, 0);
11133     }
11134 
11135   if (GET_CODE (x) == UNSPEC
11136       && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
11137 	  || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
11138     result = XVECEXP (x, 0, 0);
11139 
11140   if (TARGET_MACHO && darwin_local_data_pic (x)
11141       && !MEM_P (orig_x))
11142     result = XVECEXP (x, 0, 0);
11143 
11144   if (! result)
11145     return orig_x;
11146 
11147   if (const_addend)
11148     result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
11149   if (reg_addend)
11150     result = gen_rtx_PLUS (Pmode, reg_addend, result);
11151   if (addend)
11152     {
11153       /* If the rest of original X doesn't involve the PIC register, add
11154 	 addend and subtract pic_offset_table_rtx.  This can happen e.g.
11155 	 for code like:
11156 	 leal (%ebx, %ecx, 4), %ecx
11157 	 ...
11158 	 movl foo@GOTOFF(%ecx), %edx
11159 	 in which case we return (%ecx - %ebx) + foo.  */
11160       if (pic_offset_table_rtx)
11161         result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
11162 						     pic_offset_table_rtx),
11163 			       result);
11164       else
11165 	return orig_x;
11166     }
11167   if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
11168     {
11169       result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
11170       if (result == NULL_RTX)
11171 	return orig_x;
11172     }
11173   return result;
11174 }
11175 
11176 /* If X is a machine specific address (i.e. a symbol or label being
11177    referenced as a displacement from the GOT implemented using an
11178    UNSPEC), then return the base term.  Otherwise return X.  */
11179 
11180 rtx
11181 ix86_find_base_term (rtx x)
11182 {
11183   rtx term;
11184 
11185   if (TARGET_64BIT)
11186     {
11187       if (GET_CODE (x) != CONST)
11188 	return x;
11189       term = XEXP (x, 0);
11190       if (GET_CODE (term) == PLUS
11191 	  && (CONST_INT_P (XEXP (term, 1))
11192 	      || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
11193 	term = XEXP (term, 0);
11194       if (GET_CODE (term) != UNSPEC
11195 	  || XINT (term, 1) != UNSPEC_GOTPCREL)
11196 	return x;
11197 
11198       return XVECEXP (term, 0, 0);
11199     }
11200 
11201   return ix86_delegitimize_address (x);
11202 }
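/* Illustration (an assumed example): on 64-bit targets,
   (const (plus (unspec [symbol_ref "sym"] UNSPEC_GOTPCREL) (const_int 8)))
   yields "sym" as the base term; other 64-bit forms are returned
   unchanged, and 32-bit addresses go through ix86_delegitimize_address.  */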
11203 
11204 static void
11205 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
11206 		    int fp, FILE *file)
11207 {
11208   const char *suffix;
11209 
11210   if (mode == CCFPmode || mode == CCFPUmode)
11211     {
11212       code = ix86_fp_compare_code_to_integer (code);
11213       mode = CCmode;
11214     }
11215   if (reverse)
11216     code = reverse_condition (code);
11217 
11218   switch (code)
11219     {
11220     case EQ:
11221       switch (mode)
11222 	{
11223 	case CCAmode:
11224 	  suffix = "a";
11225 	  break;
11226 
11227 	case CCCmode:
11228 	  suffix = "c";
11229 	  break;
11230 
11231 	case CCOmode:
11232 	  suffix = "o";
11233 	  break;
11234 
11235 	case CCSmode:
11236 	  suffix = "s";
11237 	  break;
11238 
11239 	default:
11240 	  suffix = "e";
11241 	}
11242       break;
11243     case NE:
11244       switch (mode)
11245 	{
11246 	case CCAmode:
11247 	  suffix = "na";
11248 	  break;
11249 
11250 	case CCCmode:
11251 	  suffix = "nc";
11252 	  break;
11253 
11254 	case CCOmode:
11255 	  suffix = "no";
11256 	  break;
11257 
11258 	case CCSmode:
11259 	  suffix = "ns";
11260 	  break;
11261 
11262 	default:
11263 	  suffix = "ne";
11264 	}
11265       break;
11266     case GT:
11267       gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
11268       suffix = "g";
11269       break;
11270     case GTU:
11271       /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
11272 	 Those same assemblers have the same but opposite lossage on cmov.  */
11273       if (mode == CCmode)
11274 	suffix = fp ? "nbe" : "a";
11275       else if (mode == CCCmode)
11276 	suffix = "b";
11277       else
11278 	gcc_unreachable ();
11279       break;
11280     case LT:
11281       switch (mode)
11282 	{
11283 	case CCNOmode:
11284 	case CCGOCmode:
11285 	  suffix = "s";
11286 	  break;
11287 
11288 	case CCmode:
11289 	case CCGCmode:
11290 	  suffix = "l";
11291 	  break;
11292 
11293 	default:
11294 	  gcc_unreachable ();
11295 	}
11296       break;
11297     case LTU:
11298       gcc_assert (mode == CCmode || mode == CCCmode);
11299       suffix = "b";
11300       break;
11301     case GE:
11302       switch (mode)
11303 	{
11304 	case CCNOmode:
11305 	case CCGOCmode:
11306 	  suffix = "ns";
11307 	  break;
11308 
11309 	case CCmode:
11310 	case CCGCmode:
11311 	  suffix = "ge";
11312 	  break;
11313 
11314 	default:
11315 	  gcc_unreachable ();
11316 	}
11317       break;
11318     case GEU:
11319       /* ??? As above.  */
11320       gcc_assert (mode == CCmode || mode == CCCmode);
11321       suffix = fp ? "nb" : "ae";
11322       break;
11323     case LE:
11324       gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
11325       suffix = "le";
11326       break;
11327     case LEU:
11328       /* ??? As above.  */
11329       if (mode == CCmode)
11330 	suffix = "be";
11331       else if (mode == CCCmode)
11332 	suffix = fp ? "nb" : "ae";
11333       else
11334 	gcc_unreachable ();
11335       break;
11336     case UNORDERED:
11337       suffix = fp ? "u" : "p";
11338       break;
11339     case ORDERED:
11340       suffix = fp ? "nu" : "np";
11341       break;
11342     default:
11343       gcc_unreachable ();
11344     }
11345   fputs (suffix, file);
11346 }
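/* Illustration (assumed examples): EQ in CCmode prints "e", GTU in CCmode
   prints "a" (or "nbe" in the fp/fcmov case), and with REVERSE set an EQ
   is first turned into NE and prints "ne".  */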
11347 
11348 /* Print the name of register X to FILE based on its machine mode and number.
11349    If CODE is 'w', pretend the mode is HImode.
11350    If CODE is 'b', pretend the mode is QImode.
11351    If CODE is 'k', pretend the mode is SImode.
11352    If CODE is 'q', pretend the mode is DImode.
11353    If CODE is 'x', pretend the mode is V4SFmode.
11354    If CODE is 't', pretend the mode is V8SFmode.
11355    If CODE is 'h', pretend the reg is the 'high' byte register.
11356    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
11357    If CODE is 'd', duplicate the operand for AVX instruction.
11358  */
11359 
11360 void
11361 print_reg (rtx x, int code, FILE *file)
11362 {
11363   const char *reg;
11364   bool duplicated = code == 'd' && TARGET_AVX;
11365 
11366   gcc_assert (x == pc_rtx
11367 	      || (REGNO (x) != ARG_POINTER_REGNUM
11368 		  && REGNO (x) != FRAME_POINTER_REGNUM
11369 		  && REGNO (x) != FLAGS_REG
11370 		  && REGNO (x) != FPSR_REG
11371 		  && REGNO (x) != FPCR_REG));
11372 
11373   if (ASSEMBLER_DIALECT == ASM_ATT)
11374     putc ('%', file);
11375 
11376   if (x == pc_rtx)
11377     {
11378       gcc_assert (TARGET_64BIT);
11379       fputs ("rip", file);
11380       return;
11381     }
11382 
11383   if (code == 'w' || MMX_REG_P (x))
11384     code = 2;
11385   else if (code == 'b')
11386     code = 1;
11387   else if (code == 'k')
11388     code = 4;
11389   else if (code == 'q')
11390     code = 8;
11391   else if (code == 'y')
11392     code = 3;
11393   else if (code == 'h')
11394     code = 0;
11395   else if (code == 'x')
11396     code = 16;
11397   else if (code == 't')
11398     code = 32;
11399   else
11400     code = GET_MODE_SIZE (GET_MODE (x));
11401 
11402   /* Irritatingly, the AMD extended registers use a different naming
11403      convention from the normal registers.  */
11404   if (REX_INT_REG_P (x))
11405     {
11406       gcc_assert (TARGET_64BIT);
11407       switch (code)
11408 	{
11409 	  case 0:
11410 	    error ("extended registers have no high halves");
11411 	    break;
11412 	  case 1:
11413 	    fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
11414 	    break;
11415 	  case 2:
11416 	    fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
11417 	    break;
11418 	  case 4:
11419 	    fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
11420 	    break;
11421 	  case 8:
11422 	    fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
11423 	    break;
11424 	  default:
11425 	    error ("unsupported operand size for extended register");
11426 	    break;
11427 	}
11428       return;
11429     }
11430 
11431   reg = NULL;
11432   switch (code)
11433     {
11434     case 3:
11435       if (STACK_TOP_P (x))
11436 	{
11437 	  reg = "st(0)";
11438 	  break;
11439 	}
11440       /* FALLTHRU */
11441     case 8:
11442     case 4:
11443     case 12:
11444       if (! ANY_FP_REG_P (x))
11445 	putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
11446       /* FALLTHRU */
11447     case 16:
11448     case 2:
11449     normal:
11450       reg = hi_reg_name[REGNO (x)];
11451       break;
11452     case 1:
11453       if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
11454 	goto normal;
11455       reg = qi_reg_name[REGNO (x)];
11456       break;
11457     case 0:
11458       if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
11459 	goto normal;
11460       reg = qi_high_reg_name[REGNO (x)];
11461       break;
11462     case 32:
11463       if (SSE_REG_P (x))
11464 	{
11465 	  gcc_assert (!duplicated);
11466 	  putc ('y', file);
11467 	  fputs (hi_reg_name[REGNO (x)] + 1, file);
11468 	  return;
11469 	}
11470       break;
11471     default:
11472       gcc_unreachable ();
11473     }
11474 
11475   fputs (reg, file);
11476   if (duplicated)
11477     {
11478       if (ASSEMBLER_DIALECT == ASM_ATT)
11479 	fprintf (file, ", %%%s", reg);
11480       else
11481 	fprintf (file, ", %s", reg);
11482     }
11483 }
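/* Illustration (assumed examples): %b0 on the ax register prints "al"
   ("%al" in AT&T syntax), %k0 on the extended register r8 prints "r8d",
   and the 't' code on an SSE register such as xmm3 prints the 256-bit
   name "ymm3".  */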
11484 
11485 /* Locate some local-dynamic symbol still in use by this function
11486    so that we can print its name in some tls_local_dynamic_base
11487    pattern.  */
11488 
11489 static int
11490 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
11491 {
11492   rtx x = *px;
11493 
11494   if (GET_CODE (x) == SYMBOL_REF
11495       && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
11496     {
11497       cfun->machine->some_ld_name = XSTR (x, 0);
11498       return 1;
11499     }
11500 
11501   return 0;
11502 }
11503 
11504 static const char *
11505 get_some_local_dynamic_name (void)
11506 {
11507   rtx insn;
11508 
11509   if (cfun->machine->some_ld_name)
11510     return cfun->machine->some_ld_name;
11511 
11512   for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
11513     if (NONDEBUG_INSN_P (insn)
11514 	&& for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
11515       return cfun->machine->some_ld_name;
11516 
11517   return NULL;
11518 }
11519 
11520 /* Meaning of CODE:
11521    L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
11522    C -- print opcode suffix for set/cmov insn.
11523    c -- like C, but print reversed condition
11524    F,f -- likewise, but for floating-point.
11525    O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
11526         otherwise nothing
11527    R -- print the prefix for register names.
11528    z -- print the opcode suffix for the size of the current operand.
11529    Z -- likewise, with special suffixes for x87 instructions.
11530    * -- print a star (in certain assembler syntax)
11531    A -- print an absolute memory reference.
11532    w -- print the operand as if it's a "word" (HImode) even if it isn't.
11533    s -- print a shift double count, followed by the assembler's argument
11534 	delimiter.
11535    b -- print the QImode name of the register for the indicated operand.
11536 	%b0 would print %al if operands[0] is reg 0.
11537    w --  likewise, print the HImode name of the register.
11538    k --  likewise, print the SImode name of the register.
11539    q --  likewise, print the DImode name of the register.
11540    x --  likewise, print the V4SFmode name of the register.
11541    t --  likewise, print the V8SFmode name of the register.
11542    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
11543    y -- print "st(0)" instead of "st" as a register.
11544    d -- print duplicated register operand for AVX instruction.
11545    D -- print condition for SSE cmp instruction.
11546    P -- if PIC, print an @PLT suffix.
11547    X -- don't print any sort of PIC '@' suffix for a symbol.
11548    & -- print some in-use local-dynamic symbol name.
11549    H -- print a memory address offset by 8; used for sse high-parts
11550    Y -- print condition for XOP pcom* instruction.
11551    + -- print a branch hint as 'cs' or 'ds' prefix
11552    ; -- print a semicolon (after prefixes due to bug in older gas).
11553  */
11554 
11555 void
11556 print_operand (FILE *file, rtx x, int code)
11557 {
11558   if (code)
11559     {
11560       switch (code)
11561 	{
11562 	case '*':
11563 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11564 	    putc ('*', file);
11565 	  return;
11566 
11567 	case '&':
11568 	  {
11569 	    const char *name = get_some_local_dynamic_name ();
11570 	    if (name == NULL)
11571 	      output_operand_lossage ("'%%&' used without any "
11572 				      "local dynamic TLS references");
11573 	    else
11574 	      assemble_name (file, name);
11575 	    return;
11576 	  }
11577 
11578 	case 'A':
11579 	  switch (ASSEMBLER_DIALECT)
11580 	    {
11581 	    case ASM_ATT:
11582 	      putc ('*', file);
11583 	      break;
11584 
11585 	    case ASM_INTEL:
11586 	      /* Intel syntax. For absolute addresses, registers should not
11587 		 be surrounded by braces.  */
11588 	      if (!REG_P (x))
11589 		{
11590 		  putc ('[', file);
11591 		  PRINT_OPERAND (file, x, 0);
11592 		  putc (']', file);
11593 		  return;
11594 		}
11595 	      break;
11596 
11597 	    default:
11598 	      gcc_unreachable ();
11599 	    }
11600 
11601 	  PRINT_OPERAND (file, x, 0);
11602 	  return;
11603 
11604 
11605 	case 'L':
11606 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11607 	    putc ('l', file);
11608 	  return;
11609 
11610 	case 'W':
11611 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11612 	    putc ('w', file);
11613 	  return;
11614 
11615 	case 'B':
11616 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11617 	    putc ('b', file);
11618 	  return;
11619 
11620 	case 'Q':
11621 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11622 	    putc ('l', file);
11623 	  return;
11624 
11625 	case 'S':
11626 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11627 	    putc ('s', file);
11628 	  return;
11629 
11630 	case 'T':
11631 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11632 	    putc ('t', file);
11633 	  return;
11634 
11635 	case 'z':
11636 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
11637 	    {
11638 	      /* Opcodes don't get size suffixes when using Intel syntax.  */
11639 	      if (ASSEMBLER_DIALECT == ASM_INTEL)
11640 		return;
11641 
11642 	      switch (GET_MODE_SIZE (GET_MODE (x)))
11643 		{
11644 		case 1:
11645 		  putc ('b', file);
11646 		  return;
11647 
11648 		case 2:
11649 		  putc ('w', file);
11650 		  return;
11651 
11652 		case 4:
11653 		  putc ('l', file);
11654 		  return;
11655 
11656 		case 8:
11657 		  putc ('q', file);
11658 		  return;
11659 
11660 		default:
11661 		  output_operand_lossage
11662 		    ("invalid operand size for operand code '%c'", code);
11663 		  return;
11664 		}
11665 	    }
11666 
11667 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
11668 	    warning
11669 	      (0, "non-integer operand used with operand code '%c'", code);
11670 	  /* FALLTHRU */
11671 
11672 	case 'Z':
11673 	  /* 387 opcodes don't get size suffixes when using Intel syntax.  */
11674 	  if (ASSEMBLER_DIALECT == ASM_INTEL)
11675 	    return;
11676 
11677 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
11678 	    {
11679 	      switch (GET_MODE_SIZE (GET_MODE (x)))
11680 		{
11681 		case 2:
11682 #ifdef HAVE_AS_IX86_FILDS
11683 		  putc ('s', file);
11684 #endif
11685 		  return;
11686 
11687 		case 4:
11688 		  putc ('l', file);
11689 		  return;
11690 
11691 		case 8:
11692 #ifdef HAVE_AS_IX86_FILDQ
11693 		  putc ('q', file);
11694 #else
11695 		  fputs ("ll", file);
11696 #endif
11697 		  return;
11698 
11699 		default:
11700 		  break;
11701 		}
11702 	    }
11703 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
11704 	    {
11705 	      /* 387 opcodes don't get size suffixes
11706 		 if the operands are registers.  */
11707 	      if (STACK_REG_P (x))
11708 		return;
11709 
11710 	      switch (GET_MODE_SIZE (GET_MODE (x)))
11711 		{
11712 		case 4:
11713 		  putc ('s', file);
11714 		  return;
11715 
11716 		case 8:
11717 		  putc ('l', file);
11718 		  return;
11719 
11720 		case 12:
11721 		case 16:
11722 		  putc ('t', file);
11723 		  return;
11724 
11725 		default:
11726 		  break;
11727 		}
11728 	    }
11729 	  else
11730 	    {
11731 	      output_operand_lossage
11732 		("invalid operand type used with operand code '%c'", code);
11733 	      return;
11734 	    }
11735 
11736 	  output_operand_lossage
11737 	    ("invalid operand size for operand code '%c'", code);
11738 	  return;
11739 
11740 	case 'd':
11741 	case 'b':
11742 	case 'w':
11743 	case 'k':
11744 	case 'q':
11745 	case 'h':
11746 	case 't':
11747 	case 'y':
11748 	case 'x':
11749 	case 'X':
11750 	case 'P':
11751 	  break;
11752 
11753 	case 's':
11754 	  if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
11755 	    {
11756 	      PRINT_OPERAND (file, x, 0);
11757 	      fputs (", ", file);
11758 	    }
11759 	  return;
11760 
11761 	case 'D':
11762 	  /* Little bit of braindamage here.  The SSE compare instructions
11763 	  /* A little bit of braindamage here.  The SSE compare instructions
11764 	     use completely different names for the comparisons than the
11765 	     fp conditional moves do.  */
11766 	    {
11767 	      switch (GET_CODE (x))
11768 		{
11769 		case EQ:
11770 		  fputs ("eq", file);
11771 		  break;
11772 		case UNEQ:
11773 		  fputs ("eq_us", file);
11774 		  break;
11775 		case LT:
11776 		  fputs ("lt", file);
11777 		  break;
11778 		case UNLT:
11779 		  fputs ("nge", file);
11780 		  break;
11781 		case LE:
11782 		  fputs ("le", file);
11783 		  break;
11784 		case UNLE:
11785 		  fputs ("ngt", file);
11786 		  break;
11787 		case UNORDERED:
11788 		  fputs ("unord", file);
11789 		  break;
11790 		case NE:
11791 		  fputs ("neq", file);
11792 		  break;
11793 		case LTGT:
11794 		  fputs ("neq_oq", file);
11795 		  break;
11796 		case GE:
11797 		  fputs ("ge", file);
11798 		  break;
11799 		case UNGE:
11800 		  fputs ("nlt", file);
11801 		  break;
11802 		case GT:
11803 		  fputs ("gt", file);
11804 		  break;
11805 		case UNGT:
11806 		  fputs ("nle", file);
11807 		  break;
11808 		case ORDERED:
11809 		  fputs ("ord", file);
11810 		  break;
11811 		default:
11812 		  output_operand_lossage ("operand is not a condition code, "
11813 					  "invalid operand code 'D'");
11814 		  return;
11815 		}
11816 	    }
11817 	  else
11818 	    {
11819 	      switch (GET_CODE (x))
11820 		{
11821 		case EQ:
11822 		case UNEQ:
11823 		  fputs ("eq", file);
11824 		  break;
11825 		case LT:
11826 		case UNLT:
11827 		  fputs ("lt", file);
11828 		  break;
11829 		case LE:
11830 		case UNLE:
11831 		  fputs ("le", file);
11832 		  break;
11833 		case UNORDERED:
11834 		  fputs ("unord", file);
11835 		  break;
11836 		case NE:
11837 		case LTGT:
11838 		  fputs ("neq", file);
11839 		  break;
11840 		case UNGE:
11841 		case GE:
11842 		  fputs ("nlt", file);
11843 		  break;
11844 		case UNGT:
11845 		case GT:
11846 		  fputs ("nle", file);
11847 		  break;
11848 		case ORDERED:
11849 		  fputs ("ord", file);
11850 		  break;
11851 		default:
11852 		  output_operand_lossage ("operand is not a condition code, "
11853 					  "invalid operand code 'D'");
11854 		  return;
11855 		}
11856 	    }
11857 	  return;
11858 	case 'O':
11859 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
11860 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11861 	    {
11862 	      switch (GET_MODE (x))
11863 		{
11864 		case HImode: putc ('w', file); break;
11865 		case SImode:
11866 		case SFmode: putc ('l', file); break;
11867 		case DImode:
11868 		case DFmode: putc ('q', file); break;
11869 		default: gcc_unreachable ();
11870 		}
11871 	      putc ('.', file);
11872 	    }
11873 #endif
11874 	  return;
11875 	case 'C':
11876 	  if (!COMPARISON_P (x))
11877 	    {
11878 	      output_operand_lossage ("operand is neither a constant nor a "
11879 				      "condition code, invalid operand code "
11880 				      "'C'");
11881 	      return;
11882 	    }
11883 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
11884 	  return;
11885 	case 'F':
11886 	  if (!COMPARISON_P (x))
11887 	    {
11888 	      output_operand_lossage ("operand is neither a constant nor a "
11889 				      "condition code, invalid operand code "
11890 				      "'F'");
11891 	      return;
11892 	    }
11893 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
11894 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11895 	    putc ('.', file);
11896 #endif
11897 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
11898 	  return;
11899 
11900 	  /* Like above, but reverse condition */
11901 	case 'c':
11902 	  /* Check to see if argument to %c is really a constant
11903 	     and not a condition code which needs to be reversed.  */
11904 	  if (!COMPARISON_P (x))
11905 	    {
11906 	      output_operand_lossage ("operand is neither a constant nor a "
11907 				      "condition code, invalid operand "
11908 				      "code 'c'");
11909 	      return;
11910 	    }
11911 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
11912 	  return;
11913 	case 'f':
11914 	  if (!COMPARISON_P (x))
11915 	    {
11916 	      output_operand_lossage ("operand is neither a constant nor a "
11917 				      "condition code, invalid operand "
11918 				      "code 'f'");
11919 	      return;
11920 	    }
11921 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
11922 	  if (ASSEMBLER_DIALECT == ASM_ATT)
11923 	    putc ('.', file);
11924 #endif
11925 	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
11926 	  return;
11927 
11928 	case 'H':
11929 	  if (!offsettable_memref_p (x))
11930 	    {
11931 	      output_operand_lossage ("operand is not an offsettable memory "
11932 				      "reference, invalid operand "
11933 				      "code 'H'");
11934 	      return;
11935 	    }
11936 	  /* It doesn't actually matter what mode we use here, as we're
11937 	     only going to use this for printing.  */
11938 	  x = adjust_address_nv (x, DImode, 8);
11939 	  break;
11940 
11941 	case '+':
11942 	  {
11943 	    rtx x;
11944 
11945 	    if (!optimize
11946 	        || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
11947 	      return;
11948 
11949 	    x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
11950 	    if (x)
11951 	      {
11952 		int pred_val = INTVAL (XEXP (x, 0));
11953 
11954 		if (pred_val < REG_BR_PROB_BASE * 45 / 100
11955 		    || pred_val > REG_BR_PROB_BASE * 55 / 100)
11956 		  {
11957 		    int taken = pred_val > REG_BR_PROB_BASE / 2;
11958 		    int cputaken = final_forward_branch_p (current_output_insn) == 0;
11959 
11960 		    /* Emit hints only in the case default branch prediction
11961 		       heuristics would fail.  */
11962 		    if (taken != cputaken)
11963 		      {
11964 			/* We use 3e (DS) prefix for taken branches and
11965 			   2e (CS) prefix for not taken branches.  */
11966 			if (taken)
11967 			  fputs ("ds ; ", file);
11968 			else
11969 			  fputs ("cs ; ", file);
11970 		      }
11971 		  }
11972 	      }
11973 	    return;
11974 	  }
11975 
11976 	case 'Y':
11977 	  switch (GET_CODE (x))
11978 	    {
11979 	    case NE:
11980 	      fputs ("neq", file);
11981 	      break;
11982 	    case EQ:
11983 	      fputs ("eq", file);
11984 	      break;
11985 	    case GE:
11986 	    case GEU:
11987 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
11988 	      break;
11989 	    case GT:
11990 	    case GTU:
11991 	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
11992 	      break;
11993 	    case LE:
11994 	    case LEU:
11995 	      fputs ("le", file);
11996 	      break;
11997 	    case LT:
11998 	    case LTU:
11999 	      fputs ("lt", file);
12000 	      break;
12001 	    case UNORDERED:
12002 	      fputs ("unord", file);
12003 	      break;
12004 	    case ORDERED:
12005 	      fputs ("ord", file);
12006 	      break;
12007 	    case UNEQ:
12008 	      fputs ("ueq", file);
12009 	      break;
12010 	    case UNGE:
12011 	      fputs ("nlt", file);
12012 	      break;
12013 	    case UNGT:
12014 	      fputs ("nle", file);
12015 	      break;
12016 	    case UNLE:
12017 	      fputs ("ule", file);
12018 	      break;
12019 	    case UNLT:
12020 	      fputs ("ult", file);
12021 	      break;
12022 	    case LTGT:
12023 	      fputs ("une", file);
12024 	      break;
12025 	    default:
12026 	      output_operand_lossage ("operand is not a condition code, "
12027 				      "invalid operand code 'Y'");
12028 	      return;
12029 	    }
12030 	  return;
12031 
12032 	case ';':
12033 #if TARGET_MACHO || !HAVE_AS_IX86_REP_LOCK_PREFIX
12034 	  fputs (";", file);
12035 #endif
12036 	  return;
12037 
12038 	default:
12039 	    output_operand_lossage ("invalid operand code '%c'", code);
12040 	}
12041     }
12042 
12043   if (REG_P (x))
12044     print_reg (x, code, file);
12045 
12046   else if (MEM_P (x))
12047     {
12048       /* No `byte ptr' prefix for call instructions or BLKmode operands.  */
12049       if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
12050 	  && GET_MODE (x) != BLKmode)
12051 	{
12052 	  const char * size;
12053 	  switch (GET_MODE_SIZE (GET_MODE (x)))
12054 	    {
12055 	    case 1: size = "BYTE"; break;
12056 	    case 2: size = "WORD"; break;
12057 	    case 4: size = "DWORD"; break;
12058 	    case 8: size = "QWORD"; break;
12059 	    case 12: size = "TBYTE"; break;
12060 	    case 16:
12061 	      if (GET_MODE (x) == XFmode)
12062 		size = "TBYTE";
12063               else
12064 		size = "XMMWORD";
12065               break;
12066 	    case 32: size = "YMMWORD"; break;
12067 	    default:
12068 	      gcc_unreachable ();
12069 	    }
12070 
12071 	  /* Check for explicit size override (codes 'b', 'w' and 'k')  */
12072 	  if (code == 'b')
12073 	    size = "BYTE";
12074 	  else if (code == 'w')
12075 	    size = "WORD";
12076 	  else if (code == 'k')
12077 	    size = "DWORD";
12078 
12079 	  fputs (size, file);
12080 	  fputs (" PTR ", file);
12081 	}
12082 
12083       x = XEXP (x, 0);
12084       /* Avoid (%rip) for call operands.  */
12085       if (CONSTANT_ADDRESS_P (x) && code == 'P'
12086 	  && !CONST_INT_P (x))
12087 	output_addr_const (file, x);
12088       else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
12089 	output_operand_lossage ("invalid constraints for operand");
12090       else
12091 	output_address (x);
12092     }
12093 
12094   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
12095     {
12096       REAL_VALUE_TYPE r;
12097       long l;
12098 
12099       REAL_VALUE_FROM_CONST_DOUBLE (r, x);
12100       REAL_VALUE_TO_TARGET_SINGLE (r, l);
12101 
12102       if (ASSEMBLER_DIALECT == ASM_ATT)
12103 	putc ('$', file);
12104       fprintf (file, "0x%08lx", (long unsigned int) l);
12105     }
12106 
12107   /* These float cases don't actually occur as immediate operands.  */
12108   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
12109     {
12110       char dstr[30];
12111 
12112       real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
12113       fputs (dstr, file);
12114     }
12115 
12116   else if (GET_CODE (x) == CONST_DOUBLE
12117 	   && GET_MODE (x) == XFmode)
12118     {
12119       char dstr[30];
12120 
12121       real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
12122       fputs (dstr, file);
12123     }
12124 
12125   else
12126     {
12127       /* We have patterns that allow zero sets of memory, for instance.
12128 	 In 64-bit mode, we should probably support all 8-byte vectors,
12129 	 since we can in fact encode that into an immediate.  */
12130       if (GET_CODE (x) == CONST_VECTOR)
12131 	{
12132 	  gcc_assert (x == CONST0_RTX (GET_MODE (x)));
12133 	  x = const0_rtx;
12134 	}
12135 
12136       if (code != 'P')
12137 	{
12138 	  if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
12139 	    {
12140 	      if (ASSEMBLER_DIALECT == ASM_ATT)
12141 		putc ('$', file);
12142 	    }
12143 	  else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
12144 		   || GET_CODE (x) == LABEL_REF)
12145 	    {
12146 	      if (ASSEMBLER_DIALECT == ASM_ATT)
12147 		putc ('$', file);
12148 	      else
12149 		fputs ("OFFSET FLAT:", file);
12150 	    }
12151 	}
12152       if (CONST_INT_P (x))
12153 	fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12154       else if (flag_pic)
12155 	output_pic_addr_const (file, x, code);
12156       else
12157 	output_addr_const (file, x);
12158     }
12159 }
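/* Illustration (assumed examples): in Intel syntax a 4-byte memory operand
   is prefixed with "DWORD PTR ", an immediate integer gets a leading '$'
   in AT&T syntax, and '%+' may emit a "ds ; " or "cs ; " branch-hint
   prefix when the REG_BR_PROB note is strongly biased against the static
   prediction.  */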
12160 
12161 /* Print a memory operand whose address is ADDR.  */
12162 
12163 void
12164 print_operand_address (FILE *file, rtx addr)
12165 {
12166   struct ix86_address parts;
12167   rtx base, index, disp;
12168   int scale;
12169   int ok = ix86_decompose_address (addr, &parts);
12170 
12171   gcc_assert (ok);
12172 
12173   base = parts.base;
12174   index = parts.index;
12175   disp = parts.disp;
12176   scale = parts.scale;
12177 
12178   switch (parts.seg)
12179     {
12180     case SEG_DEFAULT:
12181       break;
12182     case SEG_FS:
12183     case SEG_GS:
12184       if (ASSEMBLER_DIALECT == ASM_ATT)
12185 	putc ('%', file);
12186       fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
12187       break;
12188     default:
12189       gcc_unreachable ();
12190     }
12191 
12192   /* Use one byte shorter RIP relative addressing for 64bit mode.  */
12193   if (TARGET_64BIT && !base && !index)
12194     {
12195       rtx symbol = disp;
12196 
12197       if (GET_CODE (disp) == CONST
12198 	  && GET_CODE (XEXP (disp, 0)) == PLUS
12199 	  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
12200 	symbol = XEXP (XEXP (disp, 0), 0);
12201 
12202       if (GET_CODE (symbol) == LABEL_REF
12203 	  || (GET_CODE (symbol) == SYMBOL_REF
12204 	      && SYMBOL_REF_TLS_MODEL (symbol) == 0))
12205 	base = pc_rtx;
12206     }
12207   if (!base && !index)
12208     {
12209       /* A displacement-only address requires special attention.  */
12210 
12211       if (CONST_INT_P (disp))
12212 	{
12213 	  if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
12214 	    fputs ("ds:", file);
12215 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
12216 	}
12217       else if (flag_pic)
12218 	output_pic_addr_const (file, disp, 0);
12219       else
12220 	output_addr_const (file, disp);
12221     }
12222   else
12223     {
12224       if (ASSEMBLER_DIALECT == ASM_ATT)
12225 	{
12226 	  if (disp)
12227 	    {
12228 	      if (flag_pic)
12229 		output_pic_addr_const (file, disp, 0);
12230 	      else if (GET_CODE (disp) == LABEL_REF)
12231 		output_asm_label (disp);
12232 	      else
12233 		output_addr_const (file, disp);
12234 	    }
12235 
12236 	  putc ('(', file);
12237 	  if (base)
12238 	    print_reg (base, 0, file);
12239 	  if (index)
12240 	    {
12241 	      putc (',', file);
12242 	      print_reg (index, 0, file);
12243 	      if (scale != 1)
12244 		fprintf (file, ",%d", scale);
12245 	    }
12246 	  putc (')', file);
12247 	}
12248       else
12249 	{
12250 	  rtx offset = NULL_RTX;
12251 
12252 	  if (disp)
12253 	    {
12254 	      /* Pull out the offset of a symbol; print any symbol itself.  */
12255 	      if (GET_CODE (disp) == CONST
12256 		  && GET_CODE (XEXP (disp, 0)) == PLUS
12257 		  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
12258 		{
12259 		  offset = XEXP (XEXP (disp, 0), 1);
12260 		  disp = gen_rtx_CONST (VOIDmode,
12261 					XEXP (XEXP (disp, 0), 0));
12262 		}
12263 
12264 	      if (flag_pic)
12265 		output_pic_addr_const (file, disp, 0);
12266 	      else if (GET_CODE (disp) == LABEL_REF)
12267 		output_asm_label (disp);
12268 	      else if (CONST_INT_P (disp))
12269 		offset = disp;
12270 	      else
12271 		output_addr_const (file, disp);
12272 	    }
12273 
12274 	  putc ('[', file);
12275 	  if (base)
12276 	    {
12277 	      print_reg (base, 0, file);
12278 	      if (offset)
12279 		{
12280 		  if (INTVAL (offset) >= 0)
12281 		    putc ('+', file);
12282 		  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
12283 		}
12284 	    }
12285 	  else if (offset)
12286 	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
12287 	  else
12288 	    putc ('0', file);
12289 
12290 	  if (index)
12291 	    {
12292 	      putc ('+', file);
12293 	      print_reg (index, 0, file);
12294 	      if (scale != 1)
12295 		fprintf (file, "*%d", scale);
12296 	    }
12297 	  putc (']', file);
12298 	}
12299     }
12300 }
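/* Illustration (an assumed example): base %ebx, index %ecx, scale 4 and
   displacement 8 should print as "8(%ebx,%ecx,4)" in AT&T syntax and as
   "[ebx+8+ecx*4]" in Intel syntax.  */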
12301 
12302 bool
12303 output_addr_const_extra (FILE *file, rtx x)
12304 {
12305   rtx op;
12306 
12307   if (GET_CODE (x) != UNSPEC)
12308     return false;
12309 
12310   op = XVECEXP (x, 0, 0);
12311   switch (XINT (x, 1))
12312     {
12313     case UNSPEC_GOTTPOFF:
12314       output_addr_const (file, op);
12315       /* FIXME: This might be @TPOFF in Sun ld.  */
12316       fputs ("@gottpoff", file);
12317       break;
12318     case UNSPEC_TPOFF:
12319       output_addr_const (file, op);
12320       fputs ("@tpoff", file);
12321       break;
12322     case UNSPEC_NTPOFF:
12323       output_addr_const (file, op);
12324       if (TARGET_64BIT)
12325 	fputs ("@tpoff", file);
12326       else
12327 	fputs ("@ntpoff", file);
12328       break;
12329     case UNSPEC_DTPOFF:
12330       output_addr_const (file, op);
12331       fputs ("@dtpoff", file);
12332       break;
12333     case UNSPEC_GOTNTPOFF:
12334       output_addr_const (file, op);
12335       if (TARGET_64BIT)
12336 	fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12337 	       "@gottpoff(%rip)" : "@gottpoff[rip]", file);
12338       else
12339 	fputs ("@gotntpoff", file);
12340       break;
12341     case UNSPEC_INDNTPOFF:
12342       output_addr_const (file, op);
12343       fputs ("@indntpoff", file);
12344       break;
12345 #if TARGET_MACHO
12346     case UNSPEC_MACHOPIC_OFFSET:
12347       output_addr_const (file, op);
12348       putc ('-', file);
12349       machopic_output_function_base_name (file);
12350       break;
12351 #endif
12352 
12353     default:
12354       return false;
12355     }
12356 
12357   return true;
12358 }
12359 
12360 /* Split one or more DImode RTL references into pairs of SImode
12361    references.  The RTL can be REG, offsettable MEM, integer constant, or
12362    CONST_DOUBLE.  "operands" is a pointer to an array of DImode RTL to
12363    split and "num" is its length.  lo_half and hi_half are output arrays
12364    that parallel "operands".  */
12365 
12366 void
12367 split_di (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
12368 {
12369   while (num--)
12370     {
12371       rtx op = operands[num];
12372 
12373       /* simplify_subreg refuses to split volatile memory addresses,
12374          but we still have to handle them.  */
12375       if (MEM_P (op))
12376 	{
12377 	  lo_half[num] = adjust_address (op, SImode, 0);
12378 	  hi_half[num] = adjust_address (op, SImode, 4);
12379 	}
12380       else
12381 	{
12382 	  lo_half[num] = simplify_gen_subreg (SImode, op,
12383 					      GET_MODE (op) == VOIDmode
12384 					      ? DImode : GET_MODE (op), 0);
12385 	  hi_half[num] = simplify_gen_subreg (SImode, op,
12386 					      GET_MODE (op) == VOIDmode
12387 					      ? DImode : GET_MODE (op), 4);
12388 	}
12389     }
12390 }
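/* Illustration (an assumed example): a (mem:DI addr) operand is split into
   (mem:SI addr) and (mem:SI addr+4), while registers and constants are
   split with simplify_gen_subreg into their low and high 32-bit words.  */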
12391 /* Split one or more TImode RTL references into pairs of DImode
12392    references.  The RTL can be REG, offsettable MEM, integer constant, or
12393    CONST_DOUBLE.  "operands" is a pointer to an array of DImode RTL to
12394    split and "num" is its length.  lo_half and hi_half are output arrays
12395    that parallel "operands".  */
12396 
12397 void
12398 split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
12399 {
12400   while (num--)
12401     {
12402       rtx op = operands[num];
12403 
12404       /* simplify_subreg refuses to split volatile memory addresses, but we
12405          still have to handle them.  */
12406       if (MEM_P (op))
12407 	{
12408 	  lo_half[num] = adjust_address (op, DImode, 0);
12409 	  hi_half[num] = adjust_address (op, DImode, 8);
12410 	}
12411       else
12412 	{
12413 	  lo_half[num] = simplify_gen_subreg (DImode, op, TImode, 0);
12414 	  hi_half[num] = simplify_gen_subreg (DImode, op, TImode, 8);
12415 	}
12416     }
12417 }
12418 
12419 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
12420    MINUS, MULT or DIV.  OPERANDS are the insn operands, where operands[3]
12421    is the expression of the binary operation.  The output may either be
12422    emitted here, or returned to the caller, like all output_* functions.
12423 
12424    There is no guarantee that the operands are the same mode, as they
12425    might be within FLOAT or FLOAT_EXTEND expressions.  */
12426 
12427 #ifndef SYSV386_COMPAT
12428 /* Set to 1 for compatibility with brain-damaged assemblers.  No-one
12429    wants to fix the assemblers because that causes incompatibility
12430    with gcc.  No-one wants to fix gcc because that causes
12431    incompatibility with assemblers...  You can use the option of
12432    -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way.  */
12433 #define SYSV386_COMPAT 1
12434 #endif
12435 
12436 const char *
12437 output_387_binary_op (rtx insn, rtx *operands)
12438 {
12439   static char buf[40];
12440   const char *p;
12441   const char *ssep;
12442   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
12443 
12444 #ifdef ENABLE_CHECKING
12445   /* Even if we do not want to check the inputs, this documents the input
12446      constraints, which helps in understanding the following code.  */
12447   if (STACK_REG_P (operands[0])
12448       && ((REG_P (operands[1])
12449 	   && REGNO (operands[0]) == REGNO (operands[1])
12450 	   && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
12451 	  || (REG_P (operands[2])
12452 	      && REGNO (operands[0]) == REGNO (operands[2])
12453 	      && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
12454       && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
12455     ; /* ok */
12456   else
12457     gcc_assert (is_sse);
12458 #endif
12459 
12460   switch (GET_CODE (operands[3]))
12461     {
12462     case PLUS:
12463       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
12464 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
12465 	p = "fiadd";
12466       else
12467 	p = "fadd";
12468       ssep = "vadd";
12469       break;
12470 
12471     case MINUS:
12472       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
12473 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
12474 	p = "fisub";
12475       else
12476 	p = "fsub";
12477       ssep = "vsub";
12478       break;
12479 
12480     case MULT:
12481       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
12482 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
12483 	p = "fimul";
12484       else
12485 	p = "fmul";
12486       ssep = "vmul";
12487       break;
12488 
12489     case DIV:
12490       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
12491 	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
12492 	p = "fidiv";
12493       else
12494 	p = "fdiv";
12495       ssep = "vdiv";
12496       break;
12497 
12498     default:
12499       gcc_unreachable ();
12500     }
12501 
12502   if (is_sse)
12503    {
12504      if (TARGET_AVX)
12505        {
12506 	 strcpy (buf, ssep);
12507 	 if (GET_MODE (operands[0]) == SFmode)
12508 	   strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
12509 	 else
12510 	   strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
12511        }
12512      else
12513        {
12514 	 strcpy (buf, ssep + 1);
12515 	 if (GET_MODE (operands[0]) == SFmode)
12516 	   strcat (buf, "ss\t{%2, %0|%0, %2}");
12517 	 else
12518 	   strcat (buf, "sd\t{%2, %0|%0, %2}");
12519        }
12520       return buf;
12521    }
12522   strcpy (buf, p);
12523 
12524   switch (GET_CODE (operands[3]))
12525     {
12526     case MULT:
12527     case PLUS:
12528       if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
12529 	{
12530 	  rtx temp = operands[2];
12531 	  operands[2] = operands[1];
12532 	  operands[1] = temp;
12533 	}
12534 
12535       /* We know operands[0] == operands[1].  */
12536 
12537       if (MEM_P (operands[2]))
12538 	{
12539 	  p = "%Z2\t%2";
12540 	  break;
12541 	}
12542 
12543       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
12544 	{
12545 	  if (STACK_TOP_P (operands[0]))
12546 	    /* How is it that we are storing to a dead operand[2]?
12547 	       Well, presumably operands[1] is dead too.  We can't
12548 	       store the result to st(0) as st(0) gets popped on this
12549 	       instruction.  Instead store to operands[2] (which I
12550 	       think has to be st(1)).  st(1) will be popped later.
12551 	       gcc <= 2.8.1 didn't have this check and generated
12552 	       assembly code that the Unixware assembler rejected.  */
12553 	    p = "p\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
12554 	  else
12555 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
12556 	  break;
12557 	}
12558 
12559       if (STACK_TOP_P (operands[0]))
12560 	p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
12561       else
12562 	p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
12563       break;
12564 
12565     case MINUS:
12566     case DIV:
12567       if (MEM_P (operands[1]))
12568 	{
12569 	  p = "r%Z1\t%1";
12570 	  break;
12571 	}
12572 
12573       if (MEM_P (operands[2]))
12574 	{
12575 	  p = "%Z2\t%2";
12576 	  break;
12577 	}
12578 
12579       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
12580 	{
12581 #if SYSV386_COMPAT
12582 	  /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
12583 	     derived assemblers, confusingly reverse the direction of
12584 	     the operation for fsub{r} and fdiv{r} when the
12585 	     destination register is not st(0).  The Intel assembler
12586 	     doesn't have this brain damage.  Read !SYSV386_COMPAT to
12587 	     figure out what the hardware really does.  */
12588 	  if (STACK_TOP_P (operands[0]))
12589 	    p = "{p\t%0, %2|rp\t%2, %0}";
12590 	  else
12591 	    p = "{rp\t%2, %0|p\t%0, %2}";
12592 #else
12593 	  if (STACK_TOP_P (operands[0]))
12594 	    /* As above for fmul/fadd, we can't store to st(0).  */
12595 	    p = "rp\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
12596 	  else
12597 	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
12598 #endif
12599 	  break;
12600 	}
12601 
12602       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
12603 	{
12604 #if SYSV386_COMPAT
12605 	  if (STACK_TOP_P (operands[0]))
12606 	    p = "{rp\t%0, %1|p\t%1, %0}";
12607 	  else
12608 	    p = "{p\t%1, %0|rp\t%0, %1}";
12609 #else
12610 	  if (STACK_TOP_P (operands[0]))
12611 	    p = "p\t{%0, %1|%1, %0}";	/* st(1) = st(1) op st(0); pop */
12612 	  else
12613 	    p = "rp\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2); pop */
12614 #endif
12615 	  break;
12616 	}
12617 
12618       if (STACK_TOP_P (operands[0]))
12619 	{
12620 	  if (STACK_TOP_P (operands[1]))
12621 	    p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
12622 	  else
12623 	    p = "r\t{%y1, %0|%0, %y1}";	/* st(0) = st(r1) op st(0) */
12624 	  break;
12625 	}
12626       else if (STACK_TOP_P (operands[1]))
12627 	{
12628 #if SYSV386_COMPAT
12629 	  p = "{\t%1, %0|r\t%0, %1}";
12630 #else
12631 	  p = "r\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2) */
12632 #endif
12633 	}
12634       else
12635 	{
12636 #if SYSV386_COMPAT
12637 	  p = "{r\t%2, %0|\t%0, %2}";
12638 #else
12639 	  p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
12640 #endif
12641 	}
12642       break;
12643 
12644     default:
12645       gcc_unreachable ();
12646     }
12647 
12648   strcat (buf, p);
12649   return buf;
12650 }
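/* Illustration (assumed examples): for st(0) = st(0) + st(1) this should
   return "fadd\t{%y2, %0|%0, %y2}", and for a scalar double SSE add
   without AVX it should return "addsd\t{%2, %0|%0, %2}".  */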
12651 
12652 /* Return needed mode for entity in optimize_mode_switching pass.  */
12653 
12654 int
12655 ix86_mode_needed (int entity, rtx insn)
12656 {
12657   enum attr_i387_cw mode;
12658 
12659   /* The mode UNINITIALIZED is used to store the control word after a
12660      function call or ASM pattern.  The mode ANY specifies that the
12661      function has no requirements on the control word and makes no changes
12662      in the bits we are interested in.  */
12663 
12664   if (CALL_P (insn)
12665       || (NONJUMP_INSN_P (insn)
12666 	  && (asm_noperands (PATTERN (insn)) >= 0
12667 	      || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
12668     return I387_CW_UNINITIALIZED;
12669 
12670   if (recog_memoized (insn) < 0)
12671     return I387_CW_ANY;
12672 
12673   mode = get_attr_i387_cw (insn);
12674 
12675   switch (entity)
12676     {
12677     case I387_TRUNC:
12678       if (mode == I387_CW_TRUNC)
12679 	return mode;
12680       break;
12681 
12682     case I387_FLOOR:
12683       if (mode == I387_CW_FLOOR)
12684 	return mode;
12685       break;
12686 
12687     case I387_CEIL:
12688       if (mode == I387_CW_CEIL)
12689 	return mode;
12690       break;
12691 
12692     case I387_MASK_PM:
12693       if (mode == I387_CW_MASK_PM)
12694 	return mode;
12695       break;
12696 
12697     default:
12698       gcc_unreachable ();
12699     }
12700 
12701   return I387_CW_ANY;
12702 }
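/* Illustration (an assumed example): an insn whose i387_cw attribute is
   "trunc" reports I387_CW_TRUNC for the I387_TRUNC entity and I387_CW_ANY
   for the other entities, while calls and asm statements report
   I387_CW_UNINITIALIZED.  */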
12703 
12704 /* Output code to initialize the control word copies used by the trunc?f?i
12705    and rounding patterns.  MODE selects the required rounding mode; the
12706    current control word is saved and a modified copy is emitted.  */
12707 
12708 void
12709 emit_i387_cw_initialization (int mode)
12710 {
12711   rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
12712   rtx new_mode;
12713 
12714   enum ix86_stack_slot slot;
12715 
12716   rtx reg = gen_reg_rtx (HImode);
12717 
12718   emit_insn (gen_x86_fnstcw_1 (stored_mode));
12719   emit_move_insn (reg, copy_rtx (stored_mode));
12720 
12721   if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
12722       || optimize_function_for_size_p (cfun))
12723     {
12724       switch (mode)
12725 	{
12726 	case I387_CW_TRUNC:
12727 	  /* round toward zero (truncate) */
12728 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
12729 	  slot = SLOT_CW_TRUNC;
12730 	  break;
12731 
12732 	case I387_CW_FLOOR:
12733 	  /* round down toward -oo */
12734 	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
12735 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
12736 	  slot = SLOT_CW_FLOOR;
12737 	  break;
12738 
12739 	case I387_CW_CEIL:
12740 	  /* round up toward +oo */
12741 	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
12742 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
12743 	  slot = SLOT_CW_CEIL;
12744 	  break;
12745 
12746 	case I387_CW_MASK_PM:
12747 	  /* mask precision exception for nearbyint() */
12748 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
12749 	  slot = SLOT_CW_MASK_PM;
12750 	  break;
12751 
12752 	default:
12753 	  gcc_unreachable ();
12754 	}
12755     }
12756   else
12757     {
12758       switch (mode)
12759 	{
12760 	case I387_CW_TRUNC:
12761 	  /* round toward zero (truncate) */
12762 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
12763 	  slot = SLOT_CW_TRUNC;
12764 	  break;
12765 
12766 	case I387_CW_FLOOR:
12767 	  /* round down toward -oo */
12768 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
12769 	  slot = SLOT_CW_FLOOR;
12770 	  break;
12771 
12772 	case I387_CW_CEIL:
12773 	  /* round up toward +oo */
12774 	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
12775 	  slot = SLOT_CW_CEIL;
12776 	  break;
12777 
12778 	case I387_CW_MASK_PM:
12779 	  /* mask precision exception for nearbyint() */
12780 	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
12781 	  slot = SLOT_CW_MASK_PM;
12782 	  break;
12783 
12784 	default:
12785 	  gcc_unreachable ();
12786 	}
12787     }
12788 
12789   gcc_assert (slot < MAX_386_STACK_LOCALS);
12790 
12791   new_mode = assign_386_stack_local (HImode, slot);
12792   emit_move_insn (new_mode, reg);
12793 }
12794 
12795 /* Output code for INSN to convert a float to a signed int.  OPERANDS
12796    are the insn operands.  The output may be [HSD]Imode and the input
12797    operand may be [SDX]Fmode.  */
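/* An illustrative sketch (hedged; the exact suffixes come from the %Z
   operand modifier): for a DImode result whose stack top does not die
   and whose rounding mode is not I387_CW_ANY, the code below emits
   roughly

	fld	%st(0)
	fldcw	new_cw
	fistpll	mem
	fldcw	saved_cw

   i.e. duplicate the value, switch the control word, do the popping
   store, then restore the control word.  */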
12798 
12799 const char *
12800 output_fix_trunc (rtx insn, rtx *operands, int fisttp)
12801 {
12802   int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
12803   int dimode_p = GET_MODE (operands[0]) == DImode;
12804   int round_mode = get_attr_i387_cw (insn);
12805 
12806   /* Jump through a hoop or two for DImode, since the hardware has no
12807      non-popping instruction.  We used to do this a different way, but
12808      that was somewhat fragile and broke with post-reload splitters.  */
12809   if ((dimode_p || fisttp) && !stack_top_dies)
12810     output_asm_insn ("fld\t%y1", operands);
12811 
12812   gcc_assert (STACK_TOP_P (operands[1]));
12813   gcc_assert (MEM_P (operands[0]));
12814   gcc_assert (GET_MODE (operands[1]) != TFmode);
12815 
12816   if (fisttp)
12817       output_asm_insn ("fisttp%Z0\t%0", operands);
12818   else
12819     {
12820       if (round_mode != I387_CW_ANY)
12821 	output_asm_insn ("fldcw\t%3", operands);
12822       if (stack_top_dies || dimode_p)
12823 	output_asm_insn ("fistp%Z0\t%0", operands);
12824       else
12825 	output_asm_insn ("fist%Z0\t%0", operands);
12826       if (round_mode != I387_CW_ANY)
12827 	output_asm_insn ("fldcw\t%2", operands);
12828     }
12829 
12830   return "";
12831 }
12832 
12833 /* Output code for x87 ffreep insn.  The OPNO argument, which may only
12834    have the values zero or one, indicates the ffreep insn's operand
12835    from the OPERANDS array.  */
12836 
12837 static const char *
12838 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
12839 {
12840   if (TARGET_USE_FFREEP)
12841 #ifdef HAVE_AS_IX86_FFREEP
12842     return opno ? "ffreep\t%y1" : "ffreep\t%y0";
12843 #else
12844     {
12845       static char retval[32];
12846       int regno = REGNO (operands[opno]);
12847 
12848       gcc_assert (FP_REGNO_P (regno));
12849 
12850       regno -= FIRST_STACK_REG;
12851 
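      /* A note on the magic bytes (an editorial aside): "ffreep %st(N)"
	 encodes as the two bytes 0xdf, 0xc0+N.  Emitting the 16-bit word
	 0xc<N>df with ASM_SHORT on a little-endian target therefore lays
	 down 0xdf followed by 0xc0+N, e.g. ".word 0xc2df" is exactly
	 "ffreep %st(2)".  */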
12852       snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
12853       return retval;
12854     }
12855 #endif
12856 
12857   return opno ? "fstp\t%y1" : "fstp\t%y0";
12858 }
12859 
12860 
12861 /* Output code for INSN to compare OPERANDS.  EFLAGS_P is 1 when fcomi
12862    should be used.  UNORDERED_P is true when fucom should be used.  */
12863 
12864 const char *
12865 output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p)
12866 {
12867   int stack_top_dies;
12868   rtx cmp_op0, cmp_op1;
12869   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
12870 
12871   if (eflags_p)
12872     {
12873       cmp_op0 = operands[0];
12874       cmp_op1 = operands[1];
12875     }
12876   else
12877     {
12878       cmp_op0 = operands[1];
12879       cmp_op1 = operands[2];
12880     }
12881 
12882   if (is_sse)
12883     {
12884       static const char ucomiss[] = "vucomiss\t{%1, %0|%0, %1}";
12885       static const char ucomisd[] = "vucomisd\t{%1, %0|%0, %1}";
12886       static const char comiss[] = "vcomiss\t{%1, %0|%0, %1}";
12887       static const char comisd[] = "vcomisd\t{%1, %0|%0, %1}";
12888 
12889       if (GET_MODE (operands[0]) == SFmode)
12890 	if (unordered_p)
12891 	  return &ucomiss[TARGET_AVX ? 0 : 1];
12892 	else
12893 	  return &comiss[TARGET_AVX ? 0 : 1];
12894       else
12895 	if (unordered_p)
12896 	  return &ucomisd[TARGET_AVX ? 0 : 1];
12897 	else
12898 	  return &comisd[TARGET_AVX ? 0 : 1];
12899     }
12900 
12901   gcc_assert (STACK_TOP_P (cmp_op0));
12902 
12903   stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
12904 
12905   if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
12906     {
12907       if (stack_top_dies)
12908 	{
12909 	  output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
12910 	  return output_387_ffreep (operands, 1);
12911 	}
12912       else
12913 	return "ftst\n\tfnstsw\t%0";
12914     }
12915 
12916   if (STACK_REG_P (cmp_op1)
12917       && stack_top_dies
12918       && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
12919       && REGNO (cmp_op1) != FIRST_STACK_REG)
12920     {
12921       /* If both the top of the 387 stack and the other operand (also a
12922 	 stack register) die, then this must be a `fcompp' float
12923 	 compare.  */
12924 
12925       if (eflags_p)
12926 	{
12927 	  /* There is no double popping fcomi variant.  Fortunately,
12928 	     eflags is immune from the fstp's cc clobbering.  */
12929 	  if (unordered_p)
12930 	    output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
12931 	  else
12932 	    output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
12933 	  return output_387_ffreep (operands, 0);
12934 	}
12935       else
12936 	{
12937 	  if (unordered_p)
12938 	    return "fucompp\n\tfnstsw\t%0";
12939 	  else
12940 	    return "fcompp\n\tfnstsw\t%0";
12941 	}
12942     }
12943   else
12944     {
12945       /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies.  */
12946 
12947       static const char * const alt[16] =
12948       {
12949 	"fcom%Z2\t%y2\n\tfnstsw\t%0",
12950 	"fcomp%Z2\t%y2\n\tfnstsw\t%0",
12951 	"fucom%Z2\t%y2\n\tfnstsw\t%0",
12952 	"fucomp%Z2\t%y2\n\tfnstsw\t%0",
12953 
12954 	"ficom%Z2\t%y2\n\tfnstsw\t%0",
12955 	"ficomp%Z2\t%y2\n\tfnstsw\t%0",
12956 	NULL,
12957 	NULL,
12958 
12959 	"fcomi\t{%y1, %0|%0, %y1}",
12960 	"fcomip\t{%y1, %0|%0, %y1}",
12961 	"fucomi\t{%y1, %0|%0, %y1}",
12962 	"fucomip\t{%y1, %0|%0, %y1}",
12963 
12964 	NULL,
12965 	NULL,
12966 	NULL,
12967 	NULL
12968       };
12969 
12970       int mask;
12971       const char *ret;
12972 
12973       mask  = eflags_p << 3;
12974       mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
12975       mask |= unordered_p << 1;
12976       mask |= stack_top_dies;
12977 
12978       gcc_assert (mask < 16);
12979       ret = alt[mask];
12980       gcc_assert (ret);
12981 
12982       return ret;
12983     }
12984 }
12985 
12986 void
12987 ix86_output_addr_vec_elt (FILE *file, int value)
12988 {
12989   const char *directive = ASM_LONG;
12990 
12991 #ifdef ASM_QUAD
12992   if (TARGET_64BIT)
12993     directive = ASM_QUAD;
12994 #else
12995   gcc_assert (!TARGET_64BIT);
12996 #endif
12997 
12998   fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
12999 }
13000 
13001 void
13002 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
13003 {
13004   const char *directive = ASM_LONG;
13005 
13006 #ifdef ASM_QUAD
13007   if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
13008     directive = ASM_QUAD;
13009 #else
13010   gcc_assert (!TARGET_64BIT);
13011 #endif
13012   /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand.  */
13013   if (TARGET_64BIT || TARGET_VXWORKS_RTP)
13014     fprintf (file, "%s%s%d-%s%d\n",
13015 	     directive, LPREFIX, value, LPREFIX, rel);
13016   else if (HAVE_AS_GOTOFF_IN_DATA)
13017     fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
13018 #if TARGET_MACHO
13019   else if (TARGET_MACHO)
13020     {
13021       fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
13022       machopic_output_function_base_name (file);
13023       putc ('\n', file);
13024     }
13025 #endif
13026   else
13027     asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
13028 		 GOT_SYMBOL_NAME, LPREFIX, value);
13029 }
13030 
13031 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
13032    for the target.  */
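/* For instance (a hedged sketch; the choice below depends on tuning):
   clearing %eax normally becomes "xorl %eax, %eax", wrapped in a PARALLEL
   with an explicit flags clobber, while a TARGET_USE_MOV0 tuning that is
   not optimizing for speed keeps the plain "movl $0, %eax" form.  */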
13033 
13034 void
13035 ix86_expand_clear (rtx dest)
13036 {
13037   rtx tmp;
13038 
13039   /* We play register width games, which are only valid after reload.  */
13040   gcc_assert (reload_completed);
13041 
13042   /* Avoid HImode and its attendant prefix byte.  */
13043   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
13044     dest = gen_rtx_REG (SImode, REGNO (dest));
13045   tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
13046 
13047   /* This predicate should match that for movsi_xor and movdi_xor_rex64.  */
13048   if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
13049     {
13050       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
13051       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
13052     }
13053 
13054   emit_insn (tmp);
13055 }
13056 
13057 /* X is an unchanging MEM.  If it is a constant pool reference, return
13058    the constant pool rtx, else NULL.  */
13059 
13060 rtx
13061 maybe_get_pool_constant (rtx x)
13062 {
13063   x = ix86_delegitimize_address (XEXP (x, 0));
13064 
13065   if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
13066     return get_pool_constant (x);
13067 
13068   return NULL_RTX;
13069 }
13070 
13071 void
13072 ix86_expand_move (enum machine_mode mode, rtx operands[])
13073 {
13074   rtx op0, op1;
13075   enum tls_model model;
13076 
13077   op0 = operands[0];
13078   op1 = operands[1];
13079 
13080   if (GET_CODE (op1) == SYMBOL_REF)
13081     {
13082       model = SYMBOL_REF_TLS_MODEL (op1);
13083       if (model)
13084 	{
13085 	  op1 = legitimize_tls_address (op1, model, true);
13086 	  op1 = force_operand (op1, op0);
13087 	  if (op1 == op0)
13088 	    return;
13089 	}
13090       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
13091 	       && SYMBOL_REF_DLLIMPORT_P (op1))
13092 	op1 = legitimize_dllimport_symbol (op1, false);
13093     }
13094   else if (GET_CODE (op1) == CONST
13095 	   && GET_CODE (XEXP (op1, 0)) == PLUS
13096 	   && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
13097     {
13098       rtx addend = XEXP (XEXP (op1, 0), 1);
13099       rtx symbol = XEXP (XEXP (op1, 0), 0);
13100       rtx tmp = NULL;
13101 
13102       model = SYMBOL_REF_TLS_MODEL (symbol);
13103       if (model)
13104 	tmp = legitimize_tls_address (symbol, model, true);
13105       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
13106 	       && SYMBOL_REF_DLLIMPORT_P (symbol))
13107 	tmp = legitimize_dllimport_symbol (symbol, true);
13108 
13109       if (tmp)
13110 	{
13111 	  tmp = force_operand (tmp, NULL);
13112 	  tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
13113 				     op0, 1, OPTAB_DIRECT);
13114 	  if (tmp == op0)
13115 	    return;
13116 	}
13117     }
13118 
13119   if (flag_pic && mode == Pmode && symbolic_operand (op1, Pmode))
13120     {
13121       if (TARGET_MACHO && !TARGET_64BIT)
13122 	{
13123 #if TARGET_MACHO
13124 	  if (MACHOPIC_PURE)
13125 	    {
13126 	      rtx temp = ((reload_in_progress
13127 			   || ((op0 && REG_P (op0))
13128 			       && mode == Pmode))
13129 			  ? op0 : gen_reg_rtx (Pmode));
13130 	      op1 = machopic_indirect_data_reference (op1, temp);
13131 	      op1 = machopic_legitimize_pic_address (op1, mode,
13132 						     temp == op1 ? 0 : temp);
13133 	    }
13134 	  else if (MACHOPIC_INDIRECT)
13135 	    op1 = machopic_indirect_data_reference (op1, 0);
13136 	  if (op0 == op1)
13137 	    return;
13138 #endif
13139 	}
13140       else
13141 	{
13142 	  if (MEM_P (op0))
13143 	    op1 = force_reg (Pmode, op1);
13144 	  else if (!TARGET_64BIT || !x86_64_movabs_operand (op1, Pmode))
13145 	    {
13146 	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
13147 	      op1 = legitimize_pic_address (op1, reg);
13148 	      if (op0 == op1)
13149 		return;
13150 	    }
13151 	}
13152     }
13153   else
13154     {
13155       if (MEM_P (op0)
13156 	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
13157 	      || !push_operand (op0, mode))
13158 	  && MEM_P (op1))
13159 	op1 = force_reg (mode, op1);
13160 
13161       if (push_operand (op0, mode)
13162 	  && ! general_no_elim_operand (op1, mode))
13163 	op1 = copy_to_mode_reg (mode, op1);
13164 
13165       /* Force large constants in 64-bit compilation into a register
13166 	 so that they get CSEed.  */
13167       if (can_create_pseudo_p ()
13168 	  && (mode == DImode) && TARGET_64BIT
13169 	  && immediate_operand (op1, mode)
13170 	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
13171 	  && !register_operand (op0, mode)
13172 	  && optimize)
13173 	op1 = copy_to_mode_reg (mode, op1);
13174 
13175       if (can_create_pseudo_p ()
13176 	  && FLOAT_MODE_P (mode)
13177 	  && GET_CODE (op1) == CONST_DOUBLE)
13178 	{
13179 	  /* If we are loading a floating point constant to a register,
13180 	     force the value to memory now, since we'll get better code
13181 	     out the back end.  */
13182 
13183 	  op1 = validize_mem (force_const_mem (mode, op1));
13184 	  if (!register_operand (op0, mode))
13185 	    {
13186 	      rtx temp = gen_reg_rtx (mode);
13187 	      emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
13188 	      emit_move_insn (op0, temp);
13189 	      return;
13190 	    }
13191 	}
13192     }
13193 
13194   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
13195 }
13196 
13197 void
13198 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
13199 {
13200   rtx op0 = operands[0], op1 = operands[1];
13201   unsigned int align = GET_MODE_ALIGNMENT (mode);
13202 
13203   /* Force constants other than zero into memory.  We do not know how
13204      the instructions used to build constants modify the upper 64 bits
13205      of the register; once we have that information we may be able
13206      to handle some of them more efficiently.  */
13207   if (can_create_pseudo_p ()
13208       && register_operand (op0, mode)
13209       && (CONSTANT_P (op1)
13210 	  || (GET_CODE (op1) == SUBREG
13211 	      && CONSTANT_P (SUBREG_REG (op1))))
13212       && !standard_sse_constant_p (op1))
13213     op1 = validize_mem (force_const_mem (mode, op1));
13214 
13215   /* We need to check memory alignment for SSE modes since attributes
13216      can make operands unaligned.  */
13217   if (can_create_pseudo_p ()
13218       && SSE_REG_MODE_P (mode)
13219       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
13220 	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
13221     {
13222       rtx tmp[2];
13223 
13224       /* ix86_expand_vector_move_misalign() does not like constants ... */
13225       if (CONSTANT_P (op1)
13226 	  || (GET_CODE (op1) == SUBREG
13227 	      && CONSTANT_P (SUBREG_REG (op1))))
13228 	op1 = validize_mem (force_const_mem (mode, op1));
13229 
13230       /* ... nor both arguments in memory.  */
13231       if (!register_operand (op0, mode)
13232 	  && !register_operand (op1, mode))
13233 	op1 = force_reg (mode, op1);
13234 
13235       tmp[0] = op0; tmp[1] = op1;
13236       ix86_expand_vector_move_misalign (mode, tmp);
13237       return;
13238     }
13239 
13240   /* Make operand1 a register if it isn't already.  */
13241   if (can_create_pseudo_p ()
13242       && !register_operand (op0, mode)
13243       && !register_operand (op1, mode))
13244     {
13245       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
13246       return;
13247     }
13248 
13249   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
13250 }
13251 
13252 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
13253    straight to ix86_expand_vector_move.  */
13254 /* Code generation for scalar reg-reg moves of single and double precision data:
13255      if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
13256        movaps reg, reg
13257      else
13258        movss reg, reg
13259      if (x86_sse_partial_reg_dependency == true)
13260        movapd reg, reg
13261      else
13262        movsd reg, reg
13263 
13264    Code generation for scalar loads of double precision data:
13265      if (x86_sse_split_regs == true)
13266        movlpd mem, reg      (gas syntax)
13267      else
13268        movsd mem, reg
13269 
13270    Code generation for unaligned packed loads of single precision data
13271    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
13272      if (x86_sse_unaligned_move_optimal)
13273        movups mem, reg
13274 
13275      if (x86_sse_partial_reg_dependency == true)
13276        {
13277          xorps  reg, reg
13278          movlps mem, reg
13279          movhps mem+8, reg
13280        }
13281      else
13282        {
13283          movlps mem, reg
13284          movhps mem+8, reg
13285        }
13286 
13287    Code generation for unaligned packed loads of double precision data
13288    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
13289      if (x86_sse_unaligned_move_optimal)
13290        movupd mem, reg
13291 
13292      if (x86_sse_split_regs == true)
13293        {
13294          movlpd mem, reg
13295          movhpd mem+8, reg
13296        }
13297      else
13298        {
13299          movsd  mem, reg
13300          movhpd mem+8, reg
13301        }
13302  */
13303 
13304 void
13305 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
13306 {
13307   rtx op0, op1, m;
13308 
13309   op0 = operands[0];
13310   op1 = operands[1];
13311 
13312   if (TARGET_AVX)
13313     {
13314       switch (GET_MODE_CLASS (mode))
13315 	{
13316 	case MODE_VECTOR_INT:
13317 	case MODE_INT:
13318 	  switch (GET_MODE_SIZE (mode))
13319 	    {
13320 	    case 16:
13321 	      op0 = gen_lowpart (V16QImode, op0);
13322 	      op1 = gen_lowpart (V16QImode, op1);
13323 	      emit_insn (gen_avx_movdqu (op0, op1));
13324 	      break;
13325 	    case 32:
13326 	      op0 = gen_lowpart (V32QImode, op0);
13327 	      op1 = gen_lowpart (V32QImode, op1);
13328 	      emit_insn (gen_avx_movdqu256 (op0, op1));
13329 	      break;
13330 	    default:
13331 	      gcc_unreachable ();
13332 	    }
13333 	  break;
13334 	case MODE_VECTOR_FLOAT:
13335 	  op0 = gen_lowpart (mode, op0);
13336 	  op1 = gen_lowpart (mode, op1);
13337 
13338 	  switch (mode)
13339 	    {
13340 	    case V4SFmode:
13341 	      emit_insn (gen_avx_movups (op0, op1));
13342 	      break;
13343 	    case V8SFmode:
13344 	      emit_insn (gen_avx_movups256 (op0, op1));
13345 	      break;
13346 	    case V2DFmode:
13347 	      emit_insn (gen_avx_movupd (op0, op1));
13348 	      break;
13349 	    case V4DFmode:
13350 	      emit_insn (gen_avx_movupd256 (op0, op1));
13351 	      break;
13352 	    default:
13353 	      gcc_unreachable ();
13354 	    }
13355 	  break;
13356 
13357 	default:
13358 	  gcc_unreachable ();
13359 	}
13360 
13361       return;
13362     }
13363 
13364   if (MEM_P (op1))
13365     {
13366       /* If we're optimizing for size, movups is the smallest.  */
13367       if (optimize_insn_for_size_p ())
13368 	{
13369 	  op0 = gen_lowpart (V4SFmode, op0);
13370 	  op1 = gen_lowpart (V4SFmode, op1);
13371 	  emit_insn (gen_sse_movups (op0, op1));
13372 	  return;
13373 	}
13374 
13375       /* ??? If we have typed data, then it would appear that using
13376 	 movdqu is the only way to get unaligned data loaded with
13377 	 integer type.  */
13378       if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
13379 	{
13380 	  op0 = gen_lowpart (V16QImode, op0);
13381 	  op1 = gen_lowpart (V16QImode, op1);
13382 	  emit_insn (gen_sse2_movdqu (op0, op1));
13383 	  return;
13384 	}
13385 
13386       if (TARGET_SSE2 && mode == V2DFmode)
13387         {
13388           rtx zero;
13389 
13390           if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
13391             {
13392               op0 = gen_lowpart (V2DFmode, op0);
13393               op1 = gen_lowpart (V2DFmode, op1);
13394               emit_insn (gen_sse2_movupd (op0, op1));
13395               return;
13396             }
13397 
13398 	  /* When SSE registers are split into halves, we can avoid
13399 	     writing to the top half twice.  */
13400 	  if (TARGET_SSE_SPLIT_REGS)
13401 	    {
13402 	      emit_clobber (op0);
13403 	      zero = op0;
13404 	    }
13405 	  else
13406 	    {
13407 	      /* ??? Not sure about the best option for the Intel chips.
13408 		 The following would seem to satisfy; the register is
13409 		 entirely cleared, breaking the dependency chain.  We
13410 		 then store to the upper half, with a dependency depth
13411 		 of one.  A rumor has it that Intel recommends two movsd
13412 		 followed by an unpacklpd, but this is unconfirmed.  And
13413 		 given that the dependency depth of the unpacklpd would
13414 		 still be one, I'm not sure why this would be better.  */
13415 	      zero = CONST0_RTX (V2DFmode);
13416 	    }
13417 
13418 	  m = adjust_address (op1, DFmode, 0);
13419 	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
13420 	  m = adjust_address (op1, DFmode, 8);
13421 	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
13422 	}
13423       else
13424         {
13425           if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
13426             {
13427               op0 = gen_lowpart (V4SFmode, op0);
13428               op1 = gen_lowpart (V4SFmode, op1);
13429               emit_insn (gen_sse_movups (op0, op1));
13430               return;
13431             }
13432 
13433 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
13434 	    emit_move_insn (op0, CONST0_RTX (mode));
13435 	  else
13436 	    emit_clobber (op0);
13437 
13438 	  if (mode != V4SFmode)
13439 	    op0 = gen_lowpart (V4SFmode, op0);
13440 	  m = adjust_address (op1, V2SFmode, 0);
13441 	  emit_insn (gen_sse_loadlps (op0, op0, m));
13442 	  m = adjust_address (op1, V2SFmode, 8);
13443 	  emit_insn (gen_sse_loadhps (op0, op0, m));
13444 	}
13445     }
13446   else if (MEM_P (op0))
13447     {
13448       /* If we're optimizing for size, movups is the smallest.  */
13449       if (optimize_insn_for_size_p ())
13450 	{
13451 	  op0 = gen_lowpart (V4SFmode, op0);
13452 	  op1 = gen_lowpart (V4SFmode, op1);
13453 	  emit_insn (gen_sse_movups (op0, op1));
13454 	  return;
13455 	}
13456 
13457       /* ??? Similar to above, only less clear because of quote
13458 	 typeless stores unquote.  */
13459       if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
13460 	  && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
13461         {
13462 	  op0 = gen_lowpart (V16QImode, op0);
13463 	  op1 = gen_lowpart (V16QImode, op1);
13464 	  emit_insn (gen_sse2_movdqu (op0, op1));
13465 	  return;
13466 	}
13467 
13468       if (TARGET_SSE2 && mode == V2DFmode)
13469 	{
13470 	  m = adjust_address (op0, DFmode, 0);
13471 	  emit_insn (gen_sse2_storelpd (m, op1));
13472 	  m = adjust_address (op0, DFmode, 8);
13473 	  emit_insn (gen_sse2_storehpd (m, op1));
13474 	}
13475       else
13476 	{
13477 	  if (mode != V4SFmode)
13478 	    op1 = gen_lowpart (V4SFmode, op1);
13479 	  m = adjust_address (op0, V2SFmode, 0);
13480 	  emit_insn (gen_sse_storelps (m, op1));
13481 	  m = adjust_address (op0, V2SFmode, 8);
13482 	  emit_insn (gen_sse_storehps (m, op1));
13483 	}
13484     }
13485   else
13486     gcc_unreachable ();
13487 }
13488 
13489 /* Expand a push in MODE.  This is some mode for which we do not support
13490    proper push instructions, at least from the registers that we expect
13491    the value to live in.  */
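/* A rough sketch of the expansion (illustrative only; the exact insns
   depend on Pmode and on what expand_simple_binop picks for the stack
   pointer update): pushing a 16-byte SSE value on x86_64 comes out as
   roughly

	subq	$16, %rsp
	movups	%xmm0, (%rsp)

   with the store possibly unaligned, as the alignment caveat inside the
   function notes.  */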
13492 
13493 void
13494 ix86_expand_push (enum machine_mode mode, rtx x)
13495 {
13496   rtx tmp;
13497 
13498   tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
13499 			     GEN_INT (-GET_MODE_SIZE (mode)),
13500 			     stack_pointer_rtx, 1, OPTAB_DIRECT);
13501   if (tmp != stack_pointer_rtx)
13502     emit_move_insn (stack_pointer_rtx, tmp);
13503 
13504   tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
13505 
13506   /* When we push an operand onto the stack, it has to be aligned at least
13507      at the function argument boundary.  However, since we don't have
13508      the argument type, we can't determine the actual argument
13509      boundary.  */
13510   emit_move_insn (tmp, x);
13511 }
13512 
13513 /* Helper function of ix86_fixup_binary_operands to canonicalize
13514    operand order.  Returns true if the operands should be swapped.  */
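/* For example (illustrative cases only): for a commutative PLUS where the
   destination equals operands[2] but not operands[1], swapping lets src1
   match the destination, as the two-address integer patterns prefer; an
   immediate or memory operand in the src1 slot is likewise pushed into
   the src2 slot.  */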
13515 
13516 static bool
13517 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
13518 			     rtx operands[])
13519 {
13520   rtx dst = operands[0];
13521   rtx src1 = operands[1];
13522   rtx src2 = operands[2];
13523 
13524   /* If the operation is not commutative, we can't do anything.  */
13525   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
13526     return false;
13527 
13528   /* Highest priority is that src1 should match dst.  */
13529   if (rtx_equal_p (dst, src1))
13530     return false;
13531   if (rtx_equal_p (dst, src2))
13532     return true;
13533 
13534   /* Next highest priority is that immediate constants come second.  */
13535   if (immediate_operand (src2, mode))
13536     return false;
13537   if (immediate_operand (src1, mode))
13538     return true;
13539 
13540   /* Lowest priority is that memory references should come second.  */
13541   if (MEM_P (src2))
13542     return false;
13543   if (MEM_P (src1))
13544     return true;
13545 
13546   return false;
13547 }
13548 
13549 
13550 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
13551    destination to use for the operation.  If different from the true
13552    destination in operands[0], a copy operation will be required.  */
13553 
13554 rtx
13555 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
13556 			    rtx operands[])
13557 {
13558   rtx dst = operands[0];
13559   rtx src1 = operands[1];
13560   rtx src2 = operands[2];
13561 
13562   /* Canonicalize operand order.  */
13563   if (ix86_swap_binary_operands_p (code, mode, operands))
13564     {
13565       rtx temp;
13566 
13567       /* It is invalid to swap operands of different modes.  */
13568       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
13569 
13570       temp = src1;
13571       src1 = src2;
13572       src2 = temp;
13573     }
13574 
13575   /* Both source operands cannot be in memory.  */
13576   if (MEM_P (src1) && MEM_P (src2))
13577     {
13578       /* Optimization: Only read from memory once.  */
13579       if (rtx_equal_p (src1, src2))
13580 	{
13581 	  src2 = force_reg (mode, src2);
13582 	  src1 = src2;
13583 	}
13584       else
13585 	src2 = force_reg (mode, src2);
13586     }
13587 
13588   /* If the destination is memory, and we do not have matching source
13589      operands, do things in registers.  */
13590   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
13591     dst = gen_reg_rtx (mode);
13592 
13593   /* Source 1 cannot be a constant.  */
13594   if (CONSTANT_P (src1))
13595     src1 = force_reg (mode, src1);
13596 
13597   /* Source 1 cannot be a non-matching memory.  */
13598   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
13599     src1 = force_reg (mode, src1);
13600 
13601   operands[1] = src1;
13602   operands[2] = src2;
13603   return dst;
13604 }
13605 
13606 /* Similarly, but assume that the destination has already been
13607    set up properly.  */
13608 
13609 void
13610 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
13611 				    enum machine_mode mode, rtx operands[])
13612 {
13613   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
13614   gcc_assert (dst == operands[0]);
13615 }
13616 
13617 /* Attempt to expand a binary operator.  Make the expansion closer to the
13618    actual machine than just general_operand, which would allow 3 separate
13619    memory references (one output, two inputs) in a single insn.  */
13620 
13621 void
13622 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
13623 			     rtx operands[])
13624 {
13625   rtx src1, src2, dst, op, clob;
13626 
13627   dst = ix86_fixup_binary_operands (code, mode, operands);
13628   src1 = operands[1];
13629   src2 = operands[2];
13630 
13631  /* Emit the instruction.  */
13632 
13633   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
13634   if (reload_in_progress)
13635     {
13636       /* Reload doesn't know about the flags register, and doesn't know that
13637          it doesn't want to clobber it.  We can only do this with PLUS.  */
13638       gcc_assert (code == PLUS);
13639       emit_insn (op);
13640     }
13641   else
13642     {
13643       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
13644       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
13645     }
13646 
13647   /* Fix up the destination if needed.  */
13648   if (dst != operands[0])
13649     emit_move_insn (operands[0], dst);
13650 }
13651 
13652 /* Return TRUE or FALSE depending on whether the binary operator meets the
13653    appropriate constraints.  */
13654 
13655 int
13656 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
13657 			 rtx operands[3])
13658 {
13659   rtx dst = operands[0];
13660   rtx src1 = operands[1];
13661   rtx src2 = operands[2];
13662 
13663   /* Both source operands cannot be in memory.  */
13664   if (MEM_P (src1) && MEM_P (src2))
13665     return 0;
13666 
13667   /* Canonicalize operand order for commutative operators.  */
13668   if (ix86_swap_binary_operands_p (code, mode, operands))
13669     {
13670       rtx temp = src1;
13671       src1 = src2;
13672       src2 = temp;
13673     }
13674 
13675   /* If the destination is memory, we must have a matching source operand.  */
13676   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
13677       return 0;
13678 
13679   /* Source 1 cannot be a constant.  */
13680   if (CONSTANT_P (src1))
13681     return 0;
13682 
13683   /* Source 1 cannot be a non-matching memory.  */
13684   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
13685     return 0;
13686 
13687   return 1;
13688 }
13689 
13690 /* Attempt to expand a unary operator.  Make the expansion closer to the
13691    actual machine than just general_operand, which would allow 2 separate
13692    memory references (one output, one input) in a single insn.  */
13693 
13694 void
13695 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
13696 			    rtx operands[])
13697 {
13698   int matching_memory;
13699   rtx src, dst, op, clob;
13700 
13701   dst = operands[0];
13702   src = operands[1];
13703 
13704   /* If the destination is memory, and we do not have matching source
13705      operands, do things in registers.  */
13706   matching_memory = 0;
13707   if (MEM_P (dst))
13708     {
13709       if (rtx_equal_p (dst, src))
13710 	matching_memory = 1;
13711       else
13712 	dst = gen_reg_rtx (mode);
13713     }
13714 
13715   /* When source operand is memory, destination must match.  */
13716   if (MEM_P (src) && !matching_memory)
13717     src = force_reg (mode, src);
13718 
13719   /* Emit the instruction.  */
13720 
13721   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
13722   if (reload_in_progress || code == NOT)
13723     {
13724       /* Reload doesn't know about the flags register, and doesn't know that
13725          it doesn't want to clobber it.  */
13726       gcc_assert (code == NOT);
13727       emit_insn (op);
13728     }
13729   else
13730     {
13731       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
13732       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
13733     }
13734 
13735   /* Fix up the destination if needed.  */
13736   if (dst != operands[0])
13737     emit_move_insn (operands[0], dst);
13738 }
13739 
13740 #define LEA_SEARCH_THRESHOLD 12
13741 
13742 /* Search backward for a non-agu definition of register number REGNO1
13743    or register number REGNO2 in INSN's basic block until we
13744    1. pass LEA_SEARCH_THRESHOLD instructions, or
13745    2. reach the BB boundary, or
13746    3. reach an agu definition.
13747    Return the distance between the non-agu definition point and INSN.
13748    If there is no such definition point, return -1.  */
13749 
13750 static int
13751 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
13752 			 rtx insn)
13753 {
13754   basic_block bb = BLOCK_FOR_INSN (insn);
13755   int distance = 0;
13756   df_ref *def_rec;
13757   enum attr_type insn_type;
13758 
13759   if (insn != BB_HEAD (bb))
13760     {
13761       rtx prev = PREV_INSN (insn);
13762       while (prev && distance < LEA_SEARCH_THRESHOLD)
13763 	{
13764 	  if (NONDEBUG_INSN_P (prev))
13765 	    {
13766 	      distance++;
13767               for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
13768                 if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
13769                     && !DF_REF_IS_ARTIFICIAL (*def_rec)
13770                     && (regno1 == DF_REF_REGNO (*def_rec)
13771 			|| regno2 == DF_REF_REGNO (*def_rec)))
13772 		  {
13773 		    insn_type = get_attr_type (prev);
13774 		    if (insn_type != TYPE_LEA)
13775 		      goto done;
13776 		  }
13777 	    }
13778 	  if (prev == BB_HEAD (bb))
13779 	    break;
13780 	  prev = PREV_INSN (prev);
13781 	}
13782     }
13783 
13784   if (distance < LEA_SEARCH_THRESHOLD)
13785     {
13786       edge e;
13787       edge_iterator ei;
13788       bool simple_loop = false;
13789 
13790       FOR_EACH_EDGE (e, ei, bb->preds)
13791 	if (e->src == bb)
13792 	  {
13793 	    simple_loop = true;
13794 	    break;
13795 	  }
13796 
13797       if (simple_loop)
13798 	{
13799 	  rtx prev = BB_END (bb);
13800 	  while (prev
13801 		 && prev != insn
13802 		 && distance < LEA_SEARCH_THRESHOLD)
13803 	    {
13804 	      if (NONDEBUG_INSN_P (prev))
13805 		{
13806 		  distance++;
13807 		  for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
13808 		    if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
13809 			&& !DF_REF_IS_ARTIFICIAL (*def_rec)
13810 			&& (regno1 == DF_REF_REGNO (*def_rec)
13811 			    || regno2 == DF_REF_REGNO (*def_rec)))
13812 		      {
13813 			insn_type = get_attr_type (prev);
13814 			if (insn_type != TYPE_LEA)
13815 			  goto done;
13816 		      }
13817 		}
13818 	      prev = PREV_INSN (prev);
13819 	    }
13820 	}
13821     }
13822 
13823   distance = -1;
13824 
13825 done:
13826   /* get_attr_type may modify recog data.  We want to make sure
13827      that recog data is valid for instruction INSN, on which
13828      distance_non_agu_define is called.  INSN is unchanged here.  */
13829   extract_insn_cached (insn);
13830   return distance;
13831 }
13832 
13833 /* Return the distance between INSN and the next insn that uses
13834    register number REGNO0 in a memory address.  Return -1 if no such
13835    use is found within LEA_SEARCH_THRESHOLD insns or if REGNO0 is set.  */
13836 
13837 static int
13838 distance_agu_use (unsigned int regno0, rtx insn)
13839 {
13840   basic_block bb = BLOCK_FOR_INSN (insn);
13841   int distance = 0;
13842   df_ref *def_rec;
13843   df_ref *use_rec;
13844 
13845   if (insn != BB_END (bb))
13846     {
13847       rtx next = NEXT_INSN (insn);
13848       while (next && distance < LEA_SEARCH_THRESHOLD)
13849 	{
13850 	  if (NONDEBUG_INSN_P (next))
13851 	    {
13852 	      distance++;
13853 
13854 	      for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
13855 		if ((DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_LOAD
13856 		     || DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_STORE)
13857 		    && regno0 == DF_REF_REGNO (*use_rec))
13858 		  {
13859 		    /* Return DISTANCE if OP0 is used in memory
13860 		       address in NEXT.  */
13861 		    return distance;
13862 		  }
13863 
13864 	      for (def_rec = DF_INSN_DEFS (next); *def_rec; def_rec++)
13865 		if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
13866 		    && !DF_REF_IS_ARTIFICIAL (*def_rec)
13867 		    && regno0 == DF_REF_REGNO (*def_rec))
13868 		  {
13869 		    /* Return -1 if OP0 is set in NEXT.  */
13870 		    return -1;
13871 		  }
13872 	    }
13873 	  if (next == BB_END (bb))
13874 	    break;
13875 	  next = NEXT_INSN (next);
13876 	}
13877     }
13878 
13879   if (distance < LEA_SEARCH_THRESHOLD)
13880     {
13881       edge e;
13882       edge_iterator ei;
13883       bool simple_loop = false;
13884 
13885       FOR_EACH_EDGE (e, ei, bb->succs)
13886         if (e->dest == bb)
13887 	  {
13888 	    simple_loop = true;
13889 	    break;
13890 	  }
13891 
13892       if (simple_loop)
13893 	{
13894 	  rtx next = BB_HEAD (bb);
13895 	  while (next
13896 		 && next != insn
13897 		 && distance < LEA_SEARCH_THRESHOLD)
13898 	    {
13899 	      if (NONDEBUG_INSN_P (next))
13900 		{
13901 		  distance++;
13902 
13903 		  for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
13904 		    if ((DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_LOAD
13905 			 || DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_STORE)
13906 			&& regno0 == DF_REF_REGNO (*use_rec))
13907 		      {
13908 			/* Return DISTANCE if OP0 is used in memory
13909 			   address in NEXT.  */
13910 			return distance;
13911 		      }
13912 
13913 		  for (def_rec = DF_INSN_DEFS (next); *def_rec; def_rec++)
13914 		    if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
13915 			&& !DF_REF_IS_ARTIFICIAL (*def_rec)
13916 			&& regno0 == DF_REF_REGNO (*def_rec))
13917 		      {
13918 			/* Return -1 if OP0 is set in NEXT.  */
13919 			return -1;
13920 		      }
13921 
13922 		}
13923 	      next = NEXT_INSN (next);
13924 	    }
13925 	}
13926     }
13927 
13928   return -1;
13929 }
13930 
13931 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
13932    there is a choice between an LEA and an ADD:
13933    Negative value: ADD is preferred over LEA
13934    Zero: neutral
13935    Positive value: LEA is preferred over ADD.  */
13936 #define IX86_LEA_PRIORITY 2
13937 
13938 /* Return true if it is ok to optimize an ADD operation to an LEA
13939    operation to avoid flag register consumption.  For processors
13940    like ATOM, if the destination register of the LEA holds an actual
13941    address which will be used soon, LEA is better, and otherwise ADD
13942    is better.  */
13943 
13944 bool
13945 ix86_lea_for_add_ok (enum rtx_code code ATTRIBUTE_UNUSED,
13946                      rtx insn, rtx operands[])
13947 {
13948   unsigned int regno0 = true_regnum (operands[0]);
13949   unsigned int regno1 = true_regnum (operands[1]);
13950   unsigned int regno2;
13951 
13952   if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
13953     return regno0 != regno1;
13954 
13955   regno2 = true_regnum (operands[2]);
13956 
13957   /* If a = b + c, (a!=b && a!=c), must use lea form. */
13958   /* If a = b + c and a != b and a != c, we must use the lea form.  */
13959     return true;
13960   else
13961     {
13962       int dist_define, dist_use;
13963       dist_define = distance_non_agu_define (regno1, regno2, insn);
13964       if (dist_define <= 0)
13965         return true;
13966 
13967       /* If this insn has both a backward non-agu dependence and a forward
13968          agu dependence, the one with the shorter distance takes effect.  */
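      /* A worked example with the current IX86_LEA_PRIORITY of 2
	 (illustrative numbers only): a non-agu definition 3 insns back and
	 an agu use 6 insns ahead gives 3 + 2 < 6, so ADD is chosen; if the
	 use were only 4 insns ahead, 3 + 2 >= 4 and the flags-free LEA
	 wins.  */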
13969       dist_use = distance_agu_use (regno0, insn);
13970       if (dist_use <= 0
13971 	  || (dist_define + IX86_LEA_PRIORITY) < dist_use)
13972         return false;
13973 
13974       return true;
13975     }
13976 }
13977 
13978 /* Return true if destination reg of SET_BODY is shift count of
13979    USE_BODY.  */
13980 
13981 static bool
13982 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
13983 {
13984   rtx set_dest;
13985   rtx shift_rtx;
13986   int i;
13987 
13988   /* Retrieve destination of SET_BODY.  */
13989   switch (GET_CODE (set_body))
13990     {
13991     case SET:
13992       set_dest = SET_DEST (set_body);
13993       if (!set_dest || !REG_P (set_dest))
13994 	return false;
13995       break;
13996     case PARALLEL:
13997       for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
13998 	if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
13999 					  use_body))
14000 	  return true;
14001     default:
14002       return false;
14003       break;
14004     }
14005 
14006   /* Retrieve shift count of USE_BODY.  */
14007   switch (GET_CODE (use_body))
14008     {
14009     case SET:
14010       shift_rtx = XEXP (use_body, 1);
14011       break;
14012     case PARALLEL:
14013       for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
14014 	if (ix86_dep_by_shift_count_body (set_body,
14015 					  XVECEXP (use_body, 0, i)))
14016 	  return true;
14017     default:
14018       return false;
14019       break;
14020     }
14021 
14022   if (shift_rtx
14023       && (GET_CODE (shift_rtx) == ASHIFT
14024 	  || GET_CODE (shift_rtx) == LSHIFTRT
14025 	  || GET_CODE (shift_rtx) == ASHIFTRT
14026 	  || GET_CODE (shift_rtx) == ROTATE
14027 	  || GET_CODE (shift_rtx) == ROTATERT))
14028     {
14029       rtx shift_count = XEXP (shift_rtx, 1);
14030 
14031       /* Return true if shift count is dest of SET_BODY.  */
14032       if (REG_P (shift_count)
14033 	  && true_regnum (set_dest) == true_regnum (shift_count))
14034 	return true;
14035     }
14036 
14037   return false;
14038 }
14039 
14040 /* Return true if destination reg of SET_INSN is shift count of
14041    USE_INSN.  */
14042 
14043 bool
14044 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
14045 {
14046   return ix86_dep_by_shift_count_body (PATTERN (set_insn),
14047 				       PATTERN (use_insn));
14048 }
14049 
14050 /* Return TRUE or FALSE depending on whether the unary operator meets the
14051    appropriate constraints.  */
14052 
14053 int
14054 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
14055 			enum machine_mode mode ATTRIBUTE_UNUSED,
14056 			rtx operands[2] ATTRIBUTE_UNUSED)
14057 {
14058   /* If one of operands is memory, source and destination must match.  */
14059   if ((MEM_P (operands[0])
14060        || MEM_P (operands[1]))
14061       && ! rtx_equal_p (operands[0], operands[1]))
14062     return FALSE;
14063   return TRUE;
14064 }
14065 
14066 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
14067    are ok, keeping in mind the possible movddup alternative.  */
14068 
14069 bool
14070 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
14071 {
14072   if (MEM_P (operands[0]))
14073     return rtx_equal_p (operands[0], operands[1 + high]);
14074   if (MEM_P (operands[1]) && MEM_P (operands[2]))
14075     return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
14076   return true;
14077 }
14078 
14079 /* Post-reload splitter for converting an SF or DFmode value in an
14080    SSE register into an unsigned SImode.  */
14081 
14082 void
14083 ix86_split_convert_uns_si_sse (rtx operands[])
14084 {
14085   enum machine_mode vecmode;
14086   rtx value, large, zero_or_two31, input, two31, x;
14087 
14088   large = operands[1];
14089   zero_or_two31 = operands[2];
14090   input = operands[3];
14091   two31 = operands[4];
14092   vecmode = GET_MODE (large);
14093   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
14094 
14095   /* Load up the value into the low element.  We must ensure that the other
14096      elements are valid floats -- zero is the easiest such value.  */
14097   if (MEM_P (input))
14098     {
14099       if (vecmode == V4SFmode)
14100 	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
14101       else
14102 	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
14103     }
14104   else
14105     {
14106       input = gen_rtx_REG (vecmode, REGNO (input));
14107       emit_move_insn (value, CONST0_RTX (vecmode));
14108       if (vecmode == V4SFmode)
14109 	emit_insn (gen_sse_movss (value, value, input));
14110       else
14111 	emit_insn (gen_sse2_movsd (value, value, input));
14112     }
14113 
14114   emit_move_insn (large, two31);
14115   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
14116 
14117   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
14118   emit_insn (gen_rtx_SET (VOIDmode, large, x));
14119 
14120   x = gen_rtx_AND (vecmode, zero_or_two31, large);
14121   emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
14122 
14123   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
14124   emit_insn (gen_rtx_SET (VOIDmode, value, x));
14125 
14126   large = gen_rtx_REG (V4SImode, REGNO (large));
14127   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
14128 
14129   x = gen_rtx_REG (V4SImode, REGNO (value));
14130   if (vecmode == V4SFmode)
14131     emit_insn (gen_sse2_cvttps2dq (x, value));
14132   else
14133     emit_insn (gen_sse2_cvttpd2dq (x, value));
14134   value = x;
14135 
14136   emit_insn (gen_xorv4si3 (value, value, large));
14137 }
14138 
14139 /* Convert an unsigned DImode value into a DFmode, using only SSE.
14140    Expects the 64-bit DImode to be supplied in a pair of integral
14141    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
14142    -mfpmath=sse, !optimize_size only.  */
14143 
14144 void
14145 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
14146 {
14147   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
14148   rtx int_xmm, fp_xmm;
14149   rtx biases, exponents;
14150   rtx x;
14151 
14152   int_xmm = gen_reg_rtx (V4SImode);
14153   if (TARGET_INTER_UNIT_MOVES)
14154     emit_insn (gen_movdi_to_sse (int_xmm, input));
14155   else if (TARGET_SSE_SPLIT_REGS)
14156     {
14157       emit_clobber (int_xmm);
14158       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
14159     }
14160   else
14161     {
14162       x = gen_reg_rtx (V2DImode);
14163       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
14164       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
14165     }
14166 
14167   x = gen_rtx_CONST_VECTOR (V4SImode,
14168 			    gen_rtvec (4, GEN_INT (0x43300000UL),
14169 				       GEN_INT (0x45300000UL),
14170 				       const0_rtx, const0_rtx));
14171   exponents = validize_mem (force_const_mem (V4SImode, x));
14172 
14173   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
14174   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
14175 
14176   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
14177      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
14178      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
14179      (0x1.0p84 + double(fp_value_hi_xmm)).
14180      Note these exponents differ by 32.  */
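  /* A worked example (an editorial illustration): for the 64-bit input
     0x0000000200000005 the low half yields 0x1.0p52 + 5.0 and the high
     half yields 0x1.0p84 + 2.0 * 0x1.0p32; after the bias subtraction
     below the two lanes hold 5.0 and 8589934592.0, and their sum
     8589934597.0 reproduces the input exactly.  */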
14181 
14182   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
14183 
14184   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
14185      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
14186   real_ldexp (&bias_lo_rvt, &dconst1, 52);
14187   real_ldexp (&bias_hi_rvt, &dconst1, 84);
14188   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
14189   x = const_double_from_real_value (bias_hi_rvt, DFmode);
14190   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
14191   biases = validize_mem (force_const_mem (V2DFmode, biases));
14192   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
14193 
14194   /* Add the upper and lower DFmode values together.  */
14195   if (TARGET_SSE3)
14196     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
14197   else
14198     {
14199       x = copy_to_mode_reg (V2DFmode, fp_xmm);
14200       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
14201       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
14202     }
14203 
14204   ix86_expand_vector_extract (false, target, fp_xmm, 0);
14205 }
14206 
14207 /* Not used, but eases macroization of patterns.  */
14208 void
14209 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
14210 				  rtx input ATTRIBUTE_UNUSED)
14211 {
14212   gcc_unreachable ();
14213 }
14214 
14215 /* Convert an unsigned SImode value into a DFmode.  Only currently used
14216    for SSE, but applicable anywhere.  */
14217 
14218 void
14219 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
14220 {
14221   REAL_VALUE_TYPE TWO31r;
14222   rtx x, fp;
14223 
14224   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
14225 			   NULL, 1, OPTAB_DIRECT);
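  /* An aside on the trick above (editorial note): adding -2^31 with SImode
     wraparound maps the unsigned input onto the signed range so that the
     ordinary signed floatsidf2 below can convert it; adding 2^31 back as a
     DFmode constant restores the unsigned value.  E.g. 0xffffffff wraps to
     0x7fffffff, converts to 2147483647.0, and adding 2147483648.0 gives
     4294967295.0.  */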
14226 
14227   fp = gen_reg_rtx (DFmode);
14228   emit_insn (gen_floatsidf2 (fp, x));
14229 
14230   real_ldexp (&TWO31r, &dconst1, 31);
14231   x = const_double_from_real_value (TWO31r, DFmode);
14232 
14233   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
14234   if (x != target)
14235     emit_move_insn (target, x);
14236 }
14237 
14238 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
14239    32-bit mode; otherwise we have a direct convert instruction.  */
14240 
14241 void
14242 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
14243 {
14244   REAL_VALUE_TYPE TWO32r;
14245   rtx fp_lo, fp_hi, x;
14246 
14247   fp_lo = gen_reg_rtx (DFmode);
14248   fp_hi = gen_reg_rtx (DFmode);
14249 
14250   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
14251 
14252   real_ldexp (&TWO32r, &dconst1, 32);
14253   x = const_double_from_real_value (TWO32r, DFmode);
14254   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
14255 
14256   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
14257 
14258   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
14259 			   0, OPTAB_DIRECT);
14260   if (x != target)
14261     emit_move_insn (target, x);
14262 }
14263 
14264 /* Convert an unsigned SImode value into a SFmode, using only SSE.
14265    For x86_32, -mfpmath=sse, !optimize_size only.  */
14266 void
14267 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
14268 {
14269   REAL_VALUE_TYPE ONE16r;
14270   rtx fp_hi, fp_lo, int_hi, int_lo, x;
14271 
14272   real_ldexp (&ONE16r, &dconst1, 16);
14273   x = const_double_from_real_value (ONE16r, SFmode);
14274   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
14275 				      NULL, 0, OPTAB_DIRECT);
14276   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
14277 				      NULL, 0, OPTAB_DIRECT);
14278   fp_hi = gen_reg_rtx (SFmode);
14279   fp_lo = gen_reg_rtx (SFmode);
14280   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
14281   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
14282   fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
14283 			       0, OPTAB_DIRECT);
14284   fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
14285 			       0, OPTAB_DIRECT);
14286   if (!rtx_equal_p (target, fp_hi))
14287     emit_move_insn (target, fp_hi);
14288 }
14289 
14290 /* A subroutine of ix86_build_signbit_mask.  If VECT is true,
14291    then replicate the value for all elements of the vector
14292    register.  */
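/* For example (illustrative only): ix86_build_const_vector (SFmode, false, x)
   returns the V4SF constant { x, 0, 0, 0 }, while VECT = true returns
   { x, x, x, x }; the DFmode cases build the analogous two-element
   V2DF vectors.  */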
14293 
14294 rtx
14295 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
14296 {
14297   rtvec v;
14298   switch (mode)
14299     {
14300     case SImode:
14301       gcc_assert (vect);
14302       v = gen_rtvec (4, value, value, value, value);
14303       return gen_rtx_CONST_VECTOR (V4SImode, v);
14304 
14305     case DImode:
14306       gcc_assert (vect);
14307       v = gen_rtvec (2, value, value);
14308       return gen_rtx_CONST_VECTOR (V2DImode, v);
14309 
14310     case SFmode:
14311       if (vect)
14312 	v = gen_rtvec (4, value, value, value, value);
14313       else
14314 	v = gen_rtvec (4, value, CONST0_RTX (SFmode),
14315 		       CONST0_RTX (SFmode), CONST0_RTX (SFmode));
14316       return gen_rtx_CONST_VECTOR (V4SFmode, v);
14317 
14318     case DFmode:
14319       if (vect)
14320 	v = gen_rtvec (2, value, value);
14321       else
14322 	v = gen_rtvec (2, value, CONST0_RTX (DFmode));
14323       return gen_rtx_CONST_VECTOR (V2DFmode, v);
14324 
14325     default:
14326       gcc_unreachable ();
14327     }
14328 }
14329 
14330 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
14331    and ix86_expand_int_vcond.  Create a mask for the sign bit in MODE
14332    for an SSE register.  If VECT is true, then replicate the mask for
14333    all elements of the vector register.  If INVERT is true, then create
14334    a mask excluding the sign bit.  */
14335 
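/* As a concrete illustration: for DFmode with VECT set and INVERT clear
   this produces a register holding the V2DF bit pattern
   { 0x8000000000000000, 0x8000000000000000 }, i.e. only the sign bits;
   with INVERT set the complementary 0x7fff... mask is built instead.  */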
14336 rtx
14337 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
14338 {
14339   enum machine_mode vec_mode, imode;
14340   HOST_WIDE_INT hi, lo;
14341   int shift = 63;
14342   rtx v;
14343   rtx mask;
14344 
14345   /* Find the sign bit, sign extended to 2*HWI.  */
14346   switch (mode)
14347     {
14348     case SImode:
14349     case SFmode:
14350       imode = SImode;
14351       vec_mode = (mode == SImode) ? V4SImode : V4SFmode;
14352       lo = 0x80000000, hi = lo < 0;
14353       break;
14354 
14355     case DImode:
14356     case DFmode:
14357       imode = DImode;
14358       vec_mode = (mode == DImode) ? V2DImode : V2DFmode;
14359       if (HOST_BITS_PER_WIDE_INT >= 64)
14360 	lo = (HOST_WIDE_INT)1 << shift, hi = -1;
14361       else
14362 	lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
14363       break;
14364 
14365     case TImode:
14366     case TFmode:
14367       vec_mode = VOIDmode;
14368       if (HOST_BITS_PER_WIDE_INT >= 64)
14369 	{
14370 	  imode = TImode;
14371 	  lo = 0, hi = (HOST_WIDE_INT)1 << shift;
14372 	}
14373       else
14374 	{
14375 	  rtvec vec;
14376 
14377 	  imode = DImode;
14378 	  lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
14379 
14380 	  if (invert)
14381 	    {
14382 	      lo = ~lo, hi = ~hi;
14383 	      v = constm1_rtx;
14384 	    }
14385 	  else
14386 	    v = const0_rtx;
14387 
14388 	  mask = immed_double_const (lo, hi, imode);
14389 
14390 	  vec = gen_rtvec (2, v, mask);
14391 	  v = gen_rtx_CONST_VECTOR (V2DImode, vec);
14392 	  v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
14393 
14394 	  return v;
14395 	}
14396      break;
14397 
14398     default:
14399       gcc_unreachable ();
14400     }
14401 
14402   if (invert)
14403     lo = ~lo, hi = ~hi;
14404 
14405   /* Force this value into the low part of a fp vector constant.  */
14406   mask = immed_double_const (lo, hi, imode);
14407   mask = gen_lowpart (mode, mask);
14408 
14409   if (vec_mode == VOIDmode)
14410     return force_reg (mode, mask);
14411 
14412   v = ix86_build_const_vector (mode, vect, mask);
14413   return force_reg (vec_mode, v);
14414 }
14415 
14416 /* Generate code for floating point ABS or NEG.  */
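/* In the SSE paths below this reduces to a bitwise operation against the
   mask built by ix86_build_signbit_mask: NEG is an XOR with the sign-bit
   mask and ABS is an AND with the inverted mask that clears the sign bit,
   so negating a V4SF value ends up as something like "xorps mask, %xmm0"
   (a hedged sketch; the concrete insn comes from the chosen pattern).  */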
14417 
14418 void
14419 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
14420 				rtx operands[])
14421 {
14422   rtx mask, set, use, clob, dst, src;
14423   bool use_sse = false;
14424   bool vector_mode = VECTOR_MODE_P (mode);
14425   enum machine_mode elt_mode = mode;
14426 
14427   if (vector_mode)
14428     {
14429       elt_mode = GET_MODE_INNER (mode);
14430       use_sse = true;
14431     }
14432   else if (mode == TFmode)
14433     use_sse = true;
14434   else if (TARGET_SSE_MATH)
14435     use_sse = SSE_FLOAT_MODE_P (mode);
14436 
14437   /* NEG and ABS performed with SSE use bitwise mask operations.
14438      Create the appropriate mask now.  */
14439   if (use_sse)
14440     mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS);
14441   else
14442     mask = NULL_RTX;
14443 
14444   dst = operands[0];
14445   src = operands[1];
14446 
14447   if (vector_mode)
14448     {
14449       set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask);
14450       set = gen_rtx_SET (VOIDmode, dst, set);
14451       emit_insn (set);
14452     }
14453   else
14454     {
14455       set = gen_rtx_fmt_e (code, mode, src);
14456       set = gen_rtx_SET (VOIDmode, dst, set);
14457       if (mask)
14458         {
14459           use = gen_rtx_USE (VOIDmode, mask);
14460           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
14461           emit_insn (gen_rtx_PARALLEL (VOIDmode,
14462 				       gen_rtvec (3, set, use, clob)));
14463         }
14464       else
14465 	emit_insn (set);
14466     }
14467 }
14468 
14469 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
14470 
14471 void
14472 ix86_expand_copysign (rtx operands[])
14473 {
14474   enum machine_mode mode;
14475   rtx dest, op0, op1, mask, nmask;
14476 
14477   dest = operands[0];
14478   op0 = operands[1];
14479   op1 = operands[2];
14480 
14481   mode = GET_MODE (dest);
14482 
14483   if (GET_CODE (op0) == CONST_DOUBLE)
14484     {
14485       rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
14486 
14487       if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
14488 	op0 = simplify_unary_operation (ABS, mode, op0, mode);
14489 
14490       if (mode == SFmode || mode == DFmode)
14491 	{
14492 	  enum machine_mode vmode;
14493 
14494 	  vmode = mode == SFmode ? V4SFmode : V2DFmode;
14495 
14496 	  if (op0 == CONST0_RTX (mode))
14497 	    op0 = CONST0_RTX (vmode);
14498 	  else
14499 	    {
14500 	      rtx v = ix86_build_const_vector (mode, false, op0);
14501 
14502 	      op0 = force_reg (vmode, v);
14503 	    }
14504 	}
14505       else if (op0 != CONST0_RTX (mode))
14506 	op0 = force_reg (mode, op0);
14507 
14508       mask = ix86_build_signbit_mask (mode, 0, 0);
14509 
14510       if (mode == SFmode)
14511 	copysign_insn = gen_copysignsf3_const;
14512       else if (mode == DFmode)
14513 	copysign_insn = gen_copysigndf3_const;
14514       else
14515 	copysign_insn = gen_copysigntf3_const;
14516 
14517       emit_insn (copysign_insn (dest, op0, op1, mask));
14518     }
14519   else
14520     {
14521       rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
14522 
14523       nmask = ix86_build_signbit_mask (mode, 0, 1);
14524       mask = ix86_build_signbit_mask (mode, 0, 0);
14525 
14526       if (mode == SFmode)
14527 	copysign_insn = gen_copysignsf3_var;
14528       else if (mode == DFmode)
14529 	copysign_insn = gen_copysigndf3_var;
14530       else
14531 	copysign_insn = gen_copysigntf3_var;
14532 
14533       emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
14534     }
14535 }
14536 
14537 /* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
14538    be a constant, and so has already been expanded into a vector constant.  */
14539 
14540 void
14541 ix86_split_copysign_const (rtx operands[])
14542 {
14543   enum machine_mode mode, vmode;
14544   rtx dest, op0, mask, x;
14545 
14546   dest = operands[0];
14547   op0 = operands[1];
14548   mask = operands[3];
14549 
14550   mode = GET_MODE (dest);
14551   vmode = GET_MODE (mask);
14552 
14553   dest = simplify_gen_subreg (vmode, dest, mode, 0);
14554   x = gen_rtx_AND (vmode, dest, mask);
14555   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
14556 
14557   if (op0 != CONST0_RTX (vmode))
14558     {
14559       x = gen_rtx_IOR (vmode, dest, op0);
14560       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
14561     }
14562 }
14563 
14564 /* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
14565    so we have to do two masks.  */
14566 
14567 void
14568 ix86_split_copysign_var (rtx operands[])
14569 {
14570   enum machine_mode mode, vmode;
14571   rtx dest, scratch, op0, op1, mask, nmask, x;
14572 
14573   dest = operands[0];
14574   scratch = operands[1];
14575   op0 = operands[2];
14576   op1 = operands[3];
14577   nmask = operands[4];
14578   mask = operands[5];
14579 
14580   mode = GET_MODE (dest);
14581   vmode = GET_MODE (mask);
14582 
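  /* Whichever alternative the register allocator chose, the result is
     (op0 & nmask) | (op1 & mask): the magnitude bits of OP0 merged with
     the sign bit of OP1.  */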
14583   if (rtx_equal_p (op0, op1))
14584     {
14585       /* Shouldn't happen often (it's useless, obviously), but when it does
14586 	 we'd generate incorrect code if we continue below.  */
14587       emit_move_insn (dest, op0);
14588       return;
14589     }
14590 
14591   if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
14592     {
14593       gcc_assert (REGNO (op1) == REGNO (scratch));
14594 
14595       x = gen_rtx_AND (vmode, scratch, mask);
14596       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
14597 
14598       dest = mask;
14599       op0 = simplify_gen_subreg (vmode, op0, mode, 0);
14600       x = gen_rtx_NOT (vmode, dest);
14601       x = gen_rtx_AND (vmode, x, op0);
14602       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
14603     }
14604   else
14605     {
14606       if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
14607 	{
14608 	  x = gen_rtx_AND (vmode, scratch, mask);
14609 	}
14610       else						/* alternative 2,4 */
14611 	{
14612           gcc_assert (REGNO (mask) == REGNO (scratch));
14613           op1 = simplify_gen_subreg (vmode, op1, mode, 0);
14614 	  x = gen_rtx_AND (vmode, scratch, op1);
14615 	}
14616       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
14617 
14618       if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
14619 	{
14620 	  dest = simplify_gen_subreg (vmode, op0, mode, 0);
14621 	  x = gen_rtx_AND (vmode, dest, nmask);
14622 	}
14623       else						/* alternative 3,4 */
14624 	{
14625           gcc_assert (REGNO (nmask) == REGNO (dest));
14626 	  dest = nmask;
14627 	  op0 = simplify_gen_subreg (vmode, op0, mode, 0);
14628 	  x = gen_rtx_AND (vmode, dest, op0);
14629 	}
14630       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
14631     }
14632 
14633   x = gen_rtx_IOR (vmode, dest, scratch);
14634   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
14635 }
14636 
14637 /* Return TRUE or FALSE depending on whether the first SET in INSN
14638    has source and destination with matching CC modes, and that the
14639    CC mode is at least as constrained as REQ_MODE.  */
14640 
14641 int
14642 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
14643 {
14644   rtx set;
14645   enum machine_mode set_mode;
14646 
14647   set = PATTERN (insn);
14648   if (GET_CODE (set) == PARALLEL)
14649     set = XVECEXP (set, 0, 0);
14650   gcc_assert (GET_CODE (set) == SET);
14651   gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
14652 
14653   set_mode = GET_MODE (SET_DEST (set));
14654   switch (set_mode)
14655     {
14656     case CCNOmode:
14657       if (req_mode != CCNOmode
14658 	  && (req_mode != CCmode
14659 	      || XEXP (SET_SRC (set), 1) != const0_rtx))
14660 	return 0;
14661       break;
14662     case CCmode:
14663       if (req_mode == CCGCmode)
14664 	return 0;
14665       /* FALLTHRU */
14666     case CCGCmode:
14667       if (req_mode == CCGOCmode || req_mode == CCNOmode)
14668 	return 0;
14669       /* FALLTHRU */
14670     case CCGOCmode:
14671       if (req_mode == CCZmode)
14672 	return 0;
14673       /* FALLTHRU */
14674     case CCZmode:
14675       break;
14676 
14677     case CCAmode:
14678     case CCCmode:
14679     case CCOmode:
14680     case CCSmode:
14681       if (set_mode != req_mode)
14682 	return 0;
14683       break;
14684 
14685     default:
14686       gcc_unreachable ();
14687     }
14688 
14689   return (GET_MODE (SET_SRC (set)) == set_mode);
14690 }
14691 
14692 /* Generate insn patterns to do an integer compare of OPERANDS.  */
14693 
14694 static rtx
14695 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
14696 {
14697   enum machine_mode cmpmode;
14698   rtx tmp, flags;
14699 
14700   cmpmode = SELECT_CC_MODE (code, op0, op1);
14701   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
14702 
14703   /* This is very simple, but making the interface the same as in the
14704      FP case makes the rest of the code easier.  */
14705   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
14706   emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
14707 
14708   /* Return the test that should be put into the flags user, i.e.
14709      the bcc, scc, or cmov instruction.  */
14710   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
14711 }
14712 
14713 /* Figure out whether to use ordered or unordered fp comparisons.
14714    Return the appropriate mode to use.  */
14715 
14716 enum machine_mode
14717 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
14718 {
14719   /* ??? In order to make all comparisons reversible, we do all comparisons
14720      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
14721      all forms of trapping and nontrapping comparisons, we can make inequality
14722      comparisons trapping again, since it results in better code when using
14723      FCOM based compares.  */
14724   return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
14725 }
14726 
14727 enum machine_mode
14728 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
14729 {
14730   enum machine_mode mode = GET_MODE (op0);
14731 
14732   if (SCALAR_FLOAT_MODE_P (mode))
14733     {
14734       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
14735       return ix86_fp_compare_mode (code);
14736     }
14737 
14738   switch (code)
14739     {
14740       /* Only zero flag is needed.  */
14741     case EQ:			/* ZF=0 */
14742     case NE:			/* ZF!=0 */
14743       return CCZmode;
14744       /* Codes needing carry flag.  */
14745     case GEU:			/* CF=0 */
14746     case LTU:			/* CF=1 */
14747       /* Detect overflow checks.  They need just the carry flag.  */
14748       if (GET_CODE (op0) == PLUS
14749 	  && rtx_equal_p (op1, XEXP (op0, 0)))
14750 	return CCCmode;
14751       else
14752 	return CCmode;
14753     case GTU:			/* CF=0 & ZF=0 */
14754     case LEU:			/* CF=1 | ZF=1 */
14755       /* Detect overflow checks.  They need just the carry flag.  */
14756       if (GET_CODE (op0) == MINUS
14757 	  && rtx_equal_p (op1, XEXP (op0, 0)))
14758 	return CCCmode;
14759       else
14760 	return CCmode;
14761       /* Codes possibly doable only with sign flag when
14762          comparing against zero.  */
14763     case GE:			/* SF=OF   or   SF=0 */
14764     case LT:			/* SF<>OF  or   SF=1 */
14765       if (op1 == const0_rtx)
14766 	return CCGOCmode;
14767       else
14768 	/* For other cases the carry flag is not required.  */
14769 	return CCGCmode;
14770       /* Codes doable only with the sign flag when comparing
14771          against zero, but we lack a jump instruction for them,
14772          so we need to use relational tests against overflow,
14773          which thus needs to be zero.  */
14774     case GT:			/* ZF=0 & SF=OF */
14775     case LE:			/* ZF=1 | SF<>OF */
14776       if (op1 == const0_rtx)
14777 	return CCNOmode;
14778       else
14779 	return CCGCmode;
14780       /* strcmp pattern do (use flags) and combine may ask us for proper
14781       /* The strcmp pattern does (use flags) and combine may ask us for
14782 	 the proper mode.  */
14783       return CCmode;
14784     default:
14785       gcc_unreachable ();
14786     }
14787 }
14788 
14789 /* Return the fixed registers used for condition codes.  */
14790 
14791 static bool
14792 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
14793 {
14794   *p1 = FLAGS_REG;
14795   *p2 = FPSR_REG;
14796   return true;
14797 }
14798 
14799 /* If two condition code modes are compatible, return a condition code
14800    mode which is compatible with both.  Otherwise, return
14801    VOIDmode.  */
14802 
14803 static enum machine_mode
14804 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
14805 {
14806   if (m1 == m2)
14807     return m1;
14808 
14809   if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
14810     return VOIDmode;
14811 
14812   if ((m1 == CCGCmode && m2 == CCGOCmode)
14813       || (m1 == CCGOCmode && m2 == CCGCmode))
14814     return CCGCmode;
14815 
14816   switch (m1)
14817     {
14818     default:
14819       gcc_unreachable ();
14820 
14821     case CCmode:
14822     case CCGCmode:
14823     case CCGOCmode:
14824     case CCNOmode:
14825     case CCAmode:
14826     case CCCmode:
14827     case CCOmode:
14828     case CCSmode:
14829     case CCZmode:
14830       switch (m2)
14831 	{
14832 	default:
14833 	  return VOIDmode;
14834 
14835 	case CCmode:
14836 	case CCGCmode:
14837 	case CCGOCmode:
14838 	case CCNOmode:
14839 	case CCAmode:
14840 	case CCCmode:
14841 	case CCOmode:
14842 	case CCSmode:
14843 	case CCZmode:
14844 	  return CCmode;
14845 	}
14846 
14847     case CCFPmode:
14848     case CCFPUmode:
14849       /* These are only compatible with themselves, which we already
14850 	 checked above.  */
14851       return VOIDmode;
14852     }
14853 }
14854 
14855 
14856 /* Return a comparison we can do that is equivalent to
14857    swap_condition (code), apart possibly from orderedness.
14858    But never change orderedness if TARGET_IEEE_FP, returning
14859    UNKNOWN in that case if necessary.  */
14860 
14861 static enum rtx_code
14862 ix86_fp_swap_condition (enum rtx_code code)
14863 {
14864   switch (code)
14865     {
14866     case GT:                   /* GTU - CF=0 & ZF=0 */
14867       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
14868     case GE:                   /* GEU - CF=0 */
14869       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
14870     case UNLT:                 /* LTU - CF=1 */
14871       return TARGET_IEEE_FP ? UNKNOWN : GT;
14872     case UNLE:                 /* LEU - CF=1 | ZF=1 */
14873       return TARGET_IEEE_FP ? UNKNOWN : GE;
14874     default:
14875       return swap_condition (code);
14876     }
14877 }
14878 
14879 /* Return the cost of comparison CODE using the best strategy for performance.
14880    All following functions use the number of instructions as a cost metric.
14881    In the future this should be tweaked to compute bytes for optimize_size and
14882    take into account the performance of various instructions on various CPUs.  */
14883 
14884 static int
14885 ix86_fp_comparison_cost (enum rtx_code code)
14886 {
14887   int arith_cost;
14888 
14889   /* The cost of code using bit-twiddling on %ah.  */
14890   switch (code)
14891     {
14892     case UNLE:
14893     case UNLT:
14894     case LTGT:
14895     case GT:
14896     case GE:
14897     case UNORDERED:
14898     case ORDERED:
14899     case UNEQ:
14900       arith_cost = 4;
14901       break;
14902     case LT:
14903     case NE:
14904     case EQ:
14905     case UNGE:
14906       arith_cost = TARGET_IEEE_FP ? 5 : 4;
14907       break;
14908     case LE:
14909     case UNGT:
14910       arith_cost = TARGET_IEEE_FP ? 6 : 4;
14911       break;
14912     default:
14913       gcc_unreachable ();
14914     }
14915 
14916   switch (ix86_fp_comparison_strategy (code))
14917     {
14918     case IX86_FPCMP_COMI:
14919       return arith_cost > 4 ? 3 : 2;
14920     case IX86_FPCMP_SAHF:
14921       return arith_cost > 4 ? 4 : 3;
14922     default:
14923       return arith_cost;
14924     }
14925 }
14926 
14927 /* Return strategy to use for floating-point.  We assume that fcomi is always
14928 /* Return the strategy to use for a floating-point comparison.  We assume
14929    fcomi is always preferable where available, since that is also true when
14930    looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
14931 enum ix86_fpcmp_strategy
14932 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
14933 {
14934   /* Do fcomi/sahf based test when profitable.  */
14935 
14936   if (TARGET_CMOVE)
14937     return IX86_FPCMP_COMI;
14938 
14939   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
14940     return IX86_FPCMP_SAHF;
14941 
14942   return IX86_FPCMP_ARITH;
14943 }
14944 
14945 /* Swap, force into registers, or otherwise massage the two operands
14946    to a fp comparison.  The operands are updated in place; the new
14947    comparison code is returned.  */
14948 
14949 static enum rtx_code
14950 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
14951 {
14952   enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
14953   rtx op0 = *pop0, op1 = *pop1;
14954   enum machine_mode op_mode = GET_MODE (op0);
14955   int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
14956 
14957   /* All of the unordered compare instructions only work on registers.
14958      The same is true of the fcomi compare instructions.  The XFmode
14959      compare instructions require registers except when comparing
14960      against zero or when converting operand 1 from fixed point to
14961      floating point.  */
14962 
14963   if (!is_sse
14964       && (fpcmp_mode == CCFPUmode
14965 	  || (op_mode == XFmode
14966 	      && ! (standard_80387_constant_p (op0) == 1
14967 		    || standard_80387_constant_p (op1) == 1)
14968 	      && GET_CODE (op1) != FLOAT)
14969 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
14970     {
14971       op0 = force_reg (op_mode, op0);
14972       op1 = force_reg (op_mode, op1);
14973     }
14974   else
14975     {
14976       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
14977 	 things around if they appear profitable, otherwise force op0
14978 	 into a register.  */
14979 
14980       if (standard_80387_constant_p (op0) == 0
14981 	  || (MEM_P (op0)
14982 	      && ! (standard_80387_constant_p (op1) == 0
14983 		    || MEM_P (op1))))
14984 	{
14985 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
14986 	  if (new_code != UNKNOWN)
14987 	    {
14988 	      rtx tmp;
14989 	      tmp = op0, op0 = op1, op1 = tmp;
14990 	      code = new_code;
14991 	    }
14992 	}
14993 
14994       if (!REG_P (op0))
14995 	op0 = force_reg (op_mode, op0);
14996 
14997       if (CONSTANT_P (op1))
14998 	{
14999 	  int tmp = standard_80387_constant_p (op1);
15000 	  if (tmp == 0)
15001 	    op1 = validize_mem (force_const_mem (op_mode, op1));
15002 	  else if (tmp == 1)
15003 	    {
15004 	      if (TARGET_CMOVE)
15005 		op1 = force_reg (op_mode, op1);
15006 	    }
15007 	  else
15008 	    op1 = force_reg (op_mode, op1);
15009 	}
15010     }
15011 
15012   /* Try to rearrange the comparison to make it cheaper.  */
15013   if (ix86_fp_comparison_cost (code)
15014       > ix86_fp_comparison_cost (swap_condition (code))
15015       && (REG_P (op1) || can_create_pseudo_p ()))
15016     {
15017       rtx tmp;
15018       tmp = op0, op0 = op1, op1 = tmp;
15019       code = swap_condition (code);
15020       if (!REG_P (op0))
15021 	op0 = force_reg (op_mode, op0);
15022     }
15023 
15024   *pop0 = op0;
15025   *pop1 = op1;
15026   return code;
15027 }
15028 
15029 /* Convert the comparison codes we use to represent an FP comparison into
15030    the integer code that will result in a proper branch.  Return UNKNOWN if
15031    no such code is available.  */
15032 
15033 enum rtx_code
15034 ix86_fp_compare_code_to_integer (enum rtx_code code)
15035 {
15036   switch (code)
15037     {
15038     case GT:
15039       return GTU;
15040     case GE:
15041       return GEU;
15042     case ORDERED:
15043     case UNORDERED:
15044       return code;
15045       break;
15046     case UNEQ:
15047       return EQ;
15048       break;
15049     case UNLT:
15050       return LTU;
15051       break;
15052     case UNLE:
15053       return LEU;
15054       break;
15055     case LTGT:
15056       return NE;
15057       break;
15058     default:
15059       return UNKNOWN;
15060     }
15061 }
15062 
15063 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
15064 
15065 static rtx
15066 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
15067 {
15068   enum machine_mode fpcmp_mode, intcmp_mode;
15069   rtx tmp, tmp2;
15070 
15071   fpcmp_mode = ix86_fp_compare_mode (code);
15072   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
15073 
15074   /* Do fcomi/sahf based test when profitable.  */
15075   switch (ix86_fp_comparison_strategy (code))
15076     {
15077     case IX86_FPCMP_COMI:
15078       intcmp_mode = fpcmp_mode;
15079       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
15080       tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
15081 			 tmp);
15082       emit_insn (tmp);
15083       break;
15084 
15085     case IX86_FPCMP_SAHF:
15086       intcmp_mode = fpcmp_mode;
15087       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
15088       tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
15089 			 tmp);
15090 
15091       if (!scratch)
15092 	scratch = gen_reg_rtx (HImode);
15093       tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
15094       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
15095       break;
15096 
15097     case IX86_FPCMP_ARITH:
15098       /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first.  */
15099       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
15100       tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
15101       if (!scratch)
15102 	scratch = gen_reg_rtx (HImode);
15103       emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
15104 
15105       /* In the unordered case, we have to check C2 for NaNs, which
15106 	 doesn't happen to work out to anything nice combination-wise.
15107 	 So do some bit twiddling on the value we've got in AH to come
15108 	 up with an appropriate set of condition codes.  */
15109 
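      /* The fnstsw result is examined through its high byte (AH when the
	 scratch is %ax), where C0 = 0x01, C2 = 0x04 and C3 = 0x40; an
	 unordered compare sets all three, which is where the 0x45 masks
	 below come from.  */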
15110       intcmp_mode = CCNOmode;
15111       switch (code)
15112 	{
15113 	case GT:
15114 	case UNGT:
15115 	  if (code == GT || !TARGET_IEEE_FP)
15116 	    {
15117 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
15118 	      code = EQ;
15119 	    }
15120 	  else
15121 	    {
15122 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
15123 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
15124 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
15125 	      intcmp_mode = CCmode;
15126 	      code = GEU;
15127 	    }
15128 	  break;
15129 	case LT:
15130 	case UNLT:
15131 	  if (code == LT && TARGET_IEEE_FP)
15132 	    {
15133 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
15134 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
15135 	      intcmp_mode = CCmode;
15136 	      code = EQ;
15137 	    }
15138 	  else
15139 	    {
15140 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
15141 	      code = NE;
15142 	    }
15143 	  break;
15144 	case GE:
15145 	case UNGE:
15146 	  if (code == GE || !TARGET_IEEE_FP)
15147 	    {
15148 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
15149 	      code = EQ;
15150 	    }
15151 	  else
15152 	    {
15153 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
15154 	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
15155 	      code = NE;
15156 	    }
15157 	  break;
15158 	case LE:
15159 	case UNLE:
15160 	  if (code == LE && TARGET_IEEE_FP)
15161 	    {
15162 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
15163 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
15164 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
15165 	      intcmp_mode = CCmode;
15166 	      code = LTU;
15167 	    }
15168 	  else
15169 	    {
15170 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
15171 	      code = NE;
15172 	    }
15173 	  break;
15174 	case EQ:
15175 	case UNEQ:
15176 	  if (code == EQ && TARGET_IEEE_FP)
15177 	    {
15178 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
15179 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
15180 	      intcmp_mode = CCmode;
15181 	      code = EQ;
15182 	    }
15183 	  else
15184 	    {
15185 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
15186 	      code = NE;
15187 	    }
15188 	  break;
15189 	case NE:
15190 	case LTGT:
15191 	  if (code == NE && TARGET_IEEE_FP)
15192 	    {
15193 	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
15194 	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
15195 					     GEN_INT (0x40)));
15196 	      code = NE;
15197 	    }
15198 	  else
15199 	    {
15200 	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
15201 	      code = EQ;
15202 	    }
15203 	  break;
15204 
15205 	case UNORDERED:
15206 	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
15207 	  code = NE;
15208 	  break;
15209 	case ORDERED:
15210 	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
15211 	  code = EQ;
15212 	  break;
15213 
15214 	default:
15215 	  gcc_unreachable ();
15216 	}
15217 	break;
15218 
15219     default:
15220       gcc_unreachable();
15221     }
15222 
15223   /* Return the test that should be put into the flags user, i.e.
15224      the bcc, scc, or cmov instruction.  */
15225   return gen_rtx_fmt_ee (code, VOIDmode,
15226 			 gen_rtx_REG (intcmp_mode, FLAGS_REG),
15227 			 const0_rtx);
15228 }
15229 
15230 rtx
15231 ix86_expand_compare (enum rtx_code code)
15232 {
15233   rtx op0, op1, ret;
15234   op0 = ix86_compare_op0;
15235   op1 = ix86_compare_op1;
15236 
15237   if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_CC)
15238     ret = gen_rtx_fmt_ee (code, VOIDmode, ix86_compare_op0, ix86_compare_op1);
15239 
15240   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
15241     {
15242       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
15243       ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
15244     }
15245   else
15246     ret = ix86_expand_int_compare (code, op0, op1);
15247 
15248   return ret;
15249 }
15250 
15251 void
15252 ix86_expand_branch (enum rtx_code code, rtx label)
15253 {
15254   rtx tmp;
15255 
15256   switch (GET_MODE (ix86_compare_op0))
15257     {
15258     case SFmode:
15259     case DFmode:
15260     case XFmode:
15261     case QImode:
15262     case HImode:
15263     case SImode:
15264       simple:
15265       tmp = ix86_expand_compare (code);
15266       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15267 				  gen_rtx_LABEL_REF (VOIDmode, label),
15268 				  pc_rtx);
15269       emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
15270       return;
15271 
15272     case DImode:
15273       if (TARGET_64BIT)
15274 	goto simple;
15275     case TImode:
15276       /* Expand DImode branch into multiple compare+branch.  */
15277       {
15278 	rtx lo[2], hi[2], label2;
15279 	enum rtx_code code1, code2, code3;
15280 	enum machine_mode submode;
15281 
15282 	if (CONSTANT_P (ix86_compare_op0) && ! CONSTANT_P (ix86_compare_op1))
15283 	  {
15284 	    tmp = ix86_compare_op0;
15285 	    ix86_compare_op0 = ix86_compare_op1;
15286 	    ix86_compare_op1 = tmp;
15287 	    code = swap_condition (code);
15288 	  }
15289 	if (GET_MODE (ix86_compare_op0) == DImode)
15290 	  {
15291 	    split_di (&ix86_compare_op0, 1, lo+0, hi+0);
15292 	    split_di (&ix86_compare_op1, 1, lo+1, hi+1);
15293 	    submode = SImode;
15294 	  }
15295 	else
15296 	  {
15297 	    split_ti (&ix86_compare_op0, 1, lo+0, hi+0);
15298 	    split_ti (&ix86_compare_op1, 1, lo+1, hi+1);
15299 	    submode = DImode;
15300 	  }
15301 
15302 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
15303 	   avoid two branches.  This costs one extra insn, so disable when
15304 	   optimizing for size.  */
15305 
15306 	if ((code == EQ || code == NE)
15307 	    && (!optimize_insn_for_size_p ()
15308 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
15309 	  {
15310 	    rtx xor0, xor1;
15311 
15312 	    xor1 = hi[0];
15313 	    if (hi[1] != const0_rtx)
15314 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
15315 				   NULL_RTX, 0, OPTAB_WIDEN);
15316 
15317 	    xor0 = lo[0];
15318 	    if (lo[1] != const0_rtx)
15319 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
15320 				   NULL_RTX, 0, OPTAB_WIDEN);
15321 
15322 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
15323 				NULL_RTX, 0, OPTAB_WIDEN);
15324 
15325 	    ix86_compare_op0 = tmp;
15326 	    ix86_compare_op1 = const0_rtx;
15327 	    ix86_expand_branch (code, label);
15328 	    return;
15329 	  }
15330 
15331 	/* Otherwise, if we are doing less-than or greater-or-equal-than,
15332 	   op1 is a constant, and its low word is zero, then we can just
15333 	   examine the high word.  Similarly for a low word of -1 and
15334 	   less-or-equal-than or greater-than.  */
15335 
15336 	if (CONST_INT_P (hi[1]))
15337 	  switch (code)
15338 	    {
15339 	    case LT: case LTU: case GE: case GEU:
15340 	      if (lo[1] == const0_rtx)
15341 		{
15342 		  ix86_compare_op0 = hi[0];
15343 		  ix86_compare_op1 = hi[1];
15344 		  ix86_expand_branch (code, label);
15345 		  return;
15346 		}
15347 	      break;
15348 	    case LE: case LEU: case GT: case GTU:
15349 	      if (lo[1] == constm1_rtx)
15350 		{
15351 		  ix86_compare_op0 = hi[0];
15352 		  ix86_compare_op1 = hi[1];
15353 		  ix86_expand_branch (code, label);
15354 		  return;
15355 		}
15356 	      break;
15357 	    default:
15358 	      break;
15359 	    }
15360 
15361 	/* Otherwise, we need two or three jumps.  */
15362 
15363 	label2 = gen_label_rtx ();
15364 
15365 	code1 = code;
15366 	code2 = swap_condition (code);
15367 	code3 = unsigned_condition (code);
15368 
15369 	switch (code)
15370 	  {
15371 	  case LT: case GT: case LTU: case GTU:
15372 	    break;
15373 
15374 	  case LE:   code1 = LT;  code2 = GT;  break;
15375 	  case GE:   code1 = GT;  code2 = LT;  break;
15376 	  case LEU:  code1 = LTU; code2 = GTU; break;
15377 	  case GEU:  code1 = GTU; code2 = LTU; break;
15378 
15379 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
15380 	  case NE:   code2 = UNKNOWN; break;
15381 
15382 	  default:
15383 	    gcc_unreachable ();
15384 	  }
15385 
15386 	/*
15387 	 * a < b =>
15388 	 *    if (hi(a) < hi(b)) goto true;
15389 	 *    if (hi(a) > hi(b)) goto false;
15390 	 *    if (lo(a) < lo(b)) goto true;
15391 	 *  false:
15392 	 */
15393 
15394 	ix86_compare_op0 = hi[0];
15395 	ix86_compare_op1 = hi[1];
15396 
15397 	if (code1 != UNKNOWN)
15398 	  ix86_expand_branch (code1, label);
15399 	if (code2 != UNKNOWN)
15400 	  ix86_expand_branch (code2, label2);
15401 
15402 	ix86_compare_op0 = lo[0];
15403 	ix86_compare_op1 = lo[1];
15404 	ix86_expand_branch (code3, label);
15405 
15406 	if (code2 != UNKNOWN)
15407 	  emit_label (label2);
15408 	return;
15409       }
15410 
15411     default:
15412       /* If we have already emitted a compare insn, go straight to simple.
15413          ix86_expand_compare won't emit anything if ix86_compare_emitted
15414          is non-NULL.  */
15415       gcc_assert (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_CC);
15416       goto simple;
15417     }
15418 }
15419 
15420 /* Split branch based on floating point condition.  */
15421 void
15422 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
15423 		      rtx target1, rtx target2, rtx tmp, rtx pushed)
15424 {
15425   rtx condition;
15426   rtx i;
15427 
15428   if (target2 != pc_rtx)
15429     {
15430       rtx tmp = target2;
15431       code = reverse_condition_maybe_unordered (code);
15432       target2 = target1;
15433       target1 = tmp;
15434     }
15435 
15436   condition = ix86_expand_fp_compare (code, op1, op2,
15437 				      tmp);
15438 
15439   /* Remove pushed operand from stack.  */
15440   if (pushed)
15441     ix86_free_from_memory (GET_MODE (pushed));
15442 
15443   i = emit_jump_insn (gen_rtx_SET
15444 		      (VOIDmode, pc_rtx,
15445 		       gen_rtx_IF_THEN_ELSE (VOIDmode,
15446 					     condition, target1, target2)));
15447   if (split_branch_probability >= 0)
15448     add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
15449 }
15450 
15451 void
15452 ix86_expand_setcc (enum rtx_code code, rtx dest)
15453 {
15454   rtx ret;
15455 
15456   gcc_assert (GET_MODE (dest) == QImode);
15457 
15458   ret = ix86_expand_compare (code);
15459   PUT_MODE (ret, QImode);
15460   emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
15461 }
15462 
15463 /* Expand a comparison setting or clearing the carry flag.  Return true when
15464    successful, and set *POP to the comparison for the operation.  */
15465 static bool
15466 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
15467 {
15468   enum machine_mode mode =
15469     GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
15470 
15471   /* Do not handle DImode compares that go through special path.  */
15472   if (mode == (TARGET_64BIT ? TImode : DImode))
15473     return false;
15474 
15475   if (SCALAR_FLOAT_MODE_P (mode))
15476     {
15477       rtx compare_op, compare_seq;
15478 
15479       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
15480 
15481       /* Shortcut:  the following common codes never translate
15482 	 into carry flag compares.  */
15483       if (code == EQ || code == NE || code == UNEQ || code == LTGT
15484 	  || code == ORDERED || code == UNORDERED)
15485 	return false;
15486 
15487       /* These comparisons require the zero flag; swap the operands so they won't.  */
15488       if ((code == GT || code == UNLE || code == LE || code == UNGT)
15489 	  && !TARGET_IEEE_FP)
15490 	{
15491 	  rtx tmp = op0;
15492 	  op0 = op1;
15493 	  op1 = tmp;
15494 	  code = swap_condition (code);
15495 	}
15496 
15497       /* Try to expand the comparison and verify that we end up with a
15498 	 carry flag based comparison.  This fails only when we decide to
15499 	 expand the comparison using arithmetic, which is not a common
15500 	 scenario.  */
15501       start_sequence ();
15502       compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
15503       compare_seq = get_insns ();
15504       end_sequence ();
15505 
15506       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
15507 	  || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
15508         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
15509       else
15510 	code = GET_CODE (compare_op);
15511 
15512       if (code != LTU && code != GEU)
15513 	return false;
15514 
15515       emit_insn (compare_seq);
15516       *pop = compare_op;
15517       return true;
15518     }
15519 
15520   if (!INTEGRAL_MODE_P (mode))
15521     return false;
15522 
15523   switch (code)
15524     {
15525     case LTU:
15526     case GEU:
15527       break;
15528 
15529     /* Convert a==0 into (unsigned)a<1.  */
15530     case EQ:
15531     case NE:
15532       if (op1 != const0_rtx)
15533 	return false;
15534       op1 = const1_rtx;
15535       code = (code == EQ ? LTU : GEU);
15536       break;
15537 
15538     /* Convert a>b into b<a or a>=b+1.  */
15539     case GTU:
15540     case LEU:
15541       if (CONST_INT_P (op1))
15542 	{
15543 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
15544 	  /* Bail out on overflow.  We could still swap the operands, but
15545 	     that would force loading the constant into a register.  */
15546 	  if (op1 == const0_rtx
15547 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
15548 	    return false;
15549 	  code = (code == GTU ? GEU : LTU);
15550 	}
15551       else
15552 	{
15553 	  rtx tmp = op1;
15554 	  op1 = op0;
15555 	  op0 = tmp;
15556 	  code = (code == GTU ? LTU : GEU);
15557 	}
15558       break;
15559 
15560     /* Convert a>=0 into (unsigned)a<0x80000000.  */
15561     case LT:
15562     case GE:
15563       if (mode == DImode || op1 != const0_rtx)
15564 	return false;
15565       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
15566       code = (code == LT ? GEU : LTU);
15567       break;
15568     case LE:
15569     case GT:
15570       if (mode == DImode || op1 != constm1_rtx)
15571 	return false;
15572       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
15573       code = (code == LE ? GEU : LTU);
15574       break;
15575 
15576     default:
15577       return false;
15578     }
15579   /* Swapping operands may cause a constant to appear as the first operand.  */
15580   if (!nonimmediate_operand (op0, VOIDmode))
15581     {
15582       if (!can_create_pseudo_p ())
15583 	return false;
15584       op0 = force_reg (mode, op0);
15585     }
15586   ix86_compare_op0 = op0;
15587   ix86_compare_op1 = op1;
15588   *pop = ix86_expand_compare (code);
15589   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
15590   return true;
15591 }
15592 
15593 int
15594 ix86_expand_int_movcc (rtx operands[])
15595 {
15596   enum rtx_code code = GET_CODE (operands[1]), compare_code;
15597   rtx compare_seq, compare_op;
15598   enum machine_mode mode = GET_MODE (operands[0]);
15599   bool sign_bit_compare_p = false;
15600 
15601   start_sequence ();
15602   ix86_compare_op0 = XEXP (operands[1], 0);
15603   ix86_compare_op1 = XEXP (operands[1], 1);
15604   compare_op = ix86_expand_compare (code);
15605   compare_seq = get_insns ();
15606   end_sequence ();
15607 
15608   compare_code = GET_CODE (compare_op);
15609 
15610   if ((ix86_compare_op1 == const0_rtx && (code == GE || code == LT))
15611       || (ix86_compare_op1 == constm1_rtx && (code == GT || code == LE)))
15612     sign_bit_compare_p = true;
15613 
15614   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
15615      HImode insns, we'd be swallowed in word prefix ops.  */
15616 
15617   if ((mode != HImode || TARGET_FAST_PREFIX)
15618       && (mode != (TARGET_64BIT ? TImode : DImode))
15619       && CONST_INT_P (operands[2])
15620       && CONST_INT_P (operands[3]))
15621     {
15622       rtx out = operands[0];
15623       HOST_WIDE_INT ct = INTVAL (operands[2]);
15624       HOST_WIDE_INT cf = INTVAL (operands[3]);
15625       HOST_WIDE_INT diff;
15626 
15627       diff = ct - cf;
15628       /*  Sign bit compares are better done using shifts than by using
15629 	  sbb.  */
15630       if (sign_bit_compare_p
15631 	  || ix86_expand_carry_flag_compare (code, ix86_compare_op0,
15632 					     ix86_compare_op1, &compare_op))
15633 	{
15634 	  /* Detect overlap between destination and compare sources.  */
15635 	  rtx tmp = out;
15636 
15637           if (!sign_bit_compare_p)
15638 	    {
15639 	      rtx flags;
15640 	      bool fpcmp = false;
15641 
15642 	      compare_code = GET_CODE (compare_op);
15643 
15644 	      flags = XEXP (compare_op, 0);
15645 
15646 	      if (GET_MODE (flags) == CCFPmode
15647 		  || GET_MODE (flags) == CCFPUmode)
15648 		{
15649 		  fpcmp = true;
15650 		  compare_code
15651 		    = ix86_fp_compare_code_to_integer (compare_code);
15652 		}
15653 
15654 	      /* To simplify rest of code, restrict to the GEU case.  */
15655 	      if (compare_code == LTU)
15656 		{
15657 		  HOST_WIDE_INT tmp = ct;
15658 		  ct = cf;
15659 		  cf = tmp;
15660 		  compare_code = reverse_condition (compare_code);
15661 		  code = reverse_condition (code);
15662 		}
15663 	      else
15664 		{
15665 		  if (fpcmp)
15666 		    PUT_CODE (compare_op,
15667 			      reverse_condition_maybe_unordered
15668 			        (GET_CODE (compare_op)));
15669 		  else
15670 		    PUT_CODE (compare_op,
15671 			      reverse_condition (GET_CODE (compare_op)));
15672 		}
15673 	      diff = ct - cf;
15674 
15675 	      if (reg_overlap_mentioned_p (out, ix86_compare_op0)
15676 		  || reg_overlap_mentioned_p (out, ix86_compare_op1))
15677 		tmp = gen_reg_rtx (mode);
15678 
15679 	      if (mode == DImode)
15680 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
15681 	      else
15682 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
15683 						 flags, compare_op));
15684 	    }
15685 	  else
15686 	    {
15687 	      if (code == GT || code == GE)
15688 		code = reverse_condition (code);
15689 	      else
15690 		{
15691 		  HOST_WIDE_INT tmp = ct;
15692 		  ct = cf;
15693 		  cf = tmp;
15694 		  diff = ct - cf;
15695 		}
15696 	      tmp = emit_store_flag (tmp, code, ix86_compare_op0,
15697 				     ix86_compare_op1, VOIDmode, 0, -1);
15698 	    }
15699 
15700 	  if (diff == 1)
15701 	    {
15702 	      /*
15703 	       * cmpl op0,op1
15704 	       * sbbl dest,dest
15705 	       * [addl dest, ct]
15706 	       *
15707 	       * Size 5 - 8.
15708 	       */
15709 	      if (ct)
15710 		tmp = expand_simple_binop (mode, PLUS,
15711 					   tmp, GEN_INT (ct),
15712 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
15713 	    }
15714 	  else if (cf == -1)
15715 	    {
15716 	      /*
15717 	       * cmpl op0,op1
15718 	       * sbbl dest,dest
15719 	       * orl $ct, dest
15720 	       *
15721 	       * Size 8.
15722 	       */
15723 	      tmp = expand_simple_binop (mode, IOR,
15724 					 tmp, GEN_INT (ct),
15725 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
15726 	    }
15727 	  else if (diff == -1 && ct)
15728 	    {
15729 	      /*
15730 	       * cmpl op0,op1
15731 	       * sbbl dest,dest
15732 	       * notl dest
15733 	       * [addl dest, cf]
15734 	       *
15735 	       * Size 8 - 11.
15736 	       */
15737 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
15738 	      if (cf)
15739 		tmp = expand_simple_binop (mode, PLUS,
15740 					   copy_rtx (tmp), GEN_INT (cf),
15741 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
15742 	    }
15743 	  else
15744 	    {
15745 	      /*
15746 	       * cmpl op0,op1
15747 	       * sbbl dest,dest
15748 	       * [notl dest]
15749 	       * andl cf - ct, dest
15750 	       * [addl dest, ct]
15751 	       *
15752 	       * Size 8 - 11.
15753 	       */
15754 
15755 	      if (cf == 0)
15756 		{
15757 		  cf = ct;
15758 		  ct = 0;
15759 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
15760 		}
15761 
15762 	      tmp = expand_simple_binop (mode, AND,
15763 					 copy_rtx (tmp),
15764 					 gen_int_mode (cf - ct, mode),
15765 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
15766 	      if (ct)
15767 		tmp = expand_simple_binop (mode, PLUS,
15768 					   copy_rtx (tmp), GEN_INT (ct),
15769 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
15770 	    }
15771 
15772 	  if (!rtx_equal_p (tmp, out))
15773 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
15774 
15775 	  return 1; /* DONE */
15776 	}
15777 
15778       if (diff < 0)
15779 	{
15780 	  enum machine_mode cmp_mode = GET_MODE (ix86_compare_op0);
15781 
15782 	  HOST_WIDE_INT tmp;
15783 	  tmp = ct, ct = cf, cf = tmp;
15784 	  diff = -diff;
15785 
15786 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
15787 	    {
15788 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
15789 
15790 	      /* We may be reversing an unordered compare to a normal compare,
15791 		 which is not valid in general (we may convert a non-trapping
15792 		 condition to a trapping one); however, on i386 we currently
15793 		 emit all comparisons unordered.  */
15794 	      compare_code = reverse_condition_maybe_unordered (compare_code);
15795 	      code = reverse_condition_maybe_unordered (code);
15796 	    }
15797 	  else
15798 	    {
15799 	      compare_code = reverse_condition (compare_code);
15800 	      code = reverse_condition (code);
15801 	    }
15802 	}
15803 
15804       compare_code = UNKNOWN;
15805       if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_INT
15806 	  && CONST_INT_P (ix86_compare_op1))
15807 	{
15808 	  if (ix86_compare_op1 == const0_rtx
15809 	      && (code == LT || code == GE))
15810 	    compare_code = code;
15811 	  else if (ix86_compare_op1 == constm1_rtx)
15812 	    {
15813 	      if (code == LE)
15814 		compare_code = LT;
15815 	      else if (code == GT)
15816 		compare_code = GE;
15817 	    }
15818 	}
15819 
15820       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
15821       if (compare_code != UNKNOWN
15822 	  && GET_MODE (ix86_compare_op0) == GET_MODE (out)
15823 	  && (cf == -1 || ct == -1))
15824 	{
15825 	  /* If the lea code below could be used, only optimize
15826 	     if it results in a 2-insn sequence.  */
15827 
15828 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
15829 		 || diff == 3 || diff == 5 || diff == 9)
15830 	      || (compare_code == LT && ct == -1)
15831 	      || (compare_code == GE && cf == -1))
15832 	    {
15833 	      /*
15834 	       * notl op1	(if necessary)
15835 	       * sarl $31, op1
15836 	       * orl cf, op1
15837 	       */
15838 	      if (ct != -1)
15839 		{
15840 		  cf = ct;
15841 		  ct = -1;
15842 		  code = reverse_condition (code);
15843 		}
15844 
15845 	      out = emit_store_flag (out, code, ix86_compare_op0,
15846 				     ix86_compare_op1, VOIDmode, 0, -1);
15847 
15848 	      out = expand_simple_binop (mode, IOR,
15849 					 out, GEN_INT (cf),
15850 					 out, 1, OPTAB_DIRECT);
15851 	      if (out != operands[0])
15852 		emit_move_insn (operands[0], out);
15853 
15854 	      return 1; /* DONE */
15855 	    }
15856 	}
15857 
15858 
15859       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
15860 	   || diff == 3 || diff == 5 || diff == 9)
15861 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
15862 	  && (mode != DImode
15863 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
15864 	{
15865 	  /*
15866 	   * xorl dest,dest
15867 	   * cmpl op1,op2
15868 	   * setcc dest
15869 	   * lea cf(dest*(ct-cf)),dest
15870 	   *
15871 	   * Size 14.
15872 	   *
15873 	   * This also catches the degenerate setcc-only case.
15874 	   */
15875 
15876 	  rtx tmp;
15877 	  int nops;
15878 
15879 	  out = emit_store_flag (out, code, ix86_compare_op0,
15880 				 ix86_compare_op1, VOIDmode, 0, 1);
15881 
15882 	  nops = 0;
15883 	  /* On x86_64 the lea instruction operates on Pmode, so we need
15884 	     to get the arithmetic done in the proper mode to match.  */
15885 	  if (diff == 1)
15886 	    tmp = copy_rtx (out);
15887 	  else
15888 	    {
15889 	      rtx out1;
15890 	      out1 = copy_rtx (out);
15891 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
15892 	      nops++;
15893 	      if (diff & 1)
15894 		{
15895 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
15896 		  nops++;
15897 		}
15898 	    }
15899 	  if (cf != 0)
15900 	    {
15901 	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
15902 	      nops++;
15903 	    }
15904 	  if (!rtx_equal_p (tmp, out))
15905 	    {
15906 	      if (nops == 1)
15907 		out = force_operand (tmp, copy_rtx (out));
15908 	      else
15909 		emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
15910 	    }
15911 	  if (!rtx_equal_p (out, operands[0]))
15912 	    emit_move_insn (operands[0], copy_rtx (out));
15913 
15914 	  return 1; /* DONE */
15915 	}
15916 
15917       /*
15918        * General case:			Jumpful:
15919        *   xorl dest,dest		cmpl op1, op2
15920        *   cmpl op1, op2		movl ct, dest
15921        *   setcc dest			jcc 1f
15922        *   decl dest			movl cf, dest
15923        *   andl (cf-ct),dest		1:
15924        *   addl ct,dest
15925        *
15926        * Size 20.			Size 14.
15927        *
15928        * This is reasonably steep, but branch mispredict costs are
15929        * high on modern CPUs, so consider failing only if optimizing
15930        * for space.
15931        */
15932 
15933       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
15934 	  && BRANCH_COST (optimize_insn_for_speed_p (),
15935 		  	  false) >= 2)
15936 	{
15937 	  if (cf == 0)
15938 	    {
15939 	      enum machine_mode cmp_mode = GET_MODE (ix86_compare_op0);
15940 
15941 	      cf = ct;
15942 	      ct = 0;
15943 
15944 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
15945 		{
15946 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
15947 
15948 		  /* We may be reversing an unordered compare to a normal
15949 		     compare, which is not valid in general (we may convert a
15950 		     non-trapping condition to a trapping one); however, on
15951 		     i386 we currently emit all comparisons unordered.  */
15952 		  code = reverse_condition_maybe_unordered (code);
15953 		}
15954 	      else
15955 		{
15956 		  code = reverse_condition (code);
15957 		  if (compare_code != UNKNOWN)
15958 		    compare_code = reverse_condition (compare_code);
15959 		}
15960 	    }
15961 
15962 	  if (compare_code != UNKNOWN)
15963 	    {
15964 	      /* notl op1	(if needed)
15965 		 sarl $31, op1
15966 		 andl (cf-ct), op1
15967 		 addl ct, op1
15968 
15969 		 For x < 0 (resp. x <= -1) there will be no notl,
15970 		 so if possible swap the constants to get rid of the
15971 		 complement.
15972 		 True/false will be -1/0 while code below (store flag
15973 		 followed by decrement) is 0/-1, so the constants need
15974 		 to be exchanged once more.  */
15975 
15976 	      if (compare_code == GE || !cf)
15977 		{
15978 		  code = reverse_condition (code);
15979 		  compare_code = LT;
15980 		}
15981 	      else
15982 		{
15983 		  HOST_WIDE_INT tmp = cf;
15984 		  cf = ct;
15985 		  ct = tmp;
15986 		}
15987 
15988 	      out = emit_store_flag (out, code, ix86_compare_op0,
15989 				     ix86_compare_op1, VOIDmode, 0, -1);
15990 	    }
15991 	  else
15992 	    {
15993 	      out = emit_store_flag (out, code, ix86_compare_op0,
15994 				     ix86_compare_op1, VOIDmode, 0, 1);
15995 
15996 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out), constm1_rtx,
15997 					 copy_rtx (out), 1, OPTAB_DIRECT);
15998 	    }
15999 
16000 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
16001 				     gen_int_mode (cf - ct, mode),
16002 				     copy_rtx (out), 1, OPTAB_DIRECT);
16003 	  if (ct)
16004 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
16005 				       copy_rtx (out), 1, OPTAB_DIRECT);
16006 	  if (!rtx_equal_p (out, operands[0]))
16007 	    emit_move_insn (operands[0], copy_rtx (out));
16008 
16009 	  return 1; /* DONE */
16010 	}
16011     }
16012 
16013   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
16014     {
16015       /* Try a few more things with specific constants and a variable.  */
16016 
16017       optab op;
16018       rtx var, orig_out, out, tmp;
16019 
16020       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
16021 	return 0; /* FAIL */
16022 
16023       /* If one of the two operands is an interesting constant, load a
16024 	 constant with the above and mask it in with a logical operation.  */
16025 
16026       if (CONST_INT_P (operands[2]))
16027 	{
16028 	  var = operands[3];
16029 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
16030 	    operands[3] = constm1_rtx, op = and_optab;
16031 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
16032 	    operands[3] = const0_rtx, op = ior_optab;
16033 	  else
16034 	    return 0; /* FAIL */
16035 	}
16036       else if (CONST_INT_P (operands[3]))
16037 	{
16038 	  var = operands[2];
16039 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
16040 	    operands[2] = constm1_rtx, op = and_optab;
16041 	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
16042 	    operands[2] = const0_rtx, op = ior_optab;
16043 	  else
16044 	    return 0; /* FAIL */
16045 	}
16046       else
16047         return 0; /* FAIL */
16048 
16049       orig_out = operands[0];
16050       tmp = gen_reg_rtx (mode);
16051       operands[0] = tmp;
16052 
16053       /* Recurse to get the constant loaded.  */
16054       if (ix86_expand_int_movcc (operands) == 0)
16055         return 0; /* FAIL */
16056 
16057       /* Mask in the interesting variable.  */
16058       out = expand_binop (mode, op, var, tmp, orig_out, 0,
16059 			  OPTAB_WIDEN);
16060       if (!rtx_equal_p (out, orig_out))
16061 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
16062 
16063       return 1; /* DONE */
16064     }
16065 
16066   /*
16067    * For comparison with above,
16068    *
16069    * movl cf,dest
16070    * movl ct,tmp
16071    * cmpl op1,op2
16072    * cmovcc tmp,dest
16073    *
16074    * Size 15.
16075    */
16076 
16077   if (! nonimmediate_operand (operands[2], mode))
16078     operands[2] = force_reg (mode, operands[2]);
16079   if (! nonimmediate_operand (operands[3], mode))
16080     operands[3] = force_reg (mode, operands[3]);
16081 
16082   if (! register_operand (operands[2], VOIDmode)
16083       && (mode == QImode
16084           || ! register_operand (operands[3], VOIDmode)))
16085     operands[2] = force_reg (mode, operands[2]);
16086 
16087   if (mode == QImode
16088       && ! register_operand (operands[3], VOIDmode))
16089     operands[3] = force_reg (mode, operands[3]);
16090 
16091   emit_insn (compare_seq);
16092   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16093 			  gen_rtx_IF_THEN_ELSE (mode,
16094 						compare_op, operands[2],
16095 						operands[3])));
16096 
16097   return 1; /* DONE */
16098 }
16099 
16100 /* Swap, force into registers, or otherwise massage the two operands
16101    to an sse comparison with a mask result.  Thus we differ a bit from
16102    ix86_prepare_fp_compare_args which expects to produce a flags result.
16103 
16104    The DEST operand exists to help determine whether to commute commutative
16105    operators.  The POP0/POP1 operands are updated in place.  The new
16106    comparison code is returned, or UNKNOWN if not implementable.  */
16107 
16108 static enum rtx_code
16109 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
16110 				  rtx *pop0, rtx *pop1)
16111 {
16112   rtx tmp;
16113 
16114   switch (code)
16115     {
16116     case LTGT:
16117     case UNEQ:
16118       /* We have no LTGT as an operator.  We could implement it with
16119 	 NE & ORDERED, but this requires an extra temporary.  It's
16120 	 not clear that it's worth it.  */
16121       return UNKNOWN;
16122 
16123     case LT:
16124     case LE:
16125     case UNGT:
16126     case UNGE:
16127       /* These are supported directly.  */
16128       break;
16129 
16130     case EQ:
16131     case NE:
16132     case UNORDERED:
16133     case ORDERED:
16134       /* For commutative operators, try to canonicalize the destination
16135 	 operand to be first in the comparison - this helps reload to
16136 	 avoid extra moves.  */
16137       if (!dest || !rtx_equal_p (dest, *pop1))
16138 	break;
16139       /* FALLTHRU */
16140 
16141     case GE:
16142     case GT:
16143     case UNLE:
16144     case UNLT:
16145       /* These are not supported directly.  Swap the comparison operands
16146 	 to transform into something that is supported.  */
16147       tmp = *pop0;
16148       *pop0 = *pop1;
16149       *pop1 = tmp;
16150       code = swap_condition (code);
16151       break;
16152 
16153     default:
16154       gcc_unreachable ();
16155     }
16156 
16157   return code;
16158 }
16159 
16160 /* Detect conditional moves that exactly match min/max operational
16161    semantics.  Note that this is IEEE safe, as long as we don't
16162    interchange the operands.
16163 
16164    Returns FALSE if this conditional move doesn't match a MIN/MAX,
16165    and TRUE if the operation is successful and instructions are emitted.  */
16166 
16167 static bool
16168 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
16169 			   rtx cmp_op1, rtx if_true, rtx if_false)
16170 {
16171   enum machine_mode mode;
16172   bool is_min;
16173   rtx tmp;
16174 
16175   if (code == LT)
16176     ;
16177   else if (code == UNGE)
16178     {
16179       tmp = if_true;
16180       if_true = if_false;
16181       if_false = tmp;
16182     }
16183   else
16184     return false;
16185 
16186   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
16187     is_min = true;
16188   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
16189     is_min = false;
16190   else
16191     return false;
16192 
16193   mode = GET_MODE (dest);
16194 
16195   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
16196      but MODE may be a vector mode and thus not appropriate.  */
16197   if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
16198     {
16199       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
16200       rtvec v;
16201 
16202       if_true = force_reg (mode, if_true);
16203       v = gen_rtvec (2, if_true, if_false);
16204       tmp = gen_rtx_UNSPEC (mode, v, u);
16205     }
16206   else
16207     {
16208       code = is_min ? SMIN : SMAX;
16209       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
16210     }
16211 
16212   emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
16213   return true;
16214 }
16215 
16216 /* Expand an sse vector comparison.  Return the register with the result.  */
16217 
16218 static rtx
16219 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
16220 		     rtx op_true, rtx op_false)
16221 {
16222   enum machine_mode mode = GET_MODE (dest);
16223   rtx x;
16224 
16225   cmp_op0 = force_reg (mode, cmp_op0);
16226   if (!nonimmediate_operand (cmp_op1, mode))
16227     cmp_op1 = force_reg (mode, cmp_op1);
16228 
16229   if (optimize
16230       || reg_overlap_mentioned_p (dest, op_true)
16231       || reg_overlap_mentioned_p (dest, op_false))
16232     dest = gen_reg_rtx (mode);
16233 
16234   x = gen_rtx_fmt_ee (code, mode, cmp_op0, cmp_op1);
16235   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16236 
16237   return dest;
16238 }
16239 
16240 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
16241    operations.  This is used for both scalar and vector conditional moves.  */
16242 
16243 static void
16244 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
16245 {
16246   enum machine_mode mode = GET_MODE (dest);
16247   rtx t2, t3, x;
16248 
16249   if (op_false == CONST0_RTX (mode))
16250     {
16251       op_true = force_reg (mode, op_true);
16252       x = gen_rtx_AND (mode, cmp, op_true);
16253       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16254     }
16255   else if (op_true == CONST0_RTX (mode))
16256     {
16257       op_false = force_reg (mode, op_false);
16258       x = gen_rtx_NOT (mode, cmp);
16259       x = gen_rtx_AND (mode, x, op_false);
16260       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16261     }
16262   else if (TARGET_XOP)
16263     {
16264       op_true = force_reg (mode, op_true);
16265 
16266       if (!nonimmediate_operand (op_false, mode))
16267 	op_false = force_reg (mode, op_false);
16268 
16269       emit_insn (gen_rtx_SET (mode, dest,
16270 			      gen_rtx_IF_THEN_ELSE (mode, cmp,
16271 						    op_true,
16272 						    op_false)));
16273     }
16274   else
16275     {
16276       op_true = force_reg (mode, op_true);
16277       op_false = force_reg (mode, op_false);
16278 
16279       t2 = gen_reg_rtx (mode);
16280       if (optimize)
16281 	t3 = gen_reg_rtx (mode);
16282       else
16283 	t3 = dest;
16284 
16285       x = gen_rtx_AND (mode, op_true, cmp);
16286       emit_insn (gen_rtx_SET (VOIDmode, t2, x));
16287 
16288       x = gen_rtx_NOT (mode, cmp);
16289       x = gen_rtx_AND (mode, x, op_false);
16290       emit_insn (gen_rtx_SET (VOIDmode, t3, x));
16291 
16292       x = gen_rtx_IOR (mode, t3, t2);
16293       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16294     }
16295 }
16296 
16297 /* Expand a floating-point conditional move.  Return true if successful.  */
16298 
16299 int
16300 ix86_expand_fp_movcc (rtx operands[])
16301 {
16302   enum machine_mode mode = GET_MODE (operands[0]);
16303   enum rtx_code code = GET_CODE (operands[1]);
16304   rtx tmp, compare_op;
16305 
16306   ix86_compare_op0 = XEXP (operands[1], 0);
16307   ix86_compare_op1 = XEXP (operands[1], 1);
16308   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
16309     {
16310       enum machine_mode cmode;
16311 
16312       /* Since we've no cmove for sse registers, don't force bad register
16313 	 allocation just to gain access to it.  Deny movcc when the
16314 	 comparison mode doesn't match the move mode.  */
16315       cmode = GET_MODE (ix86_compare_op0);
16316       if (cmode == VOIDmode)
16317 	cmode = GET_MODE (ix86_compare_op1);
16318       if (cmode != mode)
16319 	return 0;
16320 
16321       code = ix86_prepare_sse_fp_compare_args (operands[0], code,
16322 					       &ix86_compare_op0,
16323 					       &ix86_compare_op1);
16324       if (code == UNKNOWN)
16325 	return 0;
16326 
16327       if (ix86_expand_sse_fp_minmax (operands[0], code, ix86_compare_op0,
16328 				     ix86_compare_op1, operands[2],
16329 				     operands[3]))
16330 	return 1;
16331 
16332       tmp = ix86_expand_sse_cmp (operands[0], code, ix86_compare_op0,
16333 				 ix86_compare_op1, operands[2], operands[3]);
16334       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
16335       return 1;
16336     }
16337 
16338   /* The floating point conditional move instructions don't directly
16339      support conditions resulting from a signed integer comparison.  */
16340 
16341   compare_op = ix86_expand_compare (code);
16342   if (!fcmov_comparison_operator (compare_op, VOIDmode))
16343     {
16344       tmp = gen_reg_rtx (QImode);
16345       ix86_expand_setcc (code, tmp);
16346       code = NE;
16347       ix86_compare_op0 = tmp;
16348       ix86_compare_op1 = const0_rtx;
16349       compare_op = ix86_expand_compare (code);
16350     }
16351 
16352   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16353 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
16354 						operands[2], operands[3])));
16355 
16356   return 1;
16357 }
16358 
16359 /* Expand a floating-point vector conditional move; a vcond operation
16360    rather than a movcc operation.  */
16361 
16362 bool
16363 ix86_expand_fp_vcond (rtx operands[])
16364 {
16365   enum rtx_code code = GET_CODE (operands[3]);
16366   rtx cmp;
16367 
16368   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
16369 					   &operands[4], &operands[5]);
16370   if (code == UNKNOWN)
16371     return false;
16372 
16373   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
16374 				 operands[5], operands[1], operands[2]))
16375     return true;
16376 
16377   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
16378 			     operands[1], operands[2]);
16379   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
16380   return true;
16381 }
16382 
16383 /* Expand a signed/unsigned integral vector conditional move.  */
16384 
16385 bool
16386 ix86_expand_int_vcond (rtx operands[])
16387 {
16388   enum machine_mode mode = GET_MODE (operands[0]);
16389   enum rtx_code code = GET_CODE (operands[3]);
16390   bool negate = false;
16391   rtx x, cop0, cop1;
16392 
16393   cop0 = operands[4];
16394   cop1 = operands[5];
16395 
16396   /* XOP supports all of the comparisons on all vector int types.  */
16397   if (!TARGET_XOP)
16398     {
16399       /* Canonicalize the comparison to EQ, GT, GTU.  */
16400       switch (code)
16401 	{
16402 	case EQ:
16403 	case GT:
16404 	case GTU:
16405 	  break;
16406 
16407 	case NE:
16408 	case LE:
16409 	case LEU:
16410 	  code = reverse_condition (code);
16411 	  negate = true;
16412 	  break;
16413 
16414 	case GE:
16415 	case GEU:
16416 	  code = reverse_condition (code);
16417 	  negate = true;
16418 	  /* FALLTHRU */
16419 
16420 	case LT:
16421 	case LTU:
16422 	  code = swap_condition (code);
16423 	  x = cop0, cop0 = cop1, cop1 = x;
16424 	  break;
16425 
16426 	default:
16427 	  gcc_unreachable ();
16428 	}
16429 
16430       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
16431       if (mode == V2DImode)
16432 	{
16433 	  switch (code)
16434 	    {
16435 	    case EQ:
16436 	      /* SSE4.1 supports EQ.  */
16437 	      if (!TARGET_SSE4_1)
16438 		return false;
16439 	      break;
16440 
16441 	    case GT:
16442 	    case GTU:
16443 	      /* SSE4.2 supports GT/GTU.  */
16444 	      if (!TARGET_SSE4_2)
16445 		return false;
16446 	      break;
16447 
16448 	    default:
16449 	      gcc_unreachable ();
16450 	    }
16451 	}
16452 
16453       /* Unsigned parallel compare is not supported by the hardware.
16454 	 Play some tricks to turn this into a signed comparison or an
16455 	 equality test against zero.  */
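	 /* e.g. for 32-bit elements:  x >u y  <=>  (x - 0x80000000) >s (y - 0x80000000),
	    so biasing both operands by the sign bit turns GTU into GT.  */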
16456       if (code == GTU)
16457 	{
16458 	  cop0 = force_reg (mode, cop0);
16459 
16460 	  switch (mode)
16461 	    {
16462 	    case V4SImode:
16463 	    case V2DImode:
16464 		{
16465 		  rtx t1, t2, mask;
16466 		  rtx (*gen_sub3) (rtx, rtx, rtx);
16467 
16468 		  /* Subtract (-(INT MAX) - 1) from both operands to make
16469 		     them signed.  */
16470 		  mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
16471 						  true, false);
16472 		  gen_sub3 = (mode == V4SImode
16473 			      ? gen_subv4si3 : gen_subv2di3);
16474 		  t1 = gen_reg_rtx (mode);
16475 		  emit_insn (gen_sub3 (t1, cop0, mask));
16476 
16477 		  t2 = gen_reg_rtx (mode);
16478 		  emit_insn (gen_sub3 (t2, cop1, mask));
16479 
16480 		  cop0 = t1;
16481 		  cop1 = t2;
16482 		  code = GT;
16483 		}
16484 	      break;
16485 
16486 	    case V16QImode:
16487 	    case V8HImode:
16488 	      /* Perform a parallel unsigned saturating subtraction.  */
16489 	      x = gen_reg_rtx (mode);
16490 	      emit_insn (gen_rtx_SET (VOIDmode, x,
16491 				      gen_rtx_US_MINUS (mode, cop0, cop1)));
16492 
16493 	      cop0 = x;
16494 	      cop1 = CONST0_RTX (mode);
16495 	      code = EQ;
16496 	      negate = !negate;
16497 	      break;
16498 
16499 	    default:
16500 	      gcc_unreachable ();
16501 	    }
16502 	}
16503     }
16504 
16505   x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
16506 			   operands[1+negate], operands[2-negate]);
16507 
16508   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
16509 			 operands[2-negate]);
16510   return true;
16511 }
16512 
16513 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
16514    true if we should do zero extension, else sign extension.  HIGH_P is
16515    true if we want the N/2 high elements, else the low elements.  */
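/* For sign extension the extra half of each element is an interleave of
   OP[1] with a mask of its sign bits (built with a "0 > OP[1]" compare);
   for zero extension OP[1] is interleaved with a zero vector instead.  */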
16516 
16517 void
16518 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
16519 {
16520   enum machine_mode imode = GET_MODE (operands[1]);
16521   rtx (*unpack)(rtx, rtx, rtx);
16522   rtx se, dest;
16523 
16524   switch (imode)
16525     {
16526     case V16QImode:
16527       if (high_p)
16528         unpack = gen_vec_interleave_highv16qi;
16529       else
16530         unpack = gen_vec_interleave_lowv16qi;
16531       break;
16532     case V8HImode:
16533       if (high_p)
16534         unpack = gen_vec_interleave_highv8hi;
16535       else
16536         unpack = gen_vec_interleave_lowv8hi;
16537       break;
16538     case V4SImode:
16539       if (high_p)
16540         unpack = gen_vec_interleave_highv4si;
16541       else
16542         unpack = gen_vec_interleave_lowv4si;
16543       break;
16544     default:
16545       gcc_unreachable ();
16546     }
16547 
16548   dest = gen_lowpart (imode, operands[0]);
16549 
16550   if (unsigned_p)
16551     se = force_reg (imode, CONST0_RTX (imode));
16552   else
16553     se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
16554                               operands[1], pc_rtx, pc_rtx);
16555 
16556   emit_insn (unpack (dest, operands[1], se));
16557 }
16558 
16559 /* This function performs the same task as ix86_expand_sse_unpack,
16560    but with SSE4.1 instructions.  */
16561 
16562 void
16563 ix86_expand_sse4_unpack (rtx operands[2], bool unsigned_p, bool high_p)
16564 {
16565   enum machine_mode imode = GET_MODE (operands[1]);
16566   rtx (*unpack)(rtx, rtx);
16567   rtx src, dest;
16568 
16569   switch (imode)
16570     {
16571     case V16QImode:
16572       if (unsigned_p)
16573 	unpack = gen_sse4_1_zero_extendv8qiv8hi2;
16574       else
16575 	unpack = gen_sse4_1_extendv8qiv8hi2;
16576       break;
16577     case V8HImode:
16578       if (unsigned_p)
16579 	unpack = gen_sse4_1_zero_extendv4hiv4si2;
16580       else
16581 	unpack = gen_sse4_1_extendv4hiv4si2;
16582       break;
16583     case V4SImode:
16584       if (unsigned_p)
16585 	unpack = gen_sse4_1_zero_extendv2siv2di2;
16586       else
16587 	unpack = gen_sse4_1_extendv2siv2di2;
16588       break;
16589     default:
16590       gcc_unreachable ();
16591     }
16592 
16593   dest = operands[0];
16594   if (high_p)
16595     {
16596       /* Shift higher 8 bytes to lower 8 bytes.  */
16597       src = gen_reg_rtx (imode);
16598       emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, src),
16599 				     gen_lowpart (V1TImode, operands[1]),
16600 				     GEN_INT (64)));
16601     }
16602   else
16603     src = operands[1];
16604 
16605   emit_insn (unpack (dest, src));
16606 }
16607 
16608 /* Expand conditional increment or decrement using adc/sbb instructions.
16609    The default case using setcc followed by the conditional move can be
16610    done by generic code.  */
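/* e.g. an unsigned "x += (a < b)" can be emitted as "cmp a, b; adc x, 0",
   letting the carry flag produced by the compare feed the add directly.  */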
16611 int
16612 ix86_expand_int_addcc (rtx operands[])
16613 {
16614   enum rtx_code code = GET_CODE (operands[1]);
16615   rtx flags;
16616   rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
16617   rtx compare_op;
16618   rtx val = const0_rtx;
16619   bool fpcmp = false;
16620   enum machine_mode mode;
16621 
16622   ix86_compare_op0 = XEXP (operands[1], 0);
16623   ix86_compare_op1 = XEXP (operands[1], 1);
16624   if (operands[3] != const1_rtx
16625       && operands[3] != constm1_rtx)
16626     return 0;
16627   if (!ix86_expand_carry_flag_compare (code, ix86_compare_op0,
16628 				       ix86_compare_op1, &compare_op))
16629      return 0;
16630   code = GET_CODE (compare_op);
16631 
16632   flags = XEXP (compare_op, 0);
16633 
16634   if (GET_MODE (flags) == CCFPmode
16635       || GET_MODE (flags) == CCFPUmode)
16636     {
16637       fpcmp = true;
16638       code = ix86_fp_compare_code_to_integer (code);
16639     }
16640 
16641   if (code != LTU)
16642     {
16643       val = constm1_rtx;
16644       if (fpcmp)
16645 	PUT_CODE (compare_op,
16646 		  reverse_condition_maybe_unordered
16647 		    (GET_CODE (compare_op)));
16648       else
16649 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
16650     }
16651 
16652   mode = GET_MODE (operands[0]);
16653 
16654   /* Construct either adc or sbb insn.  */
16655   if ((code == LTU) == (operands[3] == constm1_rtx))
16656     {
16657       switch (mode)
16658 	{
16659 	  case QImode:
16660 	    insn = gen_subqi3_carry;
16661 	    break;
16662 	  case HImode:
16663 	    insn = gen_subhi3_carry;
16664 	    break;
16665 	  case SImode:
16666 	    insn = gen_subsi3_carry;
16667 	    break;
16668 	  case DImode:
16669 	    insn = gen_subdi3_carry;
16670 	    break;
16671 	  default:
16672 	    gcc_unreachable ();
16673 	}
16674     }
16675   else
16676     {
16677       switch (mode)
16678 	{
16679 	  case QImode:
16680 	    insn = gen_addqi3_carry;
16681 	    break;
16682 	  case HImode:
16683 	    insn = gen_addhi3_carry;
16684 	    break;
16685 	  case SImode:
16686 	    insn = gen_addsi3_carry;
16687 	    break;
16688 	  case DImode:
16689 	    insn = gen_adddi3_carry;
16690 	    break;
16691 	  default:
16692 	    gcc_unreachable ();
16693 	}
16694     }
16695   emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
16696 
16697   return 1; /* DONE */
16698 }
16699 
16700 
16701 /* Split OPERAND into word-sized parts, store them in PARTS, and return
16702    the number of parts (2-4).  Similar to split_di, but also works for
16703    floating point values and non-offsettable memories.  For pushes, it
16704    returns just stack offsets; the values will be saved in the right order.  */
16705 
16706 static int
16707 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
16708 {
16709   int size;
16710 
16711   if (!TARGET_64BIT)
16712     size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
16713   else
16714     size = (GET_MODE_SIZE (mode) + 4) / 8;
16715 
16716   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
16717   gcc_assert (size >= 2 && size <= 4);
16718 
16719   /* Optimize constant pool reference to immediates.  This is used by fp
16720      moves, that force all constants to memory to allow combining.  */
16721   if (MEM_P (operand) && MEM_READONLY_P (operand))
16722     {
16723       rtx tmp = maybe_get_pool_constant (operand);
16724       if (tmp)
16725 	operand = tmp;
16726     }
16727 
16728   if (MEM_P (operand) && !offsettable_memref_p (operand))
16729     {
16730       /* The only non-offsetable memories we handle are pushes.  */
16731       int ok = push_operand (operand, VOIDmode);
16732 
16733       gcc_assert (ok);
16734 
16735       operand = copy_rtx (operand);
16736       PUT_MODE (operand, Pmode);
16737       parts[0] = parts[1] = parts[2] = parts[3] = operand;
16738       return size;
16739     }
16740 
16741   if (GET_CODE (operand) == CONST_VECTOR)
16742     {
16743       enum machine_mode imode = int_mode_for_mode (mode);
16744       /* Caution: if we looked through a constant pool memory above,
16745 	 the operand may actually have a different mode now.  That's
16746 	 ok, since we want to pun this all the way back to an integer.  */
16747       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
16748       gcc_assert (operand != NULL);
16749       mode = imode;
16750     }
16751 
16752   if (!TARGET_64BIT)
16753     {
16754       if (mode == DImode)
16755 	split_di (&operand, 1, &parts[0], &parts[1]);
16756       else
16757 	{
16758 	  int i;
16759 
16760 	  if (REG_P (operand))
16761 	    {
16762 	      gcc_assert (reload_completed);
16763 	      for (i = 0; i < size; i++)
16764 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
16765 	    }
16766 	  else if (offsettable_memref_p (operand))
16767 	    {
16768 	      operand = adjust_address (operand, SImode, 0);
16769 	      parts[0] = operand;
16770 	      for (i = 1; i < size; i++)
16771 		parts[i] = adjust_address (operand, SImode, 4 * i);
16772 	    }
16773 	  else if (GET_CODE (operand) == CONST_DOUBLE)
16774 	    {
16775 	      REAL_VALUE_TYPE r;
16776 	      long l[4];
16777 
16778 	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
16779 	      switch (mode)
16780 		{
16781 		case TFmode:
16782 		  real_to_target (l, &r, mode);
16783 		  parts[3] = gen_int_mode (l[3], SImode);
16784 		  parts[2] = gen_int_mode (l[2], SImode);
16785 		  break;
16786 		case XFmode:
16787 		  REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
16788 		  parts[2] = gen_int_mode (l[2], SImode);
16789 		  break;
16790 		case DFmode:
16791 		  REAL_VALUE_TO_TARGET_DOUBLE (r, l);
16792 		  break;
16793 		default:
16794 		  gcc_unreachable ();
16795 		}
16796 	      parts[1] = gen_int_mode (l[1], SImode);
16797 	      parts[0] = gen_int_mode (l[0], SImode);
16798 	    }
16799 	  else
16800 	    gcc_unreachable ();
16801 	}
16802     }
16803   else
16804     {
16805       if (mode == TImode)
16806 	split_ti (&operand, 1, &parts[0], &parts[1]);
16807       if (mode == XFmode || mode == TFmode)
16808 	{
16809 	  enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
16810 	  if (REG_P (operand))
16811 	    {
16812 	      gcc_assert (reload_completed);
16813 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
16814 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
16815 	    }
16816 	  else if (offsettable_memref_p (operand))
16817 	    {
16818 	      operand = adjust_address (operand, DImode, 0);
16819 	      parts[0] = operand;
16820 	      parts[1] = adjust_address (operand, upper_mode, 8);
16821 	    }
16822 	  else if (GET_CODE (operand) == CONST_DOUBLE)
16823 	    {
16824 	      REAL_VALUE_TYPE r;
16825 	      long l[4];
16826 
16827 	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
16828 	      real_to_target (l, &r, mode);
16829 
16830 	      /* Do not use shift by 32 to avoid warning on 32bit systems.  */
16831 	      if (HOST_BITS_PER_WIDE_INT >= 64)
16832 	        parts[0]
16833 		  = gen_int_mode
16834 		      ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
16835 		       + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
16836 		       DImode);
16837 	      else
16838 	        parts[0] = immed_double_const (l[0], l[1], DImode);
16839 
16840 	      if (upper_mode == SImode)
16841 	        parts[1] = gen_int_mode (l[2], SImode);
16842 	      else if (HOST_BITS_PER_WIDE_INT >= 64)
16843 	        parts[1]
16844 		  = gen_int_mode
16845 		      ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
16846 		       + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
16847 		       DImode);
16848 	      else
16849 	        parts[1] = immed_double_const (l[2], l[3], DImode);
16850 	    }
16851 	  else
16852 	    gcc_unreachable ();
16853 	}
16854     }
16855 
16856   return size;
16857 }
16858 
16859 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
16860    The source and destination are split into word-sized parts and the
16861    part moves are emitted in an order that avoids clobbering a source
16862    part before it has been read.  */
16863 
16864 void
16865 ix86_split_long_move (rtx operands[])
16866 {
16867   rtx part[2][4];
16868   int nparts, i, j;
16869   int push = 0;
16870   int collisions = 0;
16871   enum machine_mode mode = GET_MODE (operands[0]);
16872   bool collisionparts[4];
16873 
16874   /* The DFmode expanders may ask us to move a double.
16875      For a 64-bit target this is a single move.  By hiding the fact
16876      here we simplify the i386.md splitters.  */
16877   if (GET_MODE_SIZE (GET_MODE (operands[0])) == 8 && TARGET_64BIT)
16878     {
16879       /* Optimize constant pool reference to immediates.  This is used by
16880 	 fp moves, that force all constants to memory to allow combining.  */
16881 
16882       if (MEM_P (operands[1])
16883 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
16884 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
16885 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
16886       if (push_operand (operands[0], VOIDmode))
16887 	{
16888 	  operands[0] = copy_rtx (operands[0]);
16889 	  PUT_MODE (operands[0], Pmode);
16890 	}
16891       else
16892         operands[0] = gen_lowpart (DImode, operands[0]);
16893       operands[1] = gen_lowpart (DImode, operands[1]);
16894       emit_move_insn (operands[0], operands[1]);
16895       return;
16896     }
16897 
16898   /* The only non-offsettable memory we handle is push.  */
16899   if (push_operand (operands[0], VOIDmode))
16900     push = 1;
16901   else
16902     gcc_assert (!MEM_P (operands[0])
16903 		|| offsettable_memref_p (operands[0]));
16904 
16905   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
16906   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
16907 
16908   /* When emitting push, take care for source operands on the stack.  */
16909   if (push && MEM_P (operands[1])
16910       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
16911     {
16912       rtx src_base = XEXP (part[1][nparts - 1], 0);
16913 
16914       /* Compensate for the stack decrement by 4.  */
16915       if (!TARGET_64BIT && nparts == 3
16916 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
16917 	src_base = plus_constant (src_base, 4);
16918 
16919       /* src_base refers to the stack pointer and is
16920 	 automatically decreased by emitted push.  */
16921       for (i = 0; i < nparts; i++)
16922 	part[1][i] = change_address (part[1][i],
16923 				     GET_MODE (part[1][i]), src_base);
16924     }
16925 
16926   /* We need to do copy in the right order in case an address register
16927      of the source overlaps the destination.  */
16928   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
16929     {
16930       rtx tmp;
16931 
16932       for (i = 0; i < nparts; i++)
16933 	{
16934 	  collisionparts[i]
16935 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
16936 	  if (collisionparts[i])
16937 	    collisions++;
16938 	}
16939 
16940       /* Collision in the middle part can be handled by reordering.  */
16941       if (collisions == 1 && nparts == 3 && collisionparts [1])
16942 	{
16943 	  tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
16944 	  tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
16945 	}
16946       else if (collisions == 1
16947 	       && nparts == 4
16948 	       && (collisionparts [1] || collisionparts [2]))
16949 	{
16950 	  if (collisionparts [1])
16951 	    {
16952 	      tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
16953 	      tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
16954 	    }
16955 	  else
16956 	    {
16957 	      tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
16958 	      tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
16959 	    }
16960 	}
16961 
16962       /* If there are more collisions, we can't handle it by reordering.
16963 	 Do an lea to the last part and use only one colliding move.  */
16964       else if (collisions > 1)
16965 	{
16966 	  rtx base;
16967 
16968 	  collisions = 1;
16969 
16970 	  base = part[0][nparts - 1];
16971 
16972 	  /* Handle the case when the last part isn't valid for lea.
16973 	     Happens in 64-bit mode storing the 12-byte XFmode.  */
16974 	  if (GET_MODE (base) != Pmode)
16975 	    base = gen_rtx_REG (Pmode, REGNO (base));
16976 
16977 	  emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
16978 	  part[1][0] = replace_equiv_address (part[1][0], base);
16979 	  for (i = 1; i < nparts; i++)
16980 	    {
16981 	      tmp = plus_constant (base, UNITS_PER_WORD * i);
16982 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
16983 	    }
16984 	}
16985     }
16986 
16987   if (push)
16988     {
16989       if (!TARGET_64BIT)
16990 	{
16991 	  if (nparts == 3)
16992 	    {
16993 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
16994                 emit_insn (gen_addsi3 (stack_pointer_rtx,
16995 				       stack_pointer_rtx, GEN_INT (-4)));
16996 	      emit_move_insn (part[0][2], part[1][2]);
16997 	    }
16998 	  else if (nparts == 4)
16999 	    {
17000 	      emit_move_insn (part[0][3], part[1][3]);
17001 	      emit_move_insn (part[0][2], part[1][2]);
17002 	    }
17003 	}
17004       else
17005 	{
17006 	  /* In 64-bit mode we don't have a 32-bit push available.  If this is
17007 	     a register, it is OK - we will just use the larger counterpart.  We
17008 	     also retype memory - this comes from an attempt to avoid a REX
17009 	     prefix when moving the second half of a TFmode value.  */
17010 	  if (GET_MODE (part[1][1]) == SImode)
17011 	    {
17012 	      switch (GET_CODE (part[1][1]))
17013 		{
17014 		case MEM:
17015 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
17016 		  break;
17017 
17018 		case REG:
17019 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
17020 		  break;
17021 
17022 		default:
17023 		  gcc_unreachable ();
17024 		}
17025 
17026 	      if (GET_MODE (part[1][0]) == SImode)
17027 		part[1][0] = part[1][1];
17028 	    }
17029 	}
17030       emit_move_insn (part[0][1], part[1][1]);
17031       emit_move_insn (part[0][0], part[1][0]);
17032       return;
17033     }
17034 
17035   /* Choose correct order to not overwrite the source before it is copied.  */
17036   if ((REG_P (part[0][0])
17037        && REG_P (part[1][1])
17038        && (REGNO (part[0][0]) == REGNO (part[1][1])
17039 	   || (nparts == 3
17040 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
17041 	   || (nparts == 4
17042 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
17043       || (collisions > 0
17044 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
17045     {
17046       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
17047 	{
17048 	  operands[2 + i] = part[0][j];
17049 	  operands[6 + i] = part[1][j];
17050 	}
17051     }
17052   else
17053     {
17054       for (i = 0; i < nparts; i++)
17055 	{
17056 	  operands[2 + i] = part[0][i];
17057 	  operands[6 + i] = part[1][i];
17058 	}
17059     }
17060 
17061   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
17062   if (optimize_insn_for_size_p ())
17063     {
17064       for (j = 0; j < nparts - 1; j++)
17065 	if (CONST_INT_P (operands[6 + j])
17066 	    && operands[6 + j] != const0_rtx
17067 	    && REG_P (operands[2 + j]))
17068 	  for (i = j; i < nparts - 1; i++)
17069 	    if (CONST_INT_P (operands[7 + i])
17070 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
17071 	      operands[7 + i] = operands[2 + j];
17072     }
17073 
17074   for (i = 0; i < nparts; i++)
17075     emit_move_insn (operands[2 + i], operands[6 + i]);
17076 
17077   return;
17078 }
17079 
17080 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
17081    left shift by a constant, either using a single shift or
17082    a sequence of add instructions.  */
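   /* Since "x += x" doubles X, COUNT additions implement "x <<= COUNT"; that
      path is taken only when the summed add cost does not exceed the cost of
      one constant shift and we are not optimizing for size.  */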
17083 
17084 static void
17085 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
17086 {
17087   if (count == 1)
17088     {
17089       emit_insn ((mode == DImode
17090 		  ? gen_addsi3
17091 		  : gen_adddi3) (operand, operand, operand));
17092     }
17093   else if (!optimize_insn_for_size_p ()
17094 	   && count * ix86_cost->add <= ix86_cost->shift_const)
17095     {
17096       int i;
17097       for (i=0; i<count; i++)
17098 	{
17099 	  emit_insn ((mode == DImode
17100 		      ? gen_addsi3
17101 		      : gen_adddi3) (operand, operand, operand));
17102 	}
17103     }
17104   else
17105     emit_insn ((mode == DImode
17106 		? gen_ashlsi3
17107 		: gen_ashldi3) (operand, operand, GEN_INT (count)));
17108 }
17109 
17110 void
17111 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
17112 {
17113   rtx low[2], high[2];
17114   int count;
17115   const int single_width = mode == DImode ? 32 : 64;
17116 
17117   if (CONST_INT_P (operands[2]))
17118     {
17119       (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
17120       count = INTVAL (operands[2]) & (single_width * 2 - 1);
17121 
17122       if (count >= single_width)
17123 	{
17124 	  emit_move_insn (high[0], low[1]);
17125 	  emit_move_insn (low[0], const0_rtx);
17126 
17127 	  if (count > single_width)
17128 	    ix86_expand_ashl_const (high[0], count - single_width, mode);
17129 	}
17130       else
17131 	{
17132 	  if (!rtx_equal_p (operands[0], operands[1]))
17133 	    emit_move_insn (operands[0], operands[1]);
17134 	  emit_insn ((mode == DImode
17135 		     ? gen_x86_shld
17136 		     : gen_x86_64_shld) (high[0], low[0], GEN_INT (count)));
17137 	  ix86_expand_ashl_const (low[0], count, mode);
17138 	}
17139       return;
17140     }
17141 
17142   (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
17143 
17144   if (operands[1] == const1_rtx)
17145     {
17146       /* Assuming we've chosen QImode-capable registers, then 1 << N
17147 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
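	 /* i.e. clear both halves, use setcc on bit 5 (bit 6 for 64-bit) of the
	    shift count to put a 1 into either the low or the high half, then
	    shift both halves left by the count modulo the word size.  */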
17148       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
17149 	{
17150 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
17151 
17152 	  ix86_expand_clear (low[0]);
17153 	  ix86_expand_clear (high[0]);
17154 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (single_width)));
17155 
17156 	  d = gen_lowpart (QImode, low[0]);
17157 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
17158 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
17159 	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
17160 
17161 	  d = gen_lowpart (QImode, high[0]);
17162 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
17163 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
17164 	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
17165 	}
17166 
17167       /* Otherwise, we can get the same results by manually performing
17168 	 a bit extract operation on bit 5/6, and then performing the two
17169 	 shifts.  The two methods of getting 0/1 into low/high are exactly
17170 	 the same size.  Avoiding the shift in the bit extract case helps
17171 	 pentium4 a bit; no one else seems to care much either way.  */
17172       else
17173 	{
17174 	  rtx x;
17175 
17176 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
17177 	    x = gen_rtx_ZERO_EXTEND (mode == DImode ? SImode : DImode, operands[2]);
17178 	  else
17179 	    x = gen_lowpart (mode == DImode ? SImode : DImode, operands[2]);
17180 	  emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
17181 
17182 	  emit_insn ((mode == DImode
17183 		      ? gen_lshrsi3
17184 		      : gen_lshrdi3) (high[0], high[0],
17185 				      GEN_INT (mode == DImode ? 5 : 6)));
17186 	  emit_insn ((mode == DImode
17187 		      ? gen_andsi3
17188 		      : gen_anddi3) (high[0], high[0], const1_rtx));
17189 	  emit_move_insn (low[0], high[0]);
17190 	  emit_insn ((mode == DImode
17191 		      ? gen_xorsi3
17192 		      : gen_xordi3) (low[0], low[0], const1_rtx));
17193 	}
17194 
17195       emit_insn ((mode == DImode
17196 		    ? gen_ashlsi3
17197 		    : gen_ashldi3) (low[0], low[0], operands[2]));
17198       emit_insn ((mode == DImode
17199 		    ? gen_ashlsi3
17200 		    : gen_ashldi3) (high[0], high[0], operands[2]));
17201       return;
17202     }
17203 
17204   if (operands[1] == constm1_rtx)
17205     {
17206       /* For -1 << N, we can avoid the shld instruction, because we
17207 	 know that we're shifting 0...31/63 ones into a -1.  */
17208       emit_move_insn (low[0], constm1_rtx);
17209       if (optimize_insn_for_size_p ())
17210 	emit_move_insn (high[0], low[0]);
17211       else
17212 	emit_move_insn (high[0], constm1_rtx);
17213     }
17214   else
17215     {
17216       if (!rtx_equal_p (operands[0], operands[1]))
17217 	emit_move_insn (operands[0], operands[1]);
17218 
17219       (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
17220       emit_insn ((mode == DImode
17221 		  ? gen_x86_shld
17222 		  : gen_x86_64_shld) (high[0], low[0], operands[2]));
17223     }
17224 
17225   emit_insn ((mode == DImode ? gen_ashlsi3 : gen_ashldi3) (low[0], low[0], operands[2]));
17226 
17227   if (TARGET_CMOVE && scratch)
17228     {
17229       ix86_expand_clear (scratch);
17230       emit_insn ((mode == DImode
17231 		  ? gen_x86_shift_adj_1
17232 		  : gen_x86_64_shift_adj_1) (high[0], low[0], operands[2],
17233 					     scratch));
17234     }
17235   else
17236     emit_insn ((mode == DImode
17237 		? gen_x86_shift_adj_2
17238 		: gen_x86_64_shift_adj_2) (high[0], low[0], operands[2]));
17239 }
17240 
17241 void
17242 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
17243 {
17244   rtx low[2], high[2];
17245   int count;
17246   const int single_width = mode == DImode ? 32 : 64;
17247 
17248   if (CONST_INT_P (operands[2]))
17249     {
17250       (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
17251       count = INTVAL (operands[2]) & (single_width * 2 - 1);
17252 
17253       if (count == single_width * 2 - 1)
17254 	{
17255 	  emit_move_insn (high[0], high[1]);
17256 	  emit_insn ((mode == DImode
17257 		      ? gen_ashrsi3
17258 		      : gen_ashrdi3) (high[0], high[0],
17259 				      GEN_INT (single_width - 1)));
17260 	  emit_move_insn (low[0], high[0]);
17261 
17262 	}
17263       else if (count >= single_width)
17264 	{
17265 	  emit_move_insn (low[0], high[1]);
17266 	  emit_move_insn (high[0], low[0]);
17267 	  emit_insn ((mode == DImode
17268 		      ? gen_ashrsi3
17269 		      : gen_ashrdi3) (high[0], high[0],
17270 				      GEN_INT (single_width - 1)));
17271 	  if (count > single_width)
17272 	    emit_insn ((mode == DImode
17273 			? gen_ashrsi3
17274 			: gen_ashrdi3) (low[0], low[0],
17275 					GEN_INT (count - single_width)));
17276 	}
17277       else
17278 	{
17279 	  if (!rtx_equal_p (operands[0], operands[1]))
17280 	    emit_move_insn (operands[0], operands[1]);
17281 	  emit_insn ((mode == DImode
17282 		      ? gen_x86_shrd
17283 		      : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
17284 	  emit_insn ((mode == DImode
17285 		      ? gen_ashrsi3
17286 		      : gen_ashrdi3) (high[0], high[0], GEN_INT (count)));
17287 	}
17288     }
17289   else
17290     {
17291       if (!rtx_equal_p (operands[0], operands[1]))
17292 	emit_move_insn (operands[0], operands[1]);
17293 
17294       (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
17295 
17296       emit_insn ((mode == DImode
17297 		  ? gen_x86_shrd
17298 		  : gen_x86_64_shrd) (low[0], high[0], operands[2]));
17299       emit_insn ((mode == DImode
17300 		  ? gen_ashrsi3
17301 		  : gen_ashrdi3)  (high[0], high[0], operands[2]));
17302 
17303       if (TARGET_CMOVE && scratch)
17304 	{
17305 	  emit_move_insn (scratch, high[0]);
17306 	  emit_insn ((mode == DImode
17307 		      ? gen_ashrsi3
17308 		      : gen_ashrdi3) (scratch, scratch,
17309 				      GEN_INT (single_width - 1)));
17310 	  emit_insn ((mode == DImode
17311 		      ? gen_x86_shift_adj_1
17312 		      : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2],
17313 						 scratch));
17314 	}
17315       else
17316 	emit_insn ((mode == DImode
17317 		    ? gen_x86_shift_adj_3
17318 		    : gen_x86_64_shift_adj_3) (low[0], high[0], operands[2]));
17319     }
17320 }
17321 
17322 void
17323 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
17324 {
17325   rtx low[2], high[2];
17326   int count;
17327   const int single_width = mode == DImode ? 32 : 64;
17328 
17329   if (CONST_INT_P (operands[2]))
17330     {
17331       (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
17332       count = INTVAL (operands[2]) & (single_width * 2 - 1);
17333 
17334       if (count >= single_width)
17335 	{
17336 	  emit_move_insn (low[0], high[1]);
17337 	  ix86_expand_clear (high[0]);
17338 
17339 	  if (count > single_width)
17340 	    emit_insn ((mode == DImode
17341 			? gen_lshrsi3
17342 			: gen_lshrdi3) (low[0], low[0],
17343 					GEN_INT (count - single_width)));
17344 	}
17345       else
17346 	{
17347 	  if (!rtx_equal_p (operands[0], operands[1]))
17348 	    emit_move_insn (operands[0], operands[1]);
17349 	  emit_insn ((mode == DImode
17350 		      ? gen_x86_shrd
17351 		      : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
17352 	  emit_insn ((mode == DImode
17353 		      ? gen_lshrsi3
17354 		      : gen_lshrdi3) (high[0], high[0], GEN_INT (count)));
17355 	}
17356     }
17357   else
17358     {
17359       if (!rtx_equal_p (operands[0], operands[1]))
17360 	emit_move_insn (operands[0], operands[1]);
17361 
17362       (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
17363 
17364       emit_insn ((mode == DImode
17365 		  ? gen_x86_shrd
17366 		  : gen_x86_64_shrd) (low[0], high[0], operands[2]));
17367       emit_insn ((mode == DImode
17368 		  ? gen_lshrsi3
17369 		  : gen_lshrdi3) (high[0], high[0], operands[2]));
17370 
17371       /* Heh.  By reversing the arguments, we can reuse this pattern.  */
17372       if (TARGET_CMOVE && scratch)
17373 	{
17374 	  ix86_expand_clear (scratch);
17375 	  emit_insn ((mode == DImode
17376 		      ? gen_x86_shift_adj_1
17377 		      : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2],
17378 						 scratch));
17379 	}
17380       else
17381 	emit_insn ((mode == DImode
17382 		    ? gen_x86_shift_adj_2
17383 		    : gen_x86_64_shift_adj_2) (low[0], high[0], operands[2]));
17384     }
17385 }
17386 
17387 /* Predict just emitted jump instruction to be taken with probability PROB.  */
17388 static void
17389 predict_jump (int prob)
17390 {
17391   rtx insn = get_last_insn ();
17392   gcc_assert (JUMP_P (insn));
17393   add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
17394 }
17395 
17396 /* Helper function for the string operations below.  Test VARIABLE whether
17397    it is aligned to VALUE bytes.  If true, jump to the label.  */
17398 static rtx
17399 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
17400 {
17401   rtx label = gen_label_rtx ();
17402   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
17403   if (GET_MODE (variable) == DImode)
17404     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
17405   else
17406     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
17407   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
17408 			   1, label);
17409   if (epilogue)
17410     predict_jump (REG_BR_PROB_BASE * 50 / 100);
17411   else
17412     predict_jump (REG_BR_PROB_BASE * 90 / 100);
17413   return label;
17414 }
17415 
17416 /* Decrease COUNTREG by VALUE.  */
17417 static void
17418 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
17419 {
17420   if (GET_MODE (countreg) == DImode)
17421     emit_insn (gen_adddi3 (countreg, countreg, GEN_INT (-value)));
17422   else
17423     emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-value)));
17424 }
17425 
17426 /* Zero extend possibly SImode EXP to Pmode register.  */
17427 rtx
17428 ix86_zero_extend_to_Pmode (rtx exp)
17429 {
17430   rtx r;
17431   if (GET_MODE (exp) == VOIDmode)
17432     return force_reg (Pmode, exp);
17433   if (GET_MODE (exp) == Pmode)
17434     return copy_to_mode_reg (Pmode, exp);
17435   r = gen_reg_rtx (Pmode);
17436   emit_insn (gen_zero_extendsidi2 (r, exp));
17437   return r;
17438 }
17439 
17440 /* Divide COUNTREG by SCALE.  */
17441 static rtx
17442 scale_counter (rtx countreg, int scale)
17443 {
17444   rtx sc;
17445 
17446   if (scale == 1)
17447     return countreg;
17448   if (CONST_INT_P (countreg))
17449     return GEN_INT (INTVAL (countreg) / scale);
17450   gcc_assert (REG_P (countreg));
17451 
17452   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
17453 			    GEN_INT (exact_log2 (scale)),
17454 			    NULL, 1, OPTAB_DIRECT);
17455   return sc;
17456 }
17457 
17458 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
17459    DImode for constant loop counts.  */
17460 
17461 static enum machine_mode
17462 counter_mode (rtx count_exp)
17463 {
17464   if (GET_MODE (count_exp) != VOIDmode)
17465     return GET_MODE (count_exp);
17466   if (!CONST_INT_P (count_exp))
17467     return Pmode;
17468   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
17469     return DImode;
17470   return SImode;
17471 }
17472 
17473 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
17474    to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the
17475    overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output
17476    the equivalent loop to set the memory to VALUE (supposed to be in MODE).
17477 
17478    The size is rounded down to a whole number of chunks moved at once.
17479    SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.  */
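/* Roughly, the emitted sequence is:
     size = count & ~(piece - 1);  iter = 0;
     do { move or set piece bytes at DEST + iter (from SRC + iter or VALUE);
          iter += piece; } while (iter < size);
     DEST += iter;  if (SRC) SRC += iter;
   where piece = GET_MODE_SIZE (MODE) * UNROLL.  */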
17480 
17481 
17482 static void
17483 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
17484 			       rtx destptr, rtx srcptr, rtx value,
17485 			       rtx count, enum machine_mode mode, int unroll,
17486 			       int expected_size)
17487 {
17488   rtx out_label, top_label, iter, tmp;
17489   enum machine_mode iter_mode = counter_mode (count);
17490   rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
17491   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
17492   rtx size;
17493   rtx x_addr;
17494   rtx y_addr;
17495   int i;
17496 
17497   top_label = gen_label_rtx ();
17498   out_label = gen_label_rtx ();
17499   iter = gen_reg_rtx (iter_mode);
17500 
17501   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
17502 			      NULL, 1, OPTAB_DIRECT);
17503   /* Those two should combine.  */
17504   if (piece_size == const1_rtx)
17505     {
17506       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
17507 			       true, out_label);
17508       predict_jump (REG_BR_PROB_BASE * 10 / 100);
17509     }
17510   emit_move_insn (iter, const0_rtx);
17511 
17512   emit_label (top_label);
17513 
17514   tmp = convert_modes (Pmode, iter_mode, iter, true);
17515   x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
17516   destmem = change_address (destmem, mode, x_addr);
17517 
17518   if (srcmem)
17519     {
17520       y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
17521       srcmem = change_address (srcmem, mode, y_addr);
17522 
17523       /* When unrolling for chips that reorder memory reads and writes,
17524 	 we can save registers by using a single temporary.
17525 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
17526       if (!TARGET_64BIT && 0)
17527 	{
17528 	  for (i = 0; i < unroll; i++)
17529 	    {
17530 	      if (i)
17531 		{
17532 		  destmem =
17533 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
17534 		  srcmem =
17535 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
17536 		}
17537 	      emit_move_insn (destmem, srcmem);
17538 	    }
17539 	}
17540       else
17541 	{
17542 	  rtx tmpreg[4];
17543 	  gcc_assert (unroll <= 4);
17544 	  for (i = 0; i < unroll; i++)
17545 	    {
17546 	      tmpreg[i] = gen_reg_rtx (mode);
17547 	      if (i)
17548 		{
17549 		  srcmem =
17550 		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
17551 		}
17552 	      emit_move_insn (tmpreg[i], srcmem);
17553 	    }
17554 	  for (i = 0; i < unroll; i++)
17555 	    {
17556 	      if (i)
17557 		{
17558 		  destmem =
17559 		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
17560 		}
17561 	      emit_move_insn (destmem, tmpreg[i]);
17562 	    }
17563 	}
17564     }
17565   else
17566     for (i = 0; i < unroll; i++)
17567       {
17568 	if (i)
17569 	  destmem =
17570 	    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
17571 	emit_move_insn (destmem, value);
17572       }
17573 
17574   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
17575 			     true, OPTAB_LIB_WIDEN);
17576   if (tmp != iter)
17577     emit_move_insn (iter, tmp);
17578 
17579   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
17580 			   true, top_label);
17581   if (expected_size != -1)
17582     {
17583       expected_size /= GET_MODE_SIZE (mode) * unroll;
17584       if (expected_size == 0)
17585 	predict_jump (0);
17586       else if (expected_size > REG_BR_PROB_BASE)
17587 	predict_jump (REG_BR_PROB_BASE - 1);
17588       else
17589         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
17590     }
17591   else
17592     predict_jump (REG_BR_PROB_BASE * 80 / 100);
17593   iter = ix86_zero_extend_to_Pmode (iter);
17594   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
17595 			     true, OPTAB_LIB_WIDEN);
17596   if (tmp != destptr)
17597     emit_move_insn (destptr, tmp);
17598   if (srcptr)
17599     {
17600       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
17601 				 true, OPTAB_LIB_WIDEN);
17602       if (tmp != srcptr)
17603 	emit_move_insn (srcptr, tmp);
17604     }
17605   emit_label (out_label);
17606 }
17607 
17608 /* Output "rep; mov" instruction.
17609    Arguments have the same meaning as for the previous function.  */
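/* DESTEXP and SRCEXP computed below give the final pointer values
   (pointer + count * element size); the rep_mov pattern uses them to
   describe how "rep movs" advances the pointer registers.  */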
17610 static void
17611 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
17612 			   rtx destptr, rtx srcptr,
17613 			   rtx count,
17614 			   enum machine_mode mode)
17615 {
17616   rtx destexp;
17617   rtx srcexp;
17618   rtx countreg;
17619 
17620   /* If the size is known, it is shorter to use rep movs.  */
17621   if (mode == QImode && CONST_INT_P (count)
17622       && !(INTVAL (count) & 3))
17623     mode = SImode;
17624 
17625   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
17626     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
17627   if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
17628     srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
17629   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
17630   if (mode != QImode)
17631     {
17632       destexp = gen_rtx_ASHIFT (Pmode, countreg,
17633 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
17634       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
17635       srcexp = gen_rtx_ASHIFT (Pmode, countreg,
17636 			       GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
17637       srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
17638     }
17639   else
17640     {
17641       destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
17642       srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
17643     }
17644   if (CONST_INT_P (count))
17645     {
17646       count = GEN_INT (INTVAL (count)
17647 		       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
17648       destmem = shallow_copy_rtx (destmem);
17649       srcmem = shallow_copy_rtx (srcmem);
17650       set_mem_size (destmem, count);
17651       set_mem_size (srcmem, count);
17652     }
17653   else
17654     {
17655       if (MEM_SIZE (destmem))
17656 	set_mem_size (destmem, NULL_RTX);
17657       if (MEM_SIZE (srcmem))
17658 	set_mem_size (srcmem, NULL_RTX);
17659     }
17660   emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
17661 			  destexp, srcexp));
17662 }
17663 
17664 /* Output "rep; stos" instruction.
17665    Arguments have the same meaning as for the previous function.  */
17666 static void
17667 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
17668 			    rtx count, enum machine_mode mode,
17669 			    rtx orig_value)
17670 {
17671   rtx destexp;
17672   rtx countreg;
17673 
17674   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
17675     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
17676   value = force_reg (mode, gen_lowpart (mode, value));
17677   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
17678   if (mode != QImode)
17679     {
17680       destexp = gen_rtx_ASHIFT (Pmode, countreg,
17681 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
17682       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
17683     }
17684   else
17685     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
17686   if (orig_value == const0_rtx && CONST_INT_P (count))
17687     {
17688       count = GEN_INT (INTVAL (count)
17689 		       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
17690       destmem = shallow_copy_rtx (destmem);
17691       set_mem_size (destmem, count);
17692     }
17693   else if (MEM_SIZE (destmem))
17694     set_mem_size (destmem, NULL_RTX);
17695   emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
17696 }
17697 
17698 static void
17699 emit_strmov (rtx destmem, rtx srcmem,
17700 	     rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
17701 {
17702   rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
17703   rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
17704   emit_insn (gen_strmov (destptr, dest, srcptr, src));
17705 }
17706 
17707 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
17708 static void
17709 expand_movmem_epilogue (rtx destmem, rtx srcmem,
17710 			rtx destptr, rtx srcptr, rtx count, int max_size)
17711 {
17712   rtx src, dest;
17713   if (CONST_INT_P (count))
17714     {
17715       HOST_WIDE_INT countval = INTVAL (count);
17716       int offset = 0;
17717 
17718       if ((countval & 0x10) && max_size > 16)
17719 	{
17720 	  if (TARGET_64BIT)
17721 	    {
17722 	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
17723 	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
17724 	    }
17725 	  else
17726 	    gcc_unreachable ();
17727 	  offset += 16;
17728 	}
17729       if ((countval & 0x08) && max_size > 8)
17730 	{
17731 	  if (TARGET_64BIT)
17732 	    emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
17733 	  else
17734 	    {
17735 	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
17736 	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
17737 	    }
17738 	  offset += 8;
17739 	}
17740       if ((countval & 0x04) && max_size > 4)
17741 	{
17742           emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
17743 	  offset += 4;
17744 	}
17745       if ((countval & 0x02) && max_size > 2)
17746 	{
17747           emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
17748 	  offset += 2;
17749 	}
17750       if ((countval & 0x01) && max_size > 1)
17751 	{
17752           emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
17753 	  offset += 1;
17754 	}
17755       return;
17756     }
17757   if (max_size > 8)
17758     {
17759       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
17760 				    count, 1, OPTAB_DIRECT);
17761       expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
17762 				     count, QImode, 1, 4);
17763       return;
17764     }
17765 
17766   /* When single stringop instructions (TARGET_SINGLE_STRINGOP) are in use,
17767      we can cheaply increase the dest and src pointers.  Otherwise we save
17768      code size by maintaining an offset (zero is readily available from the
17769      preceding rep operation) and using x86 addressing modes.  */
17770   if (TARGET_SINGLE_STRINGOP)
17771     {
17772       if (max_size > 4)
17773 	{
17774 	  rtx label = ix86_expand_aligntest (count, 4, true);
17775 	  src = change_address (srcmem, SImode, srcptr);
17776 	  dest = change_address (destmem, SImode, destptr);
17777 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
17778 	  emit_label (label);
17779 	  LABEL_NUSES (label) = 1;
17780 	}
17781       if (max_size > 2)
17782 	{
17783 	  rtx label = ix86_expand_aligntest (count, 2, true);
17784 	  src = change_address (srcmem, HImode, srcptr);
17785 	  dest = change_address (destmem, HImode, destptr);
17786 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
17787 	  emit_label (label);
17788 	  LABEL_NUSES (label) = 1;
17789 	}
17790       if (max_size > 1)
17791 	{
17792 	  rtx label = ix86_expand_aligntest (count, 1, true);
17793 	  src = change_address (srcmem, QImode, srcptr);
17794 	  dest = change_address (destmem, QImode, destptr);
17795 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
17796 	  emit_label (label);
17797 	  LABEL_NUSES (label) = 1;
17798 	}
17799     }
17800   else
17801     {
17802       rtx offset = force_reg (Pmode, const0_rtx);
17803       rtx tmp;
17804 
17805       if (max_size > 4)
17806 	{
17807 	  rtx label = ix86_expand_aligntest (count, 4, true);
17808 	  src = change_address (srcmem, SImode, srcptr);
17809 	  dest = change_address (destmem, SImode, destptr);
17810 	  emit_move_insn (dest, src);
17811 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
17812 				     true, OPTAB_LIB_WIDEN);
17813 	  if (tmp != offset)
17814 	    emit_move_insn (offset, tmp);
17815 	  emit_label (label);
17816 	  LABEL_NUSES (label) = 1;
17817 	}
17818       if (max_size > 2)
17819 	{
17820 	  rtx label = ix86_expand_aligntest (count, 2, true);
17821 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
17822 	  src = change_address (srcmem, HImode, tmp);
17823 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
17824 	  dest = change_address (destmem, HImode, tmp);
17825 	  emit_move_insn (dest, src);
17826 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
17827 				     true, OPTAB_LIB_WIDEN);
17828 	  if (tmp != offset)
17829 	    emit_move_insn (offset, tmp);
17830 	  emit_label (label);
17831 	  LABEL_NUSES (label) = 1;
17832 	}
17833       if (max_size > 1)
17834 	{
17835 	  rtx label = ix86_expand_aligntest (count, 1, true);
17836 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
17837 	  src = change_address (srcmem, QImode, tmp);
17838 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
17839 	  dest = change_address (destmem, QImode, tmp);
17840 	  emit_move_insn (dest, src);
17841 	  emit_label (label);
17842 	  LABEL_NUSES (label) = 1;
17843 	}
17844     }
17845 }
17846 
17847 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
17848 static void
17849 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
17850 				 rtx count, int max_size)
17851 {
17852   count =
17853     expand_simple_binop (counter_mode (count), AND, count,
17854 			 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
17855   expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
17856 				 gen_lowpart (QImode, value), count, QImode,
17857 				 1, max_size / 2);
17858 }
17859 
17860 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
17861 static void
17862 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
17863 {
17864   rtx dest;
17865 
17866   if (CONST_INT_P (count))
17867     {
17868       HOST_WIDE_INT countval = INTVAL (count);
17869       int offset = 0;
17870 
17871       if ((countval & 0x10) && max_size > 16)
17872 	{
17873 	  if (TARGET_64BIT)
17874 	    {
17875 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
17876 	      emit_insn (gen_strset (destptr, dest, value));
17877 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
17878 	      emit_insn (gen_strset (destptr, dest, value));
17879 	    }
17880 	  else
17881 	    gcc_unreachable ();
17882 	  offset += 16;
17883 	}
17884       if ((countval & 0x08) && max_size > 8)
17885 	{
17886 	  if (TARGET_64BIT)
17887 	    {
17888 	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
17889 	      emit_insn (gen_strset (destptr, dest, value));
17890 	    }
17891 	  else
17892 	    {
17893 	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
17894 	      emit_insn (gen_strset (destptr, dest, value));
17895 	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
17896 	      emit_insn (gen_strset (destptr, dest, value));
17897 	    }
17898 	  offset += 8;
17899 	}
17900       if ((countval & 0x04) && max_size > 4)
17901 	{
17902 	  dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
17903 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
17904 	  offset += 4;
17905 	}
17906       if ((countval & 0x02) && max_size > 2)
17907 	{
17908 	  dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
17909 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
17910 	  offset += 2;
17911 	}
17912       if ((countval & 0x01) && max_size > 1)
17913 	{
17914 	  dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
17915 	  emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
17916 	  offset += 1;
17917 	}
17918       return;
17919     }
17920   if (max_size > 32)
17921     {
17922       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
17923       return;
17924     }
17925   if (max_size > 16)
17926     {
17927       rtx label = ix86_expand_aligntest (count, 16, true);
17928       if (TARGET_64BIT)
17929 	{
17930 	  dest = change_address (destmem, DImode, destptr);
17931 	  emit_insn (gen_strset (destptr, dest, value));
17932 	  emit_insn (gen_strset (destptr, dest, value));
17933 	}
17934       else
17935 	{
17936 	  dest = change_address (destmem, SImode, destptr);
17937 	  emit_insn (gen_strset (destptr, dest, value));
17938 	  emit_insn (gen_strset (destptr, dest, value));
17939 	  emit_insn (gen_strset (destptr, dest, value));
17940 	  emit_insn (gen_strset (destptr, dest, value));
17941 	}
17942       emit_label (label);
17943       LABEL_NUSES (label) = 1;
17944     }
17945   if (max_size > 8)
17946     {
17947       rtx label = ix86_expand_aligntest (count, 8, true);
17948       if (TARGET_64BIT)
17949 	{
17950 	  dest = change_address (destmem, DImode, destptr);
17951 	  emit_insn (gen_strset (destptr, dest, value));
17952 	}
17953       else
17954 	{
17955 	  dest = change_address (destmem, SImode, destptr);
17956 	  emit_insn (gen_strset (destptr, dest, value));
17957 	  emit_insn (gen_strset (destptr, dest, value));
17958 	}
17959       emit_label (label);
17960       LABEL_NUSES (label) = 1;
17961     }
17962   if (max_size > 4)
17963     {
17964       rtx label = ix86_expand_aligntest (count, 4, true);
17965       dest = change_address (destmem, SImode, destptr);
17966       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
17967       emit_label (label);
17968       LABEL_NUSES (label) = 1;
17969     }
17970   if (max_size > 2)
17971     {
17972       rtx label = ix86_expand_aligntest (count, 2, true);
17973       dest = change_address (destmem, HImode, destptr);
17974       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
17975       emit_label (label);
17976       LABEL_NUSES (label) = 1;
17977     }
17978   if (max_size > 1)
17979     {
17980       rtx label = ix86_expand_aligntest (count, 1, true);
17981       dest = change_address (destmem, QImode, destptr);
17982       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
17983       emit_label (label);
17984       LABEL_NUSES (label) = 1;
17985     }
17986 }
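/* Illustrative sketch (not part of GCC): for a compile-time constant COUNT the
   epilogue above tests each bit of the residual count and emits one store per
   set bit, largest chunk first, so e.g. a 13-byte tail becomes an 8-byte, a
   4-byte and a 1-byte store.  A host-side model of that decomposition,
   assuming max_size == 32 on a 64-bit target; the helper name and the plain
   byte loops are purely hypothetical.  */
#if 0
static void
model_setmem_epilogue (unsigned char *dst, unsigned char value, int countval)
{
  int offset = 0;
  int chunk;

  for (chunk = 16; chunk >= 1; chunk /= 2)
    if (countval & chunk)
      {
	int i;
	for (i = 0; i < chunk; i++)
	  dst[offset + i] = value;
	offset += chunk;
      }
}
#endif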
17987 
17988 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned by
17989    ALIGN, to DESIRED_ALIGNMENT.  */
17990 static void
17991 expand_movmem_prologue (rtx destmem, rtx srcmem,
17992 			rtx destptr, rtx srcptr, rtx count,
17993 			int align, int desired_alignment)
17994 {
17995   if (align <= 1 && desired_alignment > 1)
17996     {
17997       rtx label = ix86_expand_aligntest (destptr, 1, false);
17998       srcmem = change_address (srcmem, QImode, srcptr);
17999       destmem = change_address (destmem, QImode, destptr);
18000       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
18001       ix86_adjust_counter (count, 1);
18002       emit_label (label);
18003       LABEL_NUSES (label) = 1;
18004     }
18005   if (align <= 2 && desired_alignment > 2)
18006     {
18007       rtx label = ix86_expand_aligntest (destptr, 2, false);
18008       srcmem = change_address (srcmem, HImode, srcptr);
18009       destmem = change_address (destmem, HImode, destptr);
18010       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
18011       ix86_adjust_counter (count, 2);
18012       emit_label (label);
18013       LABEL_NUSES (label) = 1;
18014     }
18015   if (align <= 4 && desired_alignment > 4)
18016     {
18017       rtx label = ix86_expand_aligntest (destptr, 4, false);
18018       srcmem = change_address (srcmem, SImode, srcptr);
18019       destmem = change_address (destmem, SImode, destptr);
18020       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
18021       ix86_adjust_counter (count, 4);
18022       emit_label (label);
18023       LABEL_NUSES (label) = 1;
18024     }
18025   gcc_assert (desired_alignment <= 8);
18026 }
18027 
18028 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
18029    ALIGN_BYTES is how many bytes need to be copied.  */
18030 static rtx
18031 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
18032 				 int desired_align, int align_bytes)
18033 {
18034   rtx src = *srcp;
18035   rtx src_size, dst_size;
18036   int off = 0;
18037   int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
18038   if (src_align_bytes >= 0)
18039     src_align_bytes = desired_align - src_align_bytes;
18040   src_size = MEM_SIZE (src);
18041   dst_size = MEM_SIZE (dst);
18042   if (align_bytes & 1)
18043     {
18044       dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
18045       src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
18046       off = 1;
18047       emit_insn (gen_strmov (destreg, dst, srcreg, src));
18048     }
18049   if (align_bytes & 2)
18050     {
18051       dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
18052       src = adjust_automodify_address_nv (src, HImode, srcreg, off);
18053       if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
18054 	set_mem_align (dst, 2 * BITS_PER_UNIT);
18055       if (src_align_bytes >= 0
18056 	  && (src_align_bytes & 1) == (align_bytes & 1)
18057 	  && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
18058 	set_mem_align (src, 2 * BITS_PER_UNIT);
18059       off = 2;
18060       emit_insn (gen_strmov (destreg, dst, srcreg, src));
18061     }
18062   if (align_bytes & 4)
18063     {
18064       dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
18065       src = adjust_automodify_address_nv (src, SImode, srcreg, off);
18066       if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
18067 	set_mem_align (dst, 4 * BITS_PER_UNIT);
18068       if (src_align_bytes >= 0)
18069 	{
18070 	  unsigned int src_align = 0;
18071 	  if ((src_align_bytes & 3) == (align_bytes & 3))
18072 	    src_align = 4;
18073 	  else if ((src_align_bytes & 1) == (align_bytes & 1))
18074 	    src_align = 2;
18075 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
18076 	    set_mem_align (src, src_align * BITS_PER_UNIT);
18077 	}
18078       off = 4;
18079       emit_insn (gen_strmov (destreg, dst, srcreg, src));
18080     }
18081   dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
18082   src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
18083   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
18084     set_mem_align (dst, desired_align * BITS_PER_UNIT);
18085   if (src_align_bytes >= 0)
18086     {
18087       unsigned int src_align = 0;
18088       if ((src_align_bytes & 7) == (align_bytes & 7))
18089 	src_align = 8;
18090       else if ((src_align_bytes & 3) == (align_bytes & 3))
18091 	src_align = 4;
18092       else if ((src_align_bytes & 1) == (align_bytes & 1))
18093 	src_align = 2;
18094       if (src_align > (unsigned int) desired_align)
18095 	src_align = desired_align;
18096       if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
18097 	set_mem_align (src, src_align * BITS_PER_UNIT);
18098     }
18099   if (dst_size)
18100     set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes));
18101   if (src_size)
18102     set_mem_size (src, GEN_INT (INTVAL (src_size) - align_bytes));
18103   *srcp = src;
18104   return dst;
18105 }
18106 
18107 /* Store enough bytes to DEST to align DEST, known to be aligned by ALIGN,
18108    to DESIRED_ALIGNMENT.  */
18109 static void
18110 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
18111 			int align, int desired_alignment)
18112 {
18113   if (align <= 1 && desired_alignment > 1)
18114     {
18115       rtx label = ix86_expand_aligntest (destptr, 1, false);
18116       destmem = change_address (destmem, QImode, destptr);
18117       emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
18118       ix86_adjust_counter (count, 1);
18119       emit_label (label);
18120       LABEL_NUSES (label) = 1;
18121     }
18122   if (align <= 2 && desired_alignment > 2)
18123     {
18124       rtx label = ix86_expand_aligntest (destptr, 2, false);
18125       destmem = change_address (destmem, HImode, destptr);
18126       emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
18127       ix86_adjust_counter (count, 2);
18128       emit_label (label);
18129       LABEL_NUSES (label) = 1;
18130     }
18131   if (align <= 4 && desired_alignment > 4)
18132     {
18133       rtx label = ix86_expand_aligntest (destptr, 4, false);
18134       destmem = change_address (destmem, SImode, destptr);
18135       emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
18136       ix86_adjust_counter (count, 4);
18137       emit_label (label);
18138       LABEL_NUSES (label) = 1;
18139     }
18140   gcc_assert (desired_alignment <= 8);
18141 }
18142 
18143 /* Store enough bytes to DST to align DST, known to be aligned by ALIGN, to
18144    DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored.  */
18145 static rtx
18146 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
18147 				 int desired_align, int align_bytes)
18148 {
18149   int off = 0;
18150   rtx dst_size = MEM_SIZE (dst);
18151   if (align_bytes & 1)
18152     {
18153       dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
18154       off = 1;
18155       emit_insn (gen_strset (destreg, dst,
18156 			     gen_lowpart (QImode, value)));
18157     }
18158   if (align_bytes & 2)
18159     {
18160       dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
18161       if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
18162 	set_mem_align (dst, 2 * BITS_PER_UNIT);
18163       off = 2;
18164       emit_insn (gen_strset (destreg, dst,
18165 			     gen_lowpart (HImode, value)));
18166     }
18167   if (align_bytes & 4)
18168     {
18169       dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
18170       if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
18171 	set_mem_align (dst, 4 * BITS_PER_UNIT);
18172       off = 4;
18173       emit_insn (gen_strset (destreg, dst,
18174 			     gen_lowpart (SImode, value)));
18175     }
18176   dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
18177   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
18178     set_mem_align (dst, desired_align * BITS_PER_UNIT);
18179   if (dst_size)
18180     set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes));
18181   return dst;
18182 }
18183 
18184 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
18185 static enum stringop_alg
18186 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
18187 	    int *dynamic_check)
18188 {
18189   const struct stringop_algs * algs;
18190   bool optimize_for_speed;
18191   /* Algorithms using the rep prefix want at least edi and ecx;
18192      additionally, memset wants eax and memcpy wants esi.  Don't
18193      consider such algorithms if the user has appropriated those
18194      registers for their own purposes.	*/
18195   bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
18196                              || (memset
18197 				 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
18198 
18199 #define ALG_USABLE_P(alg) (rep_prefix_usable			\
18200 			   || (alg != rep_prefix_1_byte		\
18201 			       && alg != rep_prefix_4_byte      \
18202 			       && alg != rep_prefix_8_byte))
18203   const struct processor_costs *cost;
18204 
18205   /* Even if the string operation call is cold, we still might spend a lot
18206      of time processing large blocks.  */
18207   if (optimize_function_for_size_p (cfun)
18208       || (optimize_insn_for_size_p ()
18209           && expected_size != -1 && expected_size < 256))
18210     optimize_for_speed = false;
18211   else
18212     optimize_for_speed = true;
18213 
18214   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
18215 
18216   *dynamic_check = -1;
18217   if (memset)
18218     algs = &cost->memset[TARGET_64BIT != 0];
18219   else
18220     algs = &cost->memcpy[TARGET_64BIT != 0];
18221   if (stringop_alg != no_stringop && ALG_USABLE_P (stringop_alg))
18222     return stringop_alg;
18223   /* rep; movq or rep; movl is the smallest variant.  */
18224   else if (!optimize_for_speed)
18225     {
18226       if (!count || (count & 3))
18227 	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
18228       else
18229 	return rep_prefix_usable ? rep_prefix_4_byte : loop;
18230     }
18231   /* Very tiny blocks are best handled via the loop; REP is expensive to
18232      set up.  */
18233   else if (expected_size != -1 && expected_size < 4)
18234     return loop_1_byte;
18235   else if (expected_size != -1)
18236     {
18237       unsigned int i;
18238       enum stringop_alg alg = libcall;
18239       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
18240 	{
18241 	  /* We get here if the algorithms that were not libcall-based
18242 	     were rep-prefix based and we are unable to use rep prefixes
18243 	     based on global register usage.  Break out of the loop and
18244 	     use the heuristic below.  */
18245 	  if (algs->size[i].max == 0)
18246 	    break;
18247 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
18248 	    {
18249 	      enum stringop_alg candidate = algs->size[i].alg;
18250 
18251 	      if (candidate != libcall && ALG_USABLE_P (candidate))
18252 		alg = candidate;
18253 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
18254 		 last non-libcall inline algorithm.  */
18255 	      if (TARGET_INLINE_ALL_STRINGOPS)
18256 		{
18257 		  /* When the current size is best copied by a libcall, but we
18258 		     are still forced to inline, run the heuristic below that
18259 		     will pick code for medium-sized blocks.  */
18260 		  if (alg != libcall)
18261 		    return alg;
18262 		  break;
18263 		}
18264 	      else if (ALG_USABLE_P (candidate))
18265 		return candidate;
18266 	    }
18267 	}
18268       gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
18269     }
18270   /* When asked to inline the call anyway, try to pick a meaningful choice.
18271      We look for the maximal size of a block that is faster to copy by hand
18272      and take blocks of at most that size, guessing that the average size
18273      will be roughly half of the block.
18274 
18275      If this turns out to be bad, we might simply specify the preferred
18276      choice in ix86_costs.  */
18277   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
18278       && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
18279     {
18280       int max = -1;
18281       enum stringop_alg alg;
18282       int i;
18283       bool any_alg_usable_p = true;
18284 
18285       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
18286         {
18287           enum stringop_alg candidate = algs->size[i].alg;
18288           any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
18289 
18290           if (candidate != libcall && candidate
18291               && ALG_USABLE_P (candidate))
18292               max = algs->size[i].max;
18293         }
18294       /* If there aren't any usable algorithms, then recursing on
18295          smaller sizes isn't going to find anything.  Just return the
18296          simple byte-at-a-time copy loop.  */
18297       if (!any_alg_usable_p)
18298         {
18299           /* Pick something reasonable.  */
18300           if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
18301             *dynamic_check = 128;
18302           return loop_1_byte;
18303         }
18304       if (max == -1)
18305 	max = 4096;
18306       alg = decide_alg (count, max / 2, memset, dynamic_check);
18307       gcc_assert (*dynamic_check == -1);
18308       gcc_assert (alg != libcall);
18309       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
18310 	*dynamic_check = max;
18311       return alg;
18312     }
18313   return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
18314 #undef ALG_USABLE_P
18315 }
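/* Illustrative sketch (not part of GCC): the per-CPU stringop tables consulted
   above are arrays of (max, alg) pairs, and the first entry whose MAX covers
   the expected size wins.  A hypothetical table lookup modelled on host types;
   the struct and function names are not real GCC identifiers.  */
#if 0
struct model_entry { int max; int alg; };

static int
model_decide (const struct model_entry *table, int n, int expected_size)
{
  int i;

  for (i = 0; i < n; i++)
    if (table[i].max == -1 || table[i].max >= expected_size)
      return table[i].alg;	/* first entry large enough wins */
  return -1;			/* fall back to a library call */
}
#endif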
18316 
18317 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
18318    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
18319 static int
18320 decide_alignment (int align,
18321 		  enum stringop_alg alg,
18322 		  int expected_size)
18323 {
18324   int desired_align = 0;
18325   switch (alg)
18326     {
18327       case no_stringop:
18328 	gcc_unreachable ();
18329       case loop:
18330       case unrolled_loop:
18331 	desired_align = GET_MODE_SIZE (Pmode);
18332 	break;
18333       case rep_prefix_8_byte:
18334 	desired_align = 8;
18335 	break;
18336       case rep_prefix_4_byte:
18337 	/* PentiumPro has special logic triggering for 8-byte aligned blocks,
18338 	   copying a whole cache line at once.  */
18339 	if (TARGET_PENTIUMPRO)
18340 	  desired_align = 8;
18341 	else
18342 	  desired_align = 4;
18343 	break;
18344       case rep_prefix_1_byte:
18345 	/* PentiumPro has special logic triggering for 8-byte aligned blocks,
18346 	   copying a whole cache line at once.  */
18347 	if (TARGET_PENTIUMPRO)
18348 	  desired_align = 8;
18349 	else
18350 	  desired_align = 1;
18351 	break;
18352       case loop_1_byte:
18353 	desired_align = 1;
18354 	break;
18355       case libcall:
18356 	return 0;
18357     }
18358 
18359   if (optimize_size)
18360     desired_align = 1;
18361   if (desired_align < align)
18362     desired_align = align;
18363   if (expected_size != -1 && expected_size < 4)
18364     desired_align = align;
18365   return desired_align;
18366 }
18367 
18368 /* Return the smallest power of 2 greater than VAL.  */
18369 static int
18370 smallest_pow2_greater_than (int val)
18371 {
18372   int ret = 1;
18373   while (ret <= val)
18374     ret <<= 1;
18375   return ret;
18376 }
18377 
18378 /* Expand string move (memcpy) operation.  Use i386 string operations when
18379    profitable.  expand_setmem contains similar code.  The code depends upon
18380    architecture, block size and alignment, but always has the same
18381    overall structure:
18382 
18383    1) Prologue guard: Conditional that jumps to the epilogue for small
18384       blocks that can be handled by the epilogue alone.  This is faster but
18385       also needed for correctness, since the prologue assumes the block is
18386       larger than the desired alignment.
18387 
18388       An optional dynamic check for size and a libcall for large
18389       blocks are emitted here too, with -minline-stringops-dynamically.
18390 
18391    2) Prologue: copy first few bytes in order to get destination aligned
18392       to DESIRED_ALIGN.  It is emitted only when ALIGN is less than
18393       DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be copied.
18394       We emit either a jump tree on power of two sized blocks, or a byte loop.
18395 
18396    3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
18397       with specified algorithm.
18398 
18399    4) Epilogue: code copying tail of the block that is too small to be
18400       handled by main body (or up to size guarded by prologue guard).  */
18401 
18402 int
18403 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
18404 		    rtx expected_align_exp, rtx expected_size_exp)
18405 {
18406   rtx destreg;
18407   rtx srcreg;
18408   rtx label = NULL;
18409   rtx tmp;
18410   rtx jump_around_label = NULL;
18411   HOST_WIDE_INT align = 1;
18412   unsigned HOST_WIDE_INT count = 0;
18413   HOST_WIDE_INT expected_size = -1;
18414   int size_needed = 0, epilogue_size_needed;
18415   int desired_align = 0, align_bytes = 0;
18416   enum stringop_alg alg;
18417   int dynamic_check;
18418   bool need_zero_guard = false;
18419 
18420   if (CONST_INT_P (align_exp))
18421     align = INTVAL (align_exp);
18422   /* i386 can do misaligned access at a reasonably increased cost.  */
18423   if (CONST_INT_P (expected_align_exp)
18424       && INTVAL (expected_align_exp) > align)
18425     align = INTVAL (expected_align_exp);
18426   /* ALIGN is the minimum of destination and source alignment, but we care here
18427      just about destination alignment.  */
18428   else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
18429     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
18430 
18431   if (CONST_INT_P (count_exp))
18432     count = expected_size = INTVAL (count_exp);
18433   if (CONST_INT_P (expected_size_exp) && count == 0)
18434     expected_size = INTVAL (expected_size_exp);
18435 
18436   /* Make sure we don't need to care about overflow later on.  */
18437   if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
18438     return 0;
18439 
18440   /* Step 0: Decide on preferred algorithm, desired alignment and
18441      size of chunks to be copied by main loop.  */
18442 
18443   alg = decide_alg (count, expected_size, false, &dynamic_check);
18444   desired_align = decide_alignment (align, alg, expected_size);
18445 
18446   if (!TARGET_ALIGN_STRINGOPS)
18447     align = desired_align;
18448 
18449   if (alg == libcall)
18450     return 0;
18451   gcc_assert (alg != no_stringop);
18452   if (!count)
18453     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
18454   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18455   srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
18456   switch (alg)
18457     {
18458     case libcall:
18459     case no_stringop:
18460       gcc_unreachable ();
18461     case loop:
18462       need_zero_guard = true;
18463       size_needed = GET_MODE_SIZE (Pmode);
18464       break;
18465     case unrolled_loop:
18466       need_zero_guard = true;
18467       size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
18468       break;
18469     case rep_prefix_8_byte:
18470       size_needed = 8;
18471       break;
18472     case rep_prefix_4_byte:
18473       size_needed = 4;
18474       break;
18475     case rep_prefix_1_byte:
18476       size_needed = 1;
18477       break;
18478     case loop_1_byte:
18479       need_zero_guard = true;
18480       size_needed = 1;
18481       break;
18482     }
18483 
18484   epilogue_size_needed = size_needed;
18485 
18486   /* Step 1: Prologue guard.  */
18487 
18488   /* Alignment code needs count to be in register.  */
18489   if (CONST_INT_P (count_exp) && desired_align > align)
18490     {
18491       if (INTVAL (count_exp) > desired_align
18492 	  && INTVAL (count_exp) > size_needed)
18493 	{
18494 	  align_bytes
18495 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
18496 	  if (align_bytes <= 0)
18497 	    align_bytes = 0;
18498 	  else
18499 	    align_bytes = desired_align - align_bytes;
18500 	}
18501       if (align_bytes == 0)
18502 	count_exp = force_reg (counter_mode (count_exp), count_exp);
18503     }
18504   gcc_assert (desired_align >= 1 && align >= 1);
18505 
18506   /* Ensure that alignment prologue won't copy past end of block.  */
18507   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
18508     {
18509       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
18510       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
18511 	 Make sure it is a power of 2.  */
18512       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
18513 
18514       if (count)
18515 	{
18516 	  if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
18517 	    {
18518 	      /* If main algorithm works on QImode, no epilogue is needed.
18519 		 For small sizes just don't align anything.  */
18520 	      if (size_needed == 1)
18521 		desired_align = align;
18522 	      else
18523 		goto epilogue;
18524 	    }
18525 	}
18526       else
18527 	{
18528 	  label = gen_label_rtx ();
18529 	  emit_cmp_and_jump_insns (count_exp,
18530 				   GEN_INT (epilogue_size_needed),
18531 				   LTU, 0, counter_mode (count_exp), 1, label);
18532 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
18533 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
18534 	  else
18535 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
18536 	}
18537     }
18538 
18539   /* Emit code to decide at runtime whether a library call or inline code
18540      should be used.  */
18541   if (dynamic_check != -1)
18542     {
18543       if (CONST_INT_P (count_exp))
18544 	{
18545 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
18546 	    {
18547 	      emit_block_move_via_libcall (dst, src, count_exp, false);
18548 	      count_exp = const0_rtx;
18549 	      goto epilogue;
18550 	    }
18551 	}
18552       else
18553 	{
18554 	  rtx hot_label = gen_label_rtx ();
18555 	  jump_around_label = gen_label_rtx ();
18556 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
18557 				   LEU, 0, GET_MODE (count_exp), 1, hot_label);
18558 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
18559 	  emit_block_move_via_libcall (dst, src, count_exp, false);
18560 	  emit_jump (jump_around_label);
18561 	  emit_label (hot_label);
18562 	}
18563     }
18564 
18565   /* Step 2: Alignment prologue.  */
18566 
18567   if (desired_align > align)
18568     {
18569       if (align_bytes == 0)
18570 	{
18571 	  /* Except for the first move in the epilogue, we no longer know
18572 	     the constant offset in the aliasing info.  It doesn't seem worth
18573 	     the pain to maintain it for the first move, so throw away
18574 	     the info early.  */
18575 	  src = change_address (src, BLKmode, srcreg);
18576 	  dst = change_address (dst, BLKmode, destreg);
18577 	  expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
18578 				  desired_align);
18579 	}
18580       else
18581 	{
18582 	  /* If we know how many bytes need to be stored before dst is
18583 	     sufficiently aligned, maintain aliasing info accurately.  */
18584 	  dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
18585 						 desired_align, align_bytes);
18586 	  count_exp = plus_constant (count_exp, -align_bytes);
18587 	  count -= align_bytes;
18588 	}
18589       if (need_zero_guard
18590 	  && (count < (unsigned HOST_WIDE_INT) size_needed
18591 	      || (align_bytes == 0
18592 		  && count < ((unsigned HOST_WIDE_INT) size_needed
18593 			      + desired_align - align))))
18594 	{
18595 	  /* It is possible that we copied enough so the main loop will not
18596 	     execute.  */
18597 	  gcc_assert (size_needed > 1);
18598 	  if (label == NULL_RTX)
18599 	    label = gen_label_rtx ();
18600 	  emit_cmp_and_jump_insns (count_exp,
18601 				   GEN_INT (size_needed),
18602 				   LTU, 0, counter_mode (count_exp), 1, label);
18603 	  if (expected_size == -1
18604 	      || expected_size < (desired_align - align) / 2 + size_needed)
18605 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
18606 	  else
18607 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
18608 	}
18609     }
18610   if (label && size_needed == 1)
18611     {
18612       emit_label (label);
18613       LABEL_NUSES (label) = 1;
18614       label = NULL;
18615       epilogue_size_needed = 1;
18616     }
18617   else if (label == NULL_RTX)
18618     epilogue_size_needed = size_needed;
18619 
18620   /* Step 3: Main loop.  */
18621 
18622   switch (alg)
18623     {
18624     case libcall:
18625     case no_stringop:
18626       gcc_unreachable ();
18627     case loop_1_byte:
18628       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
18629 				     count_exp, QImode, 1, expected_size);
18630       break;
18631     case loop:
18632       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
18633 				     count_exp, Pmode, 1, expected_size);
18634       break;
18635     case unrolled_loop:
18636       /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
18637 	 registers for 4 temporaries anyway.  */
18638       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
18639 				     count_exp, Pmode, TARGET_64BIT ? 4 : 2,
18640 				     expected_size);
18641       break;
18642     case rep_prefix_8_byte:
18643       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
18644 				 DImode);
18645       break;
18646     case rep_prefix_4_byte:
18647       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
18648 				 SImode);
18649       break;
18650     case rep_prefix_1_byte:
18651       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
18652 				 QImode);
18653       break;
18654     }
18655   /* Properly adjust the offsets of the src and dest memory for aliasing.  */
18656   if (CONST_INT_P (count_exp))
18657     {
18658       src = adjust_automodify_address_nv (src, BLKmode, srcreg,
18659 					  (count / size_needed) * size_needed);
18660       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
18661 					  (count / size_needed) * size_needed);
18662     }
18663   else
18664     {
18665       src = change_address (src, BLKmode, srcreg);
18666       dst = change_address (dst, BLKmode, destreg);
18667     }
18668 
18669   /* Step 4: Epilogue to copy the remaining bytes.  */
18670  epilogue:
18671   if (label)
18672     {
18673       /* When the main loop is done, COUNT_EXP might hold the original count,
18674  	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
18675 	 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
18676 	 bytes.  Compensate if needed.  */
18677 
18678       if (size_needed < epilogue_size_needed)
18679 	{
18680 	  tmp =
18681 	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
18682 				 GEN_INT (size_needed - 1), count_exp, 1,
18683 				 OPTAB_DIRECT);
18684 	  if (tmp != count_exp)
18685 	    emit_move_insn (count_exp, tmp);
18686 	}
18687       emit_label (label);
18688       LABEL_NUSES (label) = 1;
18689     }
18690 
18691   if (count_exp != const0_rtx && epilogue_size_needed > 1)
18692     expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
18693 			    epilogue_size_needed);
18694   if (jump_around_label)
18695     emit_label (jump_around_label);
18696   return 1;
18697 }
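/* Illustrative sketch (not part of GCC): a host-side C model of the control
   flow emitted by ix86_expand_movmem above for a variable COUNT, showing the
   prologue guard (step 1), the main body copying SIZE_NEEDED-byte chunks
   (step 3) and the epilogue (step 4).  The alignment prologue and the dynamic
   libcall check are omitted, and the helper name is purely hypothetical.  */
#if 0
static void
model_movmem (unsigned char *dst, const unsigned char *src,
	      unsigned long count, unsigned long size_needed)
{
  unsigned long i = 0;

  /* Step 1: prologue guard -- blocks smaller than one chunk go straight to
     the epilogue.  */
  if (count >= size_needed)
    {
      /* Step 3: main body, copying whole SIZE_NEEDED chunks.  */
      for (; i + size_needed <= count; i += size_needed)
	{
	  unsigned long j;
	  for (j = 0; j < size_needed; j++)
	    dst[i + j] = src[i + j];
	}
    }
  /* Step 4: epilogue copies the remaining count % size_needed bytes.  */
  for (; i < count; i++)
    dst[i] = src[i];
}
#endif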
18698 
18699 /* Helper function for memset.  For the QImode value 0xXY produce
18700    0xXYXYXYXY of the width specified by MODE.  This is essentially
18701    VAL * 0x01010101, but we can do slightly better than
18702    synth_mult by unwinding the sequence by hand on CPUs with
18703    a slow multiply.  */
18704 static rtx
18705 promote_duplicated_reg (enum machine_mode mode, rtx val)
18706 {
18707   enum machine_mode valmode = GET_MODE (val);
18708   rtx tmp;
18709   int nops = mode == DImode ? 3 : 2;
18710 
18711   gcc_assert (mode == SImode || mode == DImode);
18712   if (val == const0_rtx)
18713     return copy_to_mode_reg (mode, const0_rtx);
18714   if (CONST_INT_P (val))
18715     {
18716       HOST_WIDE_INT v = INTVAL (val) & 255;
18717 
18718       v |= v << 8;
18719       v |= v << 16;
18720       if (mode == DImode)
18721         v |= (v << 16) << 16;
18722       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
18723     }
18724 
18725   if (valmode == VOIDmode)
18726     valmode = QImode;
18727   if (valmode != QImode)
18728     val = gen_lowpart (QImode, val);
18729   if (mode == QImode)
18730     return val;
18731   if (!TARGET_PARTIAL_REG_STALL)
18732     nops--;
18733   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
18734       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
18735       <= (ix86_cost->shift_const + ix86_cost->add) * nops
18736           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
18737     {
18738       rtx reg = convert_modes (mode, QImode, val, true);
18739       tmp = promote_duplicated_reg (mode, const1_rtx);
18740       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
18741 				  OPTAB_DIRECT);
18742     }
18743   else
18744     {
18745       rtx reg = convert_modes (mode, QImode, val, true);
18746 
18747       if (!TARGET_PARTIAL_REG_STALL)
18748 	if (mode == SImode)
18749 	  emit_insn (gen_movsi_insv_1 (reg, reg));
18750 	else
18751 	  emit_insn (gen_movdi_insv_1_rex64 (reg, reg));
18752       else
18753 	{
18754 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
18755 				     NULL, 1, OPTAB_DIRECT);
18756 	  reg =
18757 	    expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
18758 	}
18759       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
18760 			         NULL, 1, OPTAB_DIRECT);
18761       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
18762       if (mode == SImode)
18763 	return reg;
18764       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
18765 				 NULL, 1, OPTAB_DIRECT);
18766       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
18767       return reg;
18768     }
18769 }
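/* Illustrative sketch (not part of GCC): the byte duplication performed above,
   modelled on host integers.  For val == 0xAB the SImode result is 0xABABABAB;
   for DImode the extra 32-bit shift/or gives 0xABABABABABABABAB.  The helper
   name is hypothetical.  */
#if 0
static unsigned long long
model_promote_duplicated (unsigned char val, int bytes /* 4 or 8 */)
{
  unsigned long long v = val;

  v |= v << 8;			/* 0x000000000000ABAB */
  v |= v << 16;			/* 0x00000000ABABABAB */
  if (bytes == 8)
    v |= v << 32;		/* 0xABABABABABABABAB */
  return v;
}
#endif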
18770 
18771 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
18772    needed by the main loop copying SIZE_NEEDED chunks and by the prologue
18773    getting the alignment from ALIGN to DESIRED_ALIGN.  */
18774 static rtx
18775 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
18776 {
18777   rtx promoted_val;
18778 
18779   if (TARGET_64BIT
18780       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
18781     promoted_val = promote_duplicated_reg (DImode, val);
18782   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
18783     promoted_val = promote_duplicated_reg (SImode, val);
18784   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
18785     promoted_val = promote_duplicated_reg (HImode, val);
18786   else
18787     promoted_val = val;
18788 
18789   return promoted_val;
18790 }
18791 
18792 /* Expand string clear operation (bzero).  Use i386 string operations when
18793    profitable.  See expand_movmem comment for explanation of individual
18794    steps performed.  */
18795 int
18796 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
18797 		    rtx expected_align_exp, rtx expected_size_exp)
18798 {
18799   rtx destreg;
18800   rtx label = NULL;
18801   rtx tmp;
18802   rtx jump_around_label = NULL;
18803   HOST_WIDE_INT align = 1;
18804   unsigned HOST_WIDE_INT count = 0;
18805   HOST_WIDE_INT expected_size = -1;
18806   int size_needed = 0, epilogue_size_needed;
18807   int desired_align = 0, align_bytes = 0;
18808   enum stringop_alg alg;
18809   rtx promoted_val = NULL;
18810   bool force_loopy_epilogue = false;
18811   int dynamic_check;
18812   bool need_zero_guard = false;
18813 
18814   if (CONST_INT_P (align_exp))
18815     align = INTVAL (align_exp);
18816   /* i386 can do misaligned access at a reasonably increased cost.  */
18817   if (CONST_INT_P (expected_align_exp)
18818       && INTVAL (expected_align_exp) > align)
18819     align = INTVAL (expected_align_exp);
18820   if (CONST_INT_P (count_exp))
18821     count = expected_size = INTVAL (count_exp);
18822   if (CONST_INT_P (expected_size_exp) && count == 0)
18823     expected_size = INTVAL (expected_size_exp);
18824 
18825   /* Make sure we don't need to care about overflow later on.  */
18826   if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
18827     return 0;
18828 
18829   /* Step 0: Decide on preferred algorithm, desired alignment and
18830      size of chunks to be copied by main loop.  */
18831 
18832   alg = decide_alg (count, expected_size, true, &dynamic_check);
18833   desired_align = decide_alignment (align, alg, expected_size);
18834 
18835   if (!TARGET_ALIGN_STRINGOPS)
18836     align = desired_align;
18837 
18838   if (alg == libcall)
18839     return 0;
18840   gcc_assert (alg != no_stringop);
18841   if (!count)
18842     count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
18843   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18844   switch (alg)
18845     {
18846     case libcall:
18847     case no_stringop:
18848       gcc_unreachable ();
18849     case loop:
18850       need_zero_guard = true;
18851       size_needed = GET_MODE_SIZE (Pmode);
18852       break;
18853     case unrolled_loop:
18854       need_zero_guard = true;
18855       size_needed = GET_MODE_SIZE (Pmode) * 4;
18856       break;
18857     case rep_prefix_8_byte:
18858       size_needed = 8;
18859       break;
18860     case rep_prefix_4_byte:
18861       size_needed = 4;
18862       break;
18863     case rep_prefix_1_byte:
18864       size_needed = 1;
18865       break;
18866     case loop_1_byte:
18867       need_zero_guard = true;
18868       size_needed = 1;
18869       break;
18870     }
18871   epilogue_size_needed = size_needed;
18872 
18873   /* Step 1: Prologue guard.  */
18874 
18875   /* Alignment code needs count to be in register.  */
18876   if (CONST_INT_P (count_exp) && desired_align > align)
18877     {
18878       if (INTVAL (count_exp) > desired_align
18879 	  && INTVAL (count_exp) > size_needed)
18880 	{
18881 	  align_bytes
18882 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
18883 	  if (align_bytes <= 0)
18884 	    align_bytes = 0;
18885 	  else
18886 	    align_bytes = desired_align - align_bytes;
18887 	}
18888       if (align_bytes == 0)
18889 	{
18890 	  enum machine_mode mode = SImode;
18891 	  if (TARGET_64BIT && (count & ~0xffffffff))
18892 	    mode = DImode;
18893 	  count_exp = force_reg (mode, count_exp);
18894 	}
18895     }
18896   /* Do the cheap promotion to allow better CSE across the
18897      main loop and epilogue (i.e. one load of the big constant in
18898      front of all the code).  */
18899   if (CONST_INT_P (val_exp))
18900     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
18901 						   desired_align, align);
18902   /* Ensure that alignment prologue won't copy past end of block.  */
18903   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
18904     {
18905       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
18906       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
18907 	 Make sure it is a power of 2.  */
18908       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
18909 
18910       /* To improve performance on small blocks, we jump around the VAL
18911 	 promoting code.  This means that if the promoted VAL is not constant,
18912 	 we might not use it in the epilogue and have to use the byte
18913 	 loop variant.  */
18914       if (epilogue_size_needed > 2 && !promoted_val)
18915         force_loopy_epilogue = true;
18916       if (count)
18917 	{
18918 	  if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
18919 	    {
18920 	      /* If main algorithm works on QImode, no epilogue is needed.
18921 		 For small sizes just don't align anything.  */
18922 	      if (size_needed == 1)
18923 		desired_align = align;
18924 	      else
18925 		goto epilogue;
18926 	    }
18927 	}
18928       else
18929 	{
18930 	  label = gen_label_rtx ();
18931 	  emit_cmp_and_jump_insns (count_exp,
18932 				   GEN_INT (epilogue_size_needed),
18933 				   LTU, 0, counter_mode (count_exp), 1, label);
18934 	  if (expected_size == -1 || expected_size <= epilogue_size_needed)
18935 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
18936 	  else
18937 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
18938 	}
18939     }
18940   if (dynamic_check != -1)
18941     {
18942       rtx hot_label = gen_label_rtx ();
18943       jump_around_label = gen_label_rtx ();
18944       emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
18945 			       LEU, 0, counter_mode (count_exp), 1, hot_label);
18946       predict_jump (REG_BR_PROB_BASE * 90 / 100);
18947       set_storage_via_libcall (dst, count_exp, val_exp, false);
18948       emit_jump (jump_around_label);
18949       emit_label (hot_label);
18950     }
18951 
18952   /* Step 2: Alignment prologue.  */
18953 
18954   /* Do the expensive promotion once we branched off the small blocks.  */
18955   if (!promoted_val)
18956     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
18957 						   desired_align, align);
18958   gcc_assert (desired_align >= 1 && align >= 1);
18959 
18960   if (desired_align > align)
18961     {
18962       if (align_bytes == 0)
18963 	{
18964 	  /* Except for the first move in the epilogue, we no longer know
18965 	     the constant offset in the aliasing info.  It doesn't seem worth
18966 	     the pain to maintain it for the first move, so throw away
18967 	     the info early.  */
18968 	  dst = change_address (dst, BLKmode, destreg);
18969 	  expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
18970 				  desired_align);
18971 	}
18972       else
18973 	{
18974 	  /* If we know how many bytes need to be stored before dst is
18975 	     sufficiently aligned, maintain aliasing info accurately.  */
18976 	  dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
18977 						 desired_align, align_bytes);
18978 	  count_exp = plus_constant (count_exp, -align_bytes);
18979 	  count -= align_bytes;
18980 	}
18981       if (need_zero_guard
18982 	  && (count < (unsigned HOST_WIDE_INT) size_needed
18983 	      || (align_bytes == 0
18984 		  && count < ((unsigned HOST_WIDE_INT) size_needed
18985 			      + desired_align - align))))
18986 	{
18987 	  /* It is possible that we copied enough so the main loop will not
18988 	     execute.  */
18989 	  gcc_assert (size_needed > 1);
18990 	  if (label == NULL_RTX)
18991 	    label = gen_label_rtx ();
18992 	  emit_cmp_and_jump_insns (count_exp,
18993 				   GEN_INT (size_needed),
18994 				   LTU, 0, counter_mode (count_exp), 1, label);
18995 	  if (expected_size == -1
18996 	      || expected_size < (desired_align - align) / 2 + size_needed)
18997 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
18998 	  else
18999 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
19000 	}
19001     }
19002   if (label && size_needed == 1)
19003     {
19004       emit_label (label);
19005       LABEL_NUSES (label) = 1;
19006       label = NULL;
19007       promoted_val = val_exp;
19008       epilogue_size_needed = 1;
19009     }
19010   else if (label == NULL_RTX)
19011     epilogue_size_needed = size_needed;
19012 
19013   /* Step 3: Main loop.  */
19014 
19015   switch (alg)
19016     {
19017     case libcall:
19018     case no_stringop:
19019       gcc_unreachable ();
19020     case loop_1_byte:
19021       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
19022 				     count_exp, QImode, 1, expected_size);
19023       break;
19024     case loop:
19025       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
19026 				     count_exp, Pmode, 1, expected_size);
19027       break;
19028     case unrolled_loop:
19029       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
19030 				     count_exp, Pmode, 4, expected_size);
19031       break;
19032     case rep_prefix_8_byte:
19033       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
19034 				  DImode, val_exp);
19035       break;
19036     case rep_prefix_4_byte:
19037       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
19038 				  SImode, val_exp);
19039       break;
19040     case rep_prefix_1_byte:
19041       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
19042 				  QImode, val_exp);
19043       break;
19044     }
19045   /* Properly adjust the offset of the dest memory for aliasing.  */
19046   if (CONST_INT_P (count_exp))
19047     dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
19048 					(count / size_needed) * size_needed);
19049   else
19050     dst = change_address (dst, BLKmode, destreg);
19051 
19052   /* Step 4: Epilogue to copy the remaining bytes.  */
19053 
19054   if (label)
19055     {
19056       /* When the main loop is done, COUNT_EXP might hold the original count,
19057  	 while we want to set only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
19058 	 Epilogue code will actually set COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
19059 	 bytes.  Compensate if needed.  */
19060 
19061       if (size_needed < epilogue_size_needed)
19062 	{
19063 	  tmp =
19064 	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
19065 				 GEN_INT (size_needed - 1), count_exp, 1,
19066 				 OPTAB_DIRECT);
19067 	  if (tmp != count_exp)
19068 	    emit_move_insn (count_exp, tmp);
19069 	}
19070       emit_label (label);
19071       LABEL_NUSES (label) = 1;
19072     }
19073  epilogue:
19074   if (count_exp != const0_rtx && epilogue_size_needed > 1)
19075     {
19076       if (force_loopy_epilogue)
19077 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
19078 					 epilogue_size_needed);
19079       else
19080 	expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
19081 				epilogue_size_needed);
19082     }
19083   if (jump_around_label)
19084     emit_label (jump_around_label);
19085   return 1;
19086 }
19087 
19088 /* Expand the appropriate insns for doing strlen if not just doing
19089    repnz; scasb
19090 
19091    out = result, initialized with the start address
19092    align_rtx = alignment of the address.
19093    scratch = scratch register, initialized with the start address when
19094 	not aligned, otherwise undefined
19095 
19096    This is just the body. It needs the initializations mentioned above and
19097    some address computing at the end.  These things are done in i386.md.  */
19098 
19099 static void
19100 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
19101 {
19102   int align;
19103   rtx tmp;
19104   rtx align_2_label = NULL_RTX;
19105   rtx align_3_label = NULL_RTX;
19106   rtx align_4_label = gen_label_rtx ();
19107   rtx end_0_label = gen_label_rtx ();
19108   rtx mem;
19109   rtx tmpreg = gen_reg_rtx (SImode);
19110   rtx scratch = gen_reg_rtx (SImode);
19111   rtx cmp;
19112 
19113   align = 0;
19114   if (CONST_INT_P (align_rtx))
19115     align = INTVAL (align_rtx);
19116 
19117   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
19118 
19119   /* Is there a known alignment and is it less than 4?  */
19120   if (align < 4)
19121     {
19122       rtx scratch1 = gen_reg_rtx (Pmode);
19123       emit_move_insn (scratch1, out);
19124       /* Is there a known alignment and is it not 2? */
19125       if (align != 2)
19126 	{
19127 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
19128 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
19129 
19130 	  /* Leave just the 3 lower bits.  */
19131 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
19132 				    NULL_RTX, 0, OPTAB_WIDEN);
19133 
19134 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
19135 				   Pmode, 1, align_4_label);
19136 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
19137 				   Pmode, 1, align_2_label);
19138 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
19139 				   Pmode, 1, align_3_label);
19140 	}
19141       else
19142         {
19143 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
19144 	     check whether it is 4-byte aligned.  */
19145 
19146 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
19147 				    NULL_RTX, 0, OPTAB_WIDEN);
19148 
19149 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
19150 				   Pmode, 1, align_4_label);
19151         }
19152 
19153       mem = change_address (src, QImode, out);
19154 
19155       /* Now compare the bytes.  */
19156 
19157       /* Compare the first few unaligned bytes on a byte-by-byte basis.  */
19158       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
19159 			       QImode, 1, end_0_label);
19160 
19161       /* Increment the address.  */
19162       emit_insn ((*ix86_gen_add3) (out, out, const1_rtx));
19163 
19164       /* Not needed with an alignment of 2 */
19165       if (align != 2)
19166 	{
19167 	  emit_label (align_2_label);
19168 
19169 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
19170 				   end_0_label);
19171 
19172 	  emit_insn ((*ix86_gen_add3) (out, out, const1_rtx));
19173 
19174 	  emit_label (align_3_label);
19175 	}
19176 
19177       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
19178 			       end_0_label);
19179 
19180       emit_insn ((*ix86_gen_add3) (out, out, const1_rtx));
19181     }
19182 
19183   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
19184      align this loop; it only makes the program larger and does not help to
19185      speed it up.  */
19186   emit_label (align_4_label);
19187 
19188   mem = change_address (src, SImode, out);
19189   emit_move_insn (scratch, mem);
19190   emit_insn ((*ix86_gen_add3) (out, out, GEN_INT (4)));
19191 
19192   /* This formula yields a nonzero result iff one of the bytes is zero.
19193      This saves three branches inside the loop and many cycles.  */
19194 
19195   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
19196   emit_insn (gen_one_cmplsi2 (scratch, scratch));
19197   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
19198   emit_insn (gen_andsi3 (tmpreg, tmpreg,
19199 			 gen_int_mode (0x80808080, SImode)));
19200   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
19201 			   align_4_label);
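  /* Illustrative note (not part of GCC): this is the classic "word contains a
     zero byte" test.  With x = scratch, the value
     (x - 0x01010101) & ~x & 0x80808080 has the high bit set in each byte
     position of X that is zero.  For example, x = 0x41420043 gives
     0x4040FF42 & 0xBEBDFFBC & 0x80808080 = 0x00008000, so the jump above is
     not taken and the code below locates the zero byte.  */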
19202 
19203   if (TARGET_CMOVE)
19204     {
19205        rtx reg = gen_reg_rtx (SImode);
19206        rtx reg2 = gen_reg_rtx (Pmode);
19207        emit_move_insn (reg, tmpreg);
19208        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
19209 
19210        /* If zero is not in the first two bytes, move two bytes forward.  */
19211        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
19212        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
19213        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
19214        emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
19215 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
19216 						     reg,
19217 						     tmpreg)));
19218        /* Emit lea manually to avoid clobbering of flags.  */
19219        emit_insn (gen_rtx_SET (SImode, reg2,
19220 			       gen_rtx_PLUS (Pmode, out, const2_rtx)));
19221 
19222        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
19223        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
19224        emit_insn (gen_rtx_SET (VOIDmode, out,
19225 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
19226 						     reg2,
19227 						     out)));
19228     }
19229   else
19230     {
19231        rtx end_2_label = gen_label_rtx ();
19232        /* Is zero in the first two bytes? */
19233 
19234        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
19235        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
19236        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
19237        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19238                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
19239                             pc_rtx);
19240        tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19241        JUMP_LABEL (tmp) = end_2_label;
19242 
19243        /* Not in the first two.  Move two bytes forward.  */
19244        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
19245        emit_insn ((*ix86_gen_add3) (out, out, const2_rtx));
19246 
19247        emit_label (end_2_label);
19248 
19249     }
19250 
19251   /* Avoid branch in fixing the byte.  */
19252   tmpreg = gen_lowpart (QImode, tmpreg);
19253   emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
19254   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
19255   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
19256   emit_insn ((*ix86_gen_sub3_carry) (out, out, GEN_INT (3), tmp, cmp));
19257 
19258   emit_label (end_0_label);
19259 }
19260 
19261 /* Expand strlen.  */
19262 
19263 int
19264 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
19265 {
19266   rtx addr, scratch1, scratch2, scratch3, scratch4;
19267 
19268   /* The generic case of the strlen expander is long.  Avoid
19269      expanding it unless TARGET_INLINE_ALL_STRINGOPS.  */
19270 
19271   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
19272       && !TARGET_INLINE_ALL_STRINGOPS
19273       && !optimize_insn_for_size_p ()
19274       && (!CONST_INT_P (align) || INTVAL (align) < 4))
19275     return 0;
19276 
19277   addr = force_reg (Pmode, XEXP (src, 0));
19278   scratch1 = gen_reg_rtx (Pmode);
19279 
19280   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
19281       && !optimize_insn_for_size_p ())
19282     {
19283       /* Well it seems that some optimizer does not combine a call like
19284          foo(strlen(bar), strlen(bar));
19285          when the move and the subtraction are done here.  It does calculate
19286          the length just once when these instructions are done inside of
19287          output_strlen_unroll().  But I think since &bar[strlen(bar)] is
19288          often used and I use one fewer register for the lifetime of
19289          output_strlen_unroll() this is better.  */
19290 
19291       emit_move_insn (out, addr);
19292 
19293       ix86_expand_strlensi_unroll_1 (out, src, align);
19294 
19295       /* strlensi_unroll_1 returns the address of the zero at the end of
19296          the string, like memchr(), so compute the length by subtracting
19297          the start address.  */
19298       emit_insn ((*ix86_gen_sub3) (out, out, addr));
19299     }
19300   else
19301     {
19302       rtx unspec;
19303 
19304       /* Can't use this if the user has appropriated eax, ecx, or edi.  */
19305       if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
19306         return false;
19307 
19308       scratch2 = gen_reg_rtx (Pmode);
19309       scratch3 = gen_reg_rtx (Pmode);
19310       scratch4 = force_reg (Pmode, constm1_rtx);
19311 
19312       emit_move_insn (scratch3, addr);
19313       eoschar = force_reg (QImode, eoschar);
19314 
19315       src = replace_equiv_address_nv (src, scratch3);
19316 
19317       /* If .md starts supporting :P, this can be done in .md.  */
19318       unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
19319 						 scratch4), UNSPEC_SCAS);
19320       emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
19321       emit_insn ((*ix86_gen_one_cmpl2) (scratch2, scratch1));
19322       emit_insn ((*ix86_gen_add3) (out, scratch2, constm1_rtx));
19323     }
19324   return 1;
19325 }
19326 
19327 /* For a given symbol (function), construct code to compute the address of
19328    its PLT entry in the large x86-64 PIC model.  */
19329 rtx
19330 construct_plt_address (rtx symbol)
19331 {
19332   rtx tmp = gen_reg_rtx (Pmode);
19333   rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
19334 
19335   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
19336   gcc_assert (ix86_cmodel == CM_LARGE_PIC);
19337 
19338   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
19339   emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
19340   return tmp;
19341 }
19342 
19343 void
19344 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
19345 		  rtx callarg2,
19346 		  rtx pop, int sibcall)
19347 {
19348   rtx use = NULL, call;
19349 
19350   if (pop == const0_rtx)
19351     pop = NULL;
19352   gcc_assert (!TARGET_64BIT || !pop);
19353 
19354   if (TARGET_MACHO && !TARGET_64BIT)
19355     {
19356 #if TARGET_MACHO
19357       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
19358 	fnaddr = machopic_indirect_call_target (fnaddr);
19359 #endif
19360     }
19361   else
19362     {
19363       /* Static functions and indirect calls don't need the pic register.  */
19364       if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
19365 	  && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
19366 	  && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
19367 	use_reg (&use, pic_offset_table_rtx);
19368     }
19369 
19370   if (TARGET_64BIT && INTVAL (callarg2) >= 0)
19371     {
19372       rtx al = gen_rtx_REG (QImode, AX_REG);
19373       emit_move_insn (al, callarg2);
19374       use_reg (&use, al);
19375     }
19376 
19377   if (ix86_cmodel == CM_LARGE_PIC
19378       && MEM_P (fnaddr)
19379       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
19380       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
19381     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
19382   else if (sibcall
19383 	   ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
19384 	   : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
19385     {
19386       fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
19387       fnaddr = gen_rtx_MEM (QImode, fnaddr);
19388     }
19389 
19390   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
19391   if (retval)
19392     call = gen_rtx_SET (VOIDmode, retval, call);
19393   if (pop)
19394     {
19395       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
19396       pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
19397       call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, pop));
19398     }
19399   if (TARGET_64BIT
19400       && ix86_cfun_abi () == MS_ABI
19401       && (!callarg2 || INTVAL (callarg2) != -2))
19402     {
19403       /* We need to represent that SI and DI registers are clobbered
19404 	 by SYSV calls.  */
19405       static int clobbered_registers[] = {
19406 	XMM6_REG, XMM7_REG, XMM8_REG,
19407 	XMM9_REG, XMM10_REG, XMM11_REG,
19408 	XMM12_REG, XMM13_REG, XMM14_REG,
19409 	XMM15_REG, SI_REG, DI_REG
19410       };
19411       unsigned int i;
19412       rtx vec[ARRAY_SIZE (clobbered_registers) + 2];
19413       rtx unspec = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
19414       				   UNSPEC_MS_TO_SYSV_CALL);
19415 
19416       vec[0] = call;
19417       vec[1] = unspec;
19418       for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
19419         vec[i + 2] = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
19420 				      ? TImode : DImode,
19421 				      gen_rtx_REG
19422 				        (SSE_REGNO_P (clobbered_registers[i])
19423 						      ? TImode : DImode,
19424 					 clobbered_registers[i]));
19425 
19426       call = gen_rtx_PARALLEL (VOIDmode,
19427       			       gen_rtvec_v (ARRAY_SIZE (clobbered_registers)
19428 			       + 2, vec));
19429     }
19430 
19431   call = emit_call_insn (call);
19432   if (use)
19433     CALL_INSN_FUNCTION_USAGE (call) = use;
19434 }
19435 
19436 
19437 /* Clear stack slot assignments remembered from previous functions.
19438    This is called from INIT_EXPANDERS once before RTL is emitted for each
19439    function.  */
19440 
19441 static struct machine_function *
19442 ix86_init_machine_status (void)
19443 {
19444   struct machine_function *f;
19445 
19446   f = GGC_CNEW (struct machine_function);
19447   f->use_fast_prologue_epilogue_nregs = -1;
19448   f->tls_descriptor_call_expanded_p = 0;
19449   f->call_abi = ix86_abi;
19450 
19451   return f;
19452 }
19453 
19454 /* Return a MEM corresponding to a stack slot with mode MODE.
19455    Allocate a new slot if necessary.
19456 
19457    The RTL for a function can have several slots available: N is
19458    which slot to use.  */
19459 
19460 rtx
19461 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
19462 {
19463   struct stack_local_entry *s;
19464 
19465   gcc_assert (n < MAX_386_STACK_LOCALS);
19466 
19467   /* Virtual slot is valid only before vregs are instantiated.  */
19468   gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
19469 
19470   for (s = ix86_stack_locals; s; s = s->next)
19471     if (s->mode == mode && s->n == n)
19472       return copy_rtx (s->rtl);
19473 
19474   s = (struct stack_local_entry *)
19475     ggc_alloc (sizeof (struct stack_local_entry));
19476   s->n = n;
19477   s->mode = mode;
19478   s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
19479 
19480   s->next = ix86_stack_locals;
19481   ix86_stack_locals = s;
19482   return s->rtl;
19483 }
19484 
19485 /* Construct the SYMBOL_REF for the tls_get_addr function.  */
19486 
19487 static GTY(()) rtx ix86_tls_symbol;
19488 rtx
19489 ix86_tls_get_addr (void)
19490 {
19491 
19492   if (!ix86_tls_symbol)
19493     {
19494       ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode,
19495 					    (TARGET_ANY_GNU_TLS
19496 					     && !TARGET_64BIT)
19497 					    ? "___tls_get_addr"
19498 					    : "__tls_get_addr");
19499     }
19500 
19501   return ix86_tls_symbol;
19502 }
19503 
19504 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol.  */
19505 
19506 static GTY(()) rtx ix86_tls_module_base_symbol;
19507 rtx
19508 ix86_tls_module_base (void)
19509 {
19510 
19511   if (!ix86_tls_module_base_symbol)
19512     {
19513       ix86_tls_module_base_symbol = gen_rtx_SYMBOL_REF (Pmode,
19514 							"_TLS_MODULE_BASE_");
19515       SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
19516 	|= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
19517     }
19518 
19519   return ix86_tls_module_base_symbol;
19520 }
19521 
19522 /* Calculate the length of the memory address in the instruction
19523    encoding.  Does not include the one-byte modrm, opcode, or prefix.  */
19524 
19525 int
19526 memory_address_length (rtx addr)
19527 {
19528   struct ix86_address parts;
19529   rtx base, index, disp;
19530   int len;
19531   int ok;
19532 
19533   if (GET_CODE (addr) == PRE_DEC
19534       || GET_CODE (addr) == POST_INC
19535       || GET_CODE (addr) == PRE_MODIFY
19536       || GET_CODE (addr) == POST_MODIFY)
19537     return 0;
19538 
19539   ok = ix86_decompose_address (addr, &parts);
19540   gcc_assert (ok);
19541 
19542   if (parts.base && GET_CODE (parts.base) == SUBREG)
19543     parts.base = SUBREG_REG (parts.base);
19544   if (parts.index && GET_CODE (parts.index) == SUBREG)
19545     parts.index = SUBREG_REG (parts.index);
19546 
19547   base = parts.base;
19548   index = parts.index;
19549   disp = parts.disp;
19550   len = 0;
19551 
19552   /* Rule of thumb:
19553        - esp as the base always wants an index,
19554        - ebp as the base always wants a displacement,
19555        - r12 as the base always wants an index,
19556        - r13 as the base always wants a displacement.  */
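  /* As a rough illustration (example values only; the count excludes the
     modrm, opcode and prefix bytes):
	(%eax)            -> 0
	(%ebp), (%r13)    -> 1   (forced disp8)
	(%esp), (%r12)    -> 1   (forced SIB byte)
	4(%eax)           -> 1   (disp8)
	4(%esp)           -> 2   (disp8 + SIB)
	0x1234(%eax)      -> 4   (disp32)  */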
19557 
19558   /* Register Indirect.  */
19559   if (base && !index && !disp)
19560     {
19561       /* esp (for its index) and ebp (for its displacement) need
19562 	 the two-byte modrm form.  Similarly for r12 and r13 in 64-bit
19563 	 code.  */
19564       if (REG_P (addr)
19565 	  && (addr == arg_pointer_rtx
19566 	      || addr == frame_pointer_rtx
19567 	      || REGNO (addr) == SP_REG
19568 	      || REGNO (addr) == BP_REG
19569 	      || REGNO (addr) == R12_REG
19570 	      || REGNO (addr) == R13_REG))
19571 	len = 1;
19572     }
19573 
19574   /* Direct Addressing.  In 64-bit mode mod 00 r/m 5
19575      is not disp32, but disp32(%rip), so for plain disp32 a
19576      SIB byte is needed, unless print_operand_address
19577      optimizes it into disp32(%rip) or (%rip) is implied
19578      by UNSPEC.  */
19579   else if (disp && !base && !index)
19580     {
19581       len = 4;
19582       if (TARGET_64BIT)
19583 	{
19584 	  rtx symbol = disp;
19585 
19586 	  if (GET_CODE (disp) == CONST)
19587 	    symbol = XEXP (disp, 0);
19588 	  if (GET_CODE (symbol) == PLUS
19589 	      && CONST_INT_P (XEXP (symbol, 1)))
19590 	    symbol = XEXP (symbol, 0);
19591 
19592 	  if (GET_CODE (symbol) != LABEL_REF
19593 	      && (GET_CODE (symbol) != SYMBOL_REF
19594 		  || SYMBOL_REF_TLS_MODEL (symbol) != 0)
19595 	      && (GET_CODE (symbol) != UNSPEC
19596 		  || (XINT (symbol, 1) != UNSPEC_GOTPCREL
19597 		      && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
19598 	    len += 1;
19599 	}
19600     }
19601 
19602   else
19603     {
19604       /* Find the length of the displacement constant.  */
19605       if (disp)
19606 	{
19607 	  if (base && satisfies_constraint_K (disp))
19608 	    len = 1;
19609 	  else
19610 	    len = 4;
19611 	}
19612       /* ebp always wants a displacement.  Similarly r13.  */
19613       else if (base && REG_P (base)
19614 	       && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
19615 	len = 1;
19616 
19617       /* An index requires the two-byte modrm form....  */
19618       if (index
19619 	  /* ...like esp (or r12), which always wants an index.  */
19620 	  || base == arg_pointer_rtx
19621 	  || base == frame_pointer_rtx
19622 	  || (base && REG_P (base)
19623 	      && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
19624 	len += 1;
19625     }
19626 
19627   switch (parts.seg)
19628     {
19629     case SEG_FS:
19630     case SEG_GS:
19631       len += 1;
19632       break;
19633     default:
19634       break;
19635     }
19636 
19637   return len;
19638 }
19639 
19640 /* Compute the default value for the "length_immediate" attribute.  When
19641    SHORTFORM is set, expect the insn to have an 8-bit immediate alternative.  */
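/* As a rough illustration: with SHORTFORM set, "addl $100, %eax" fits in a
   sign-extended 8-bit immediate and yields 1, while "addl $1000, %eax" needs
   the full 32-bit immediate and yields 4.  */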
19642 int
19643 ix86_attr_length_immediate_default (rtx insn, int shortform)
19644 {
19645   int len = 0;
19646   int i;
19647   extract_insn_cached (insn);
19648   for (i = recog_data.n_operands - 1; i >= 0; --i)
19649     if (CONSTANT_P (recog_data.operand[i]))
19650       {
19651         enum attr_mode mode = get_attr_mode (insn);
19652 
19653 	gcc_assert (!len);
19654 	if (shortform && CONST_INT_P (recog_data.operand[i]))
19655 	  {
19656 	    HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
19657 	    switch (mode)
19658 	      {
19659 	      case MODE_QI:
19660 		len = 1;
19661 		continue;
19662 	      case MODE_HI:
19663 		ival = trunc_int_for_mode (ival, HImode);
19664 		break;
19665 	      case MODE_SI:
19666 		ival = trunc_int_for_mode (ival, SImode);
19667 		break;
19668 	      default:
19669 		break;
19670 	      }
19671 	    if (IN_RANGE (ival, -128, 127))
19672 	      {
19673 		len = 1;
19674 		continue;
19675 	      }
19676 	  }
19677 	switch (mode)
19678 	  {
19679 	  case MODE_QI:
19680 	    len = 1;
19681 	    break;
19682 	  case MODE_HI:
19683 	    len = 2;
19684 	    break;
19685 	  case MODE_SI:
19686 	    len = 4;
19687 	    break;
19688 	  /* Immediates for DImode instructions are encoded as 32-bit sign-extended values.  */
19689 	  case MODE_DI:
19690 	    len = 4;
19691 	    break;
19692 	  default:
19693 	    fatal_insn ("unknown insn mode", insn);
19694 	}
19695       }
19696   return len;
19697 }
19698 /* Compute default value for "length_address" attribute.  */
19699 int
19700 ix86_attr_length_address_default (rtx insn)
19701 {
19702   int i;
19703 
19704   if (get_attr_type (insn) == TYPE_LEA)
19705     {
19706       rtx set = PATTERN (insn), addr;
19707 
19708       if (GET_CODE (set) == PARALLEL)
19709 	set = XVECEXP (set, 0, 0);
19710 
19711       gcc_assert (GET_CODE (set) == SET);
19712 
19713       addr = SET_SRC (set);
19714       if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
19715 	{
19716 	  if (GET_CODE (addr) == ZERO_EXTEND)
19717 	    addr = XEXP (addr, 0);
19718 	  if (GET_CODE (addr) == SUBREG)
19719 	    addr = SUBREG_REG (addr);
19720 	}
19721 
19722       return memory_address_length (addr);
19723     }
19724 
19725   extract_insn_cached (insn);
19726   for (i = recog_data.n_operands - 1; i >= 0; --i)
19727     if (MEM_P (recog_data.operand[i]))
19728       {
19729         constrain_operands_cached (reload_completed);
19730         if (which_alternative != -1)
19731 	  {
19732 	    const char *constraints = recog_data.constraints[i];
19733 	    int alt = which_alternative;
19734 
19735 	    while (*constraints == '=' || *constraints == '+')
19736 	      constraints++;
19737 	    while (alt-- > 0)
19738 	      while (*constraints++ != ',')
19739 		;
19740 	    /* Skip ignored operands.  */
19741 	    if (*constraints == 'X')
19742 	      continue;
19743 	  }
19744 	return memory_address_length (XEXP (recog_data.operand[i], 0));
19745       }
19746   return 0;
19747 }
19748 
19749 /* Compute the default value for the "length_vex" attribute.  It includes
19750    the 2- or 3-byte VEX prefix and 1 opcode byte.  */
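/* As a rough illustration (a sketch, not exhaustive): an insn needing neither
   REX.W, REX.X nor REX.B, e.g. "vaddps %xmm1,%xmm2,%xmm3", can use the 2-byte
   VEX prefix and yields 3; a DImode general register operand (REX.W) or an
   extended register mentioned in a memory address (REX.X/REX.B) forces the
   3-byte VEX prefix and yields 4.  */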
19751 
19752 int
19753 ix86_attr_length_vex_default (rtx insn, int has_0f_opcode,
19754 			      int has_vex_w)
19755 {
19756   int i;
19757 
19758   /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX.W bit
19759      requires the 3-byte VEX prefix.  */
19760   if (!has_0f_opcode || has_vex_w)
19761     return 3 + 1;
19762 
19763   /* We can always use the 2-byte VEX prefix in 32-bit mode.  */
19764   if (!TARGET_64BIT)
19765     return 2 + 1;
19766 
19767   extract_insn_cached (insn);
19768 
19769   for (i = recog_data.n_operands - 1; i >= 0; --i)
19770     if (REG_P (recog_data.operand[i]))
19771       {
19772 	/* REX.W bit uses 3 byte VEX prefix.  */
19773 	if (GET_MODE (recog_data.operand[i]) == DImode
19774 	    && GENERAL_REG_P (recog_data.operand[i]))
19775 	  return 3 + 1;
19776       }
19777     else
19778       {
19779 	/* REX.X or REX.B bits use 3 byte VEX prefix.  */
19780 	if (MEM_P (recog_data.operand[i])
19781 	    && x86_extended_reg_mentioned_p (recog_data.operand[i]))
19782 	  return 3 + 1;
19783       }
19784 
19785   return 2 + 1;
19786 }
19787 
19788 /* Return the maximum number of instructions a cpu can issue.  */
19789 
19790 static int
19791 ix86_issue_rate (void)
19792 {
19793   switch (ix86_tune)
19794     {
19795     case PROCESSOR_PENTIUM:
19796     case PROCESSOR_ATOM:
19797     case PROCESSOR_K6:
19798       return 2;
19799 
19800     case PROCESSOR_PENTIUMPRO:
19801     case PROCESSOR_PENTIUM4:
19802     case PROCESSOR_ATHLON:
19803     case PROCESSOR_K8:
19804     case PROCESSOR_AMDFAM10:
19805     case PROCESSOR_NOCONA:
19806     case PROCESSOR_GENERIC32:
19807     case PROCESSOR_GENERIC64:
19808       return 3;
19809 
19810     case PROCESSOR_CORE2:
19811       return 4;
19812 
19813     default:
19814       return 1;
19815     }
19816 }
19817 
19818 /* A subroutine of ix86_adjust_cost -- return true iff INSN reads the flags
19819    set by DEP_INSN and reads nothing else set by DEP_INSN.  */
19820 
19821 static int
19822 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
19823 {
19824   rtx set, set2;
19825 
19826   /* Simplify the test for uninteresting insns.  */
19827   if (insn_type != TYPE_SETCC
19828       && insn_type != TYPE_ICMOV
19829       && insn_type != TYPE_FCMOV
19830       && insn_type != TYPE_IBR)
19831     return 0;
19832 
19833   if ((set = single_set (dep_insn)) != 0)
19834     {
19835       set = SET_DEST (set);
19836       set2 = NULL_RTX;
19837     }
19838   else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
19839 	   && XVECLEN (PATTERN (dep_insn), 0) == 2
19840 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
19841 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
19842     {
19843       set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
19844       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
19845     }
19846   else
19847     return 0;
19848 
19849   if (!REG_P (set) || REGNO (set) != FLAGS_REG)
19850     return 0;
19851 
19852   /* This test is true if the dependent insn reads the flags but
19853      not any other potentially set register.  */
19854   if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
19855     return 0;
19856 
19857   if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
19858     return 0;
19859 
19860   return 1;
19861 }
19862 
19863 /* Return true iff USE_INSN has a memory address with operands set by
19864    SET_INSN.  */
19865 
19866 bool
19867 ix86_agi_dependent (rtx set_insn, rtx use_insn)
19868 {
19869   int i;
19870   extract_insn_cached (use_insn);
19871   for (i = recog_data.n_operands - 1; i >= 0; --i)
19872     if (MEM_P (recog_data.operand[i]))
19873       {
19874 	rtx addr = XEXP (recog_data.operand[i], 0);
19875 	return modified_in_p (addr, set_insn) != 0;
19876       }
19877   return false;
19878 }
19879 
19880 static int
19881 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
19882 {
19883   enum attr_type insn_type, dep_insn_type;
19884   enum attr_memory memory;
19885   rtx set, set2;
19886   int dep_insn_code_number;
19887 
19888   /* Anti and output dependencies have zero cost on all CPUs.  */
19889   if (REG_NOTE_KIND (link) != 0)
19890     return 0;
19891 
19892   dep_insn_code_number = recog_memoized (dep_insn);
19893 
19894   /* If we can't recognize the insns, we can't really do anything.  */
19895   if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
19896     return cost;
19897 
19898   insn_type = get_attr_type (insn);
19899   dep_insn_type = get_attr_type (dep_insn);
19900 
19901   switch (ix86_tune)
19902     {
19903     case PROCESSOR_PENTIUM:
19904       /* Address Generation Interlock adds a cycle of latency.  */
19905       if (insn_type == TYPE_LEA)
19906 	{
19907 	  rtx addr = PATTERN (insn);
19908 
19909 	  if (GET_CODE (addr) == PARALLEL)
19910 	    addr = XVECEXP (addr, 0, 0);
19911 
19912 	  gcc_assert (GET_CODE (addr) == SET);
19913 
19914 	  addr = SET_SRC (addr);
19915 	  if (modified_in_p (addr, dep_insn))
19916 	    cost += 1;
19917 	}
19918       else if (ix86_agi_dependent (dep_insn, insn))
19919 	cost += 1;
19920 
19921       /* ??? Compares pair with jump/setcc.  */
19922       if (ix86_flags_dependent (insn, dep_insn, insn_type))
19923 	cost = 0;
19924 
19925       /* Floating point stores require value to be ready one cycle earlier.  */
19926       if (insn_type == TYPE_FMOV
19927 	  && get_attr_memory (insn) == MEMORY_STORE
19928 	  && !ix86_agi_dependent (dep_insn, insn))
19929 	cost += 1;
19930       break;
19931 
19932     case PROCESSOR_PENTIUMPRO:
19933       memory = get_attr_memory (insn);
19934 
19935       /* INT->FP conversion is expensive.  */
19936       if (get_attr_fp_int_src (dep_insn))
19937 	cost += 5;
19938 
19939       /* There is one cycle extra latency between an FP op and a store.  */
19940       if (insn_type == TYPE_FMOV
19941 	  && (set = single_set (dep_insn)) != NULL_RTX
19942 	  && (set2 = single_set (insn)) != NULL_RTX
19943 	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
19944 	  && MEM_P (SET_DEST (set2)))
19945 	cost += 1;
19946 
19947       /* Model the ability of the reorder buffer to hide the latency of a load
19948 	 by executing it in parallel with the previous instruction, provided the
19949 	 previous instruction is not needed to compute the address.  */
19950       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
19951 	  && !ix86_agi_dependent (dep_insn, insn))
19952 	{
19953 	  /* Claim that moves take one cycle, as the core can issue one load
19954 	     at a time and the next load can start a cycle later.  */
19955 	  if (dep_insn_type == TYPE_IMOV
19956 	      || dep_insn_type == TYPE_FMOV)
19957 	    cost = 1;
19958 	  else if (cost > 1)
19959 	    cost--;
19960 	}
19961       break;
19962 
19963     case PROCESSOR_K6:
19964       memory = get_attr_memory (insn);
19965 
19966       /* The esp dependency is resolved before the instruction is really
19967          finished.  */
19968       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
19969 	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
19970 	return 1;
19971 
19972       /* INT->FP conversion is expensive.  */
19973       if (get_attr_fp_int_src (dep_insn))
19974 	cost += 5;
19975 
19976       /* Model the ability of the reorder buffer to hide the latency of a load
19977 	 by executing it in parallel with the previous instruction, provided the
19978 	 previous instruction is not needed to compute the address.  */
19979       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
19980 	  && !ix86_agi_dependent (dep_insn, insn))
19981 	{
19982 	  /* Claim that moves take one cycle, as the core can issue one load
19983 	     at a time and the next load can start a cycle later.  */
19984 	  if (dep_insn_type == TYPE_IMOV
19985 	      || dep_insn_type == TYPE_FMOV)
19986 	    cost = 1;
19987 	  else if (cost > 2)
19988 	    cost -= 2;
19989 	  else
19990 	    cost = 1;
19991 	}
19992       break;
19993 
19994     case PROCESSOR_ATHLON:
19995     case PROCESSOR_K8:
19996     case PROCESSOR_AMDFAM10:
19997     case PROCESSOR_ATOM:
19998     case PROCESSOR_GENERIC32:
19999     case PROCESSOR_GENERIC64:
20000       memory = get_attr_memory (insn);
20001 
20002       /* Model the ability of the reorder buffer to hide the latency of a load
20003 	 by executing it in parallel with the previous instruction, provided the
20004 	 previous instruction is not needed to compute the address.  */
20005       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
20006 	  && !ix86_agi_dependent (dep_insn, insn))
20007 	{
20008 	  enum attr_unit unit = get_attr_unit (insn);
20009 	  int loadcost = 3;
20010 
20011 	  /* Because of the difference between the length of integer and
20012 	     floating unit pipeline preparation stages, the memory operands
20013 	     for floating point are cheaper.
20014 
20015 	     ??? For Athlon the difference is most probably 2.  */
20016 	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
20017 	    loadcost = 3;
20018 	  else
20019 	    loadcost = TARGET_ATHLON ? 2 : 0;
20020 
20021 	  if (cost >= loadcost)
20022 	    cost -= loadcost;
20023 	  else
20024 	    cost = 0;
20025 	}
20026 
20027     default:
20028       break;
20029     }
20030 
20031   return cost;
20032 }
20033 
20034 /* How many alternative schedules to try.  This should be as wide as the
20035    scheduling freedom in the DFA, but no wider.  Making this value too
20036    large results in extra work for the scheduler.  */
20037 
20038 static int
20039 ia32_multipass_dfa_lookahead (void)
20040 {
20041   switch (ix86_tune)
20042     {
20043     case PROCESSOR_PENTIUM:
20044       return 2;
20045 
20046     case PROCESSOR_PENTIUMPRO:
20047     case PROCESSOR_K6:
20048       return 1;
20049 
20050     default:
20051       return 0;
20052     }
20053 }
20054 
20055 
20056 /* Compute the alignment given to a constant that is being placed in memory.
20057    EXP is the constant and ALIGN is the alignment that the object would
20058    ordinarily have.
20059    The value of this function is used instead of that alignment to align
20060    the object.  */
20061 
20062 int
20063 ix86_constant_alignment (tree exp, int align)
20064 {
20065   if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
20066       || TREE_CODE (exp) == INTEGER_CST)
20067     {
20068       if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
20069 	return 64;
20070       else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
20071 	return 128;
20072     }
20073   else if (!optimize_size && TREE_CODE (exp) == STRING_CST
20074 	   && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
20075     return BITS_PER_WORD;
20076 
20077   return align;
20078 }
20079 
20080 /* Compute the alignment for a static variable.
20081    TYPE is the data type, and ALIGN is the alignment that
20082    the object would ordinarily have.  The value of this function is used
20083    instead of that alignment to align the object.  */
20084 
20085 int
20086 ix86_data_alignment (tree type, int align)
20087 {
20088   int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
20089 
20090   if (AGGREGATE_TYPE_P (type)
20091       && TYPE_SIZE (type)
20092       && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
20093       && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
20094 	  || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
20095       && align < max_align)
20096     align = max_align;
20097 
20098   /* The x86-64 ABI requires arrays larger than 16 bytes to be aligned
20099      to a 16-byte boundary.  */
20100   if (TARGET_64BIT)
20101     {
20102       if (AGGREGATE_TYPE_P (type)
20103 	   && TYPE_SIZE (type)
20104 	   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
20105 	   && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
20106 	       || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
20107 	return 128;
20108     }
20109 
20110   if (TREE_CODE (type) == ARRAY_TYPE)
20111     {
20112       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
20113 	return 64;
20114       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
20115 	return 128;
20116     }
20117   else if (TREE_CODE (type) == COMPLEX_TYPE)
20118     {
20119 
20120       if (TYPE_MODE (type) == DCmode && align < 64)
20121 	return 64;
20122       if ((TYPE_MODE (type) == XCmode
20123 	   || TYPE_MODE (type) == TCmode) && align < 128)
20124 	return 128;
20125     }
20126   else if ((TREE_CODE (type) == RECORD_TYPE
20127 	    || TREE_CODE (type) == UNION_TYPE
20128 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
20129 	   && TYPE_FIELDS (type))
20130     {
20131       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
20132 	return 64;
20133       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
20134 	return 128;
20135     }
20136   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
20137 	   || TREE_CODE (type) == INTEGER_TYPE)
20138     {
20139       if (TYPE_MODE (type) == DFmode && align < 64)
20140 	return 64;
20141       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
20142 	return 128;
20143     }
20144 
20145   return align;
20146 }
20147 
20148 /* Compute the alignment for a local variable or a stack slot.  EXP is
20149    the data type or decl itself, MODE is the widest mode available and
20150    ALIGN is the alignment that the object would ordinarily have.  The
20151    value of this macro is used instead of that alignment to align the
20152    object.  */
20153 
20154 unsigned int
20155 ix86_local_alignment (tree exp, enum machine_mode mode,
20156 		      unsigned int align)
20157 {
20158   tree type, decl;
20159 
20160   if (exp && DECL_P (exp))
20161     {
20162       type = TREE_TYPE (exp);
20163       decl = exp;
20164     }
20165   else
20166     {
20167       type = exp;
20168       decl = NULL;
20169     }
20170 
20171   /* Don't do dynamic stack realignment for long long objects with
20172      -mpreferred-stack-boundary=2.  */
20173   if (!TARGET_64BIT
20174       && align == 64
20175       && ix86_preferred_stack_boundary < 64
20176       && (mode == DImode || (type && TYPE_MODE (type) == DImode)
20177           || mode == DFmode || (type && TYPE_MODE (type) == DFmode)
20178           || mode == DCmode || (type && TYPE_MODE (type) == DCmode))
20179       && (!type || !TYPE_USER_ALIGN (type))
20180       && (!decl || !DECL_USER_ALIGN (decl)))
20181     align = 32;
20182 
20183   /* If TYPE is NULL, we are allocating a stack slot for a caller-save
20184      register in MODE.  We will return the largest alignment of XF
20185      and DF.  */
20186   if (!type)
20187     {
20188       if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
20189 	align = GET_MODE_ALIGNMENT (DFmode);
20190       return align;
20191     }
20192 
20193   /* The x86-64 ABI requires arrays larger than 16 bytes to be aligned
20194      to a 16-byte boundary.  */
20195   if (TARGET_64BIT)
20196     {
20197       if (AGGREGATE_TYPE_P (type)
20198 	   && TYPE_SIZE (type)
20199 	   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
20200 	   && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
20201 	       || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
20202 	return 128;
20203     }
20204   if (TREE_CODE (type) == ARRAY_TYPE)
20205     {
20206       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
20207 	return 64;
20208       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
20209 	return 128;
20210     }
20211   else if (TREE_CODE (type) == COMPLEX_TYPE)
20212     {
20213       if (TYPE_MODE (type) == DCmode && align < 64)
20214 	return 64;
20215       if ((TYPE_MODE (type) == XCmode
20216 	   || TYPE_MODE (type) == TCmode) && align < 128)
20217 	return 128;
20218     }
20219   else if ((TREE_CODE (type) == RECORD_TYPE
20220 	    || TREE_CODE (type) == UNION_TYPE
20221 	    || TREE_CODE (type) == QUAL_UNION_TYPE)
20222 	   && TYPE_FIELDS (type))
20223     {
20224       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64
20225           && ix86_preferred_stack_boundary >= 64)
20226 	return 64;
20227       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
20228 	return 128;
20229     }
20230   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
20231 	   || TREE_CODE (type) == INTEGER_TYPE)
20232     {
20233 
20234       if (TYPE_MODE (type) == DFmode && align < 64
20235           && ix86_preferred_stack_boundary >= 64)
20236 	return 64;
20237       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
20238 	return 128;
20239     }
20240   return align;
20241 }
20242 
20243 /* Compute the minimum required alignment for dynamic stack realignment
20244    purposes for a local variable, parameter or a stack slot.  EXP is
20245    the data type or decl itself, MODE is its mode and ALIGN is the
20246    alignment that the object would ordinarily have.  */
20247 
20248 unsigned int
20249 ix86_minimum_alignment (tree exp, enum machine_mode mode,
20250 			unsigned int align)
20251 {
20252   tree type, decl;
20253 
20254   if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
20255     return align;
20256 
20257   if (exp && DECL_P (exp))
20258     {
20259       type = TREE_TYPE (exp);
20260       decl = exp;
20261     }
20262   else
20263     {
20264       type = exp;
20265       decl = NULL;
20266     }
20267 
20268   /* Don't do dynamic stack realignment for long long objects with
20269      -mpreferred-stack-boundary=2.  */
20270   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
20271       && (!type || !TYPE_USER_ALIGN (type))
20272       && (!decl || !DECL_USER_ALIGN (decl)))
20273     return 32;
20274 
20275   return align;
20276 }
20277 
20278 /* Find a location for the static chain incoming to a nested function.
20279    This is a register, unless all free registers are used by arguments.  */
20280 
20281 static rtx
20282 ix86_static_chain (const_tree fndecl, bool incoming_p)
20283 {
20284   unsigned regno;
20285 
20286   if (!DECL_STATIC_CHAIN (fndecl))
20287     return NULL;
20288 
20289   if (TARGET_64BIT)
20290     {
20291       /* We always use R10 in 64-bit mode.  */
20292       regno = R10_REG;
20293     }
20294   else
20295     {
20296       tree fntype;
20297       /* By default in 32-bit mode we use ECX to pass the static chain.  */
20298       regno = CX_REG;
20299 
20300       fntype = TREE_TYPE (fndecl);
20301       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
20302 	{
20303 	  /* Fastcall functions use ecx/edx for arguments, which leaves
20304 	     us with EAX for the static chain.  */
20305 	  regno = AX_REG;
20306 	}
20307       else if (ix86_function_regparm (fntype, fndecl) == 3)
20308 	{
20309 	  /* For regparm 3, we have no free call-clobbered registers in
20310 	     which to store the static chain.  In order to implement this,
20311 	     we have the trampoline push the static chain to the stack.
20312 	     However, we can't push a value below the return address when
20313 	     we call the nested function directly, so we have to use an
20314 	     alternate entry point.  For this we use ESI, and have the
20315 	     alternate entry point push ESI, so that things appear the
20316 	     same once we're executing the nested function.  */
20317 	  if (incoming_p)
20318 	    {
20319 	      if (fndecl == current_function_decl)
20320 		ix86_static_chain_on_stack = true;
20321 	      return gen_frame_mem (SImode,
20322 				    plus_constant (arg_pointer_rtx, -8));
20323 	    }
20324 	  regno = SI_REG;
20325 	}
20326     }
20327 
20328   return gen_rtx_REG (Pmode, regno);
20329 }
20330 
20331 /* Emit RTL insns to initialize the variable parts of a trampoline.
20332    FNDECL is the decl of the target address; M_TRAMP is a MEM for
20333    the trampoline, and CHAIN_VALUE is an RTX for the static chain
20334    to be passed to the target function.  */
20335 
20336 static void
20337 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
20338 {
20339   rtx mem, fnaddr;
20340 
20341   fnaddr = XEXP (DECL_RTL (fndecl), 0);
20342 
20343   if (!TARGET_64BIT)
20344     {
20345       rtx disp, chain;
20346       int opcode;
20347 
20348       /* Depending on the static chain location, either load a register
20349 	 with a constant, or push the constant to the stack.  All of the
20350 	 instructions are the same size.  */
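      /* As a rough sketch (illustration only), the resulting 10-byte
	 trampoline is laid out as:
	    offset 0:  b9/b8/68        mov $chain,%ecx / mov $chain,%eax / push $chain
	    offset 1:  <chain_value>   (4 bytes)
	    offset 5:  e9              jmp rel32
	    offset 6:  <disp>          (4 bytes)  */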
20351       chain = ix86_static_chain (fndecl, true);
20352       if (REG_P (chain))
20353 	{
20354 	  if (REGNO (chain) == CX_REG)
20355 	    opcode = 0xb9;
20356 	  else if (REGNO (chain) == AX_REG)
20357 	    opcode = 0xb8;
20358 	  else
20359 	    gcc_unreachable ();
20360 	}
20361       else
20362 	opcode = 0x68;
20363 
20364       mem = adjust_address (m_tramp, QImode, 0);
20365       emit_move_insn (mem, gen_int_mode (opcode, QImode));
20366 
20367       mem = adjust_address (m_tramp, SImode, 1);
20368       emit_move_insn (mem, chain_value);
20369 
20370       /* Compute offset from the end of the jmp to the target function.
20371 	 In the case in which the trampoline stores the static chain on
20372 	 the stack, we need to skip the first insn which pushes the
20373 	 (call-saved) register static chain; this push is 1 byte.  */
20374       disp = expand_binop (SImode, sub_optab, fnaddr,
20375 			   plus_constant (XEXP (m_tramp, 0),
20376 					  MEM_P (chain) ? 9 : 10),
20377 			   NULL_RTX, 1, OPTAB_DIRECT);
20378 
20379       mem = adjust_address (m_tramp, QImode, 5);
20380       emit_move_insn (mem, gen_int_mode (0xe9, QImode));
20381 
20382       mem = adjust_address (m_tramp, SImode, 6);
20383       emit_move_insn (mem, disp);
20384     }
20385   else
20386     {
20387       int offset = 0;
20388 
20389       /* Load the function address into r11.  Try to load the address using
20390 	 the shorter movl instead of movabs.  We may want to support
20391 	 movq for kernel mode, but the kernel does not use trampolines at
20392 	 the moment.  */
20393       if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
20394 	{
20395 	  fnaddr = copy_to_mode_reg (DImode, fnaddr);
20396 
20397 	  mem = adjust_address (m_tramp, HImode, offset);
20398 	  emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
20399 
20400 	  mem = adjust_address (m_tramp, SImode, offset + 2);
20401 	  emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
20402 	  offset += 6;
20403 	}
20404       else
20405 	{
20406 	  mem = adjust_address (m_tramp, HImode, offset);
20407 	  emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
20408 
20409 	  mem = adjust_address (m_tramp, DImode, offset + 2);
20410 	  emit_move_insn (mem, fnaddr);
20411 	  offset += 10;
20412 	}
20413 
20414       /* Load static chain using movabs to r10.  */
20415       mem = adjust_address (m_tramp, HImode, offset);
20416       emit_move_insn (mem, gen_int_mode (0xba49, HImode));
20417 
20418       mem = adjust_address (m_tramp, DImode, offset + 2);
20419       emit_move_insn (mem, chain_value);
20420       offset += 10;
20421 
20422       /* Jump to r11; the last (unused) byte is a nop, only there to
20423 	 pad the write out to a single 32-bit store.  */
20424       mem = adjust_address (m_tramp, SImode, offset);
20425       emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
20426       offset += 4;
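      /* As a rough sketch (illustration only), the bytes stored above are:
	    41 bb <imm32>   movl   $fnaddr, %r11d    (6 bytes), or
	    49 bb <imm64>   movabs $fnaddr, %r11     (10 bytes)
	    49 ba <imm64>   movabs $chain, %r10      (10 bytes)
	    49 ff e3 90     jmp *%r11; nop padding   (4 bytes)  */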
20427 
20428       gcc_assert (offset <= TRAMPOLINE_SIZE);
20429     }
20430 
20431 #ifdef ENABLE_EXECUTE_STACK
20432 #ifdef CHECK_EXECUTE_STACK_ENABLED
20433   if (CHECK_EXECUTE_STACK_ENABLED)
20434 #endif
20435   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
20436 		     LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
20437 #endif
20438 }
20439 
20440 /* The following file contains several enumerations and data structures
20441    built from the definitions in i386-builtin-types.def.  */
20442 
20443 #include "i386-builtin-types.inc"
20444 
20445 /* Table for the ix86 builtin non-function types.  */
20446 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
20447 
20448 /* Retrieve an element from the above table, building some of
20449    the types lazily.  */
20450 
20451 static tree
20452 ix86_get_builtin_type (enum ix86_builtin_type tcode)
20453 {
20454   unsigned int index;
20455   tree type, itype;
20456 
20457   gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
20458 
20459   type = ix86_builtin_type_tab[(int) tcode];
20460   if (type != NULL)
20461     return type;
20462 
20463   gcc_assert (tcode > IX86_BT_LAST_PRIM);
20464   if (tcode <= IX86_BT_LAST_VECT)
20465     {
20466       enum machine_mode mode;
20467 
20468       index = tcode - IX86_BT_LAST_PRIM - 1;
20469       itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
20470       mode = ix86_builtin_type_vect_mode[index];
20471 
20472       type = build_vector_type_for_mode (itype, mode);
20473     }
20474   else
20475     {
20476       int quals;
20477 
20478       index = tcode - IX86_BT_LAST_VECT - 1;
20479       if (tcode <= IX86_BT_LAST_PTR)
20480 	quals = TYPE_UNQUALIFIED;
20481       else
20482 	quals = TYPE_QUAL_CONST;
20483 
20484       itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
20485       if (quals != TYPE_UNQUALIFIED)
20486 	itype = build_qualified_type (itype, quals);
20487 
20488       type = build_pointer_type (itype);
20489     }
20490 
20491   ix86_builtin_type_tab[(int) tcode] = type;
20492   return type;
20493 }
20494 
20495 /* Table for the ix86 builtin function types.  */
20496 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
20497 
20498 /* Retrieve an element from the above table, building some of
20499    the types lazily.  */
20500 
20501 static tree
20502 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
20503 {
20504   tree type;
20505 
20506   gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
20507 
20508   type = ix86_builtin_func_type_tab[(int) tcode];
20509   if (type != NULL)
20510     return type;
20511 
20512   if (tcode <= IX86_BT_LAST_FUNC)
20513     {
20514       unsigned start = ix86_builtin_func_start[(int) tcode];
20515       unsigned after = ix86_builtin_func_start[(int) tcode + 1];
20516       tree rtype, atype, args = void_list_node;
20517       unsigned i;
20518 
20519       rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
20520       for (i = after - 1; i > start; --i)
20521 	{
20522 	  atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
20523 	  args = tree_cons (NULL, atype, args);
20524 	}
20525 
20526       type = build_function_type (rtype, args);
20527     }
20528   else
20529     {
20530       unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
20531       enum ix86_builtin_func_type icode;
20532 
20533       icode = ix86_builtin_func_alias_base[index];
20534       type = ix86_get_builtin_func_type (icode);
20535     }
20536 
20537   ix86_builtin_func_type_tab[(int) tcode] = type;
20538   return type;
20539 }
20540 
20541 
20542 /* Codes for all the SSE/MMX builtins.  */
20543 enum ix86_builtins
20544 {
20545   IX86_BUILTIN_ADDPS,
20546   IX86_BUILTIN_ADDSS,
20547   IX86_BUILTIN_DIVPS,
20548   IX86_BUILTIN_DIVSS,
20549   IX86_BUILTIN_MULPS,
20550   IX86_BUILTIN_MULSS,
20551   IX86_BUILTIN_SUBPS,
20552   IX86_BUILTIN_SUBSS,
20553 
20554   IX86_BUILTIN_CMPEQPS,
20555   IX86_BUILTIN_CMPLTPS,
20556   IX86_BUILTIN_CMPLEPS,
20557   IX86_BUILTIN_CMPGTPS,
20558   IX86_BUILTIN_CMPGEPS,
20559   IX86_BUILTIN_CMPNEQPS,
20560   IX86_BUILTIN_CMPNLTPS,
20561   IX86_BUILTIN_CMPNLEPS,
20562   IX86_BUILTIN_CMPNGTPS,
20563   IX86_BUILTIN_CMPNGEPS,
20564   IX86_BUILTIN_CMPORDPS,
20565   IX86_BUILTIN_CMPUNORDPS,
20566   IX86_BUILTIN_CMPEQSS,
20567   IX86_BUILTIN_CMPLTSS,
20568   IX86_BUILTIN_CMPLESS,
20569   IX86_BUILTIN_CMPNEQSS,
20570   IX86_BUILTIN_CMPNLTSS,
20571   IX86_BUILTIN_CMPNLESS,
20572   IX86_BUILTIN_CMPNGTSS,
20573   IX86_BUILTIN_CMPNGESS,
20574   IX86_BUILTIN_CMPORDSS,
20575   IX86_BUILTIN_CMPUNORDSS,
20576 
20577   IX86_BUILTIN_COMIEQSS,
20578   IX86_BUILTIN_COMILTSS,
20579   IX86_BUILTIN_COMILESS,
20580   IX86_BUILTIN_COMIGTSS,
20581   IX86_BUILTIN_COMIGESS,
20582   IX86_BUILTIN_COMINEQSS,
20583   IX86_BUILTIN_UCOMIEQSS,
20584   IX86_BUILTIN_UCOMILTSS,
20585   IX86_BUILTIN_UCOMILESS,
20586   IX86_BUILTIN_UCOMIGTSS,
20587   IX86_BUILTIN_UCOMIGESS,
20588   IX86_BUILTIN_UCOMINEQSS,
20589 
20590   IX86_BUILTIN_CVTPI2PS,
20591   IX86_BUILTIN_CVTPS2PI,
20592   IX86_BUILTIN_CVTSI2SS,
20593   IX86_BUILTIN_CVTSI642SS,
20594   IX86_BUILTIN_CVTSS2SI,
20595   IX86_BUILTIN_CVTSS2SI64,
20596   IX86_BUILTIN_CVTTPS2PI,
20597   IX86_BUILTIN_CVTTSS2SI,
20598   IX86_BUILTIN_CVTTSS2SI64,
20599 
20600   IX86_BUILTIN_MAXPS,
20601   IX86_BUILTIN_MAXSS,
20602   IX86_BUILTIN_MINPS,
20603   IX86_BUILTIN_MINSS,
20604 
20605   IX86_BUILTIN_LOADUPS,
20606   IX86_BUILTIN_STOREUPS,
20607   IX86_BUILTIN_MOVSS,
20608 
20609   IX86_BUILTIN_MOVHLPS,
20610   IX86_BUILTIN_MOVLHPS,
20611   IX86_BUILTIN_LOADHPS,
20612   IX86_BUILTIN_LOADLPS,
20613   IX86_BUILTIN_STOREHPS,
20614   IX86_BUILTIN_STORELPS,
20615 
20616   IX86_BUILTIN_MASKMOVQ,
20617   IX86_BUILTIN_MOVMSKPS,
20618   IX86_BUILTIN_PMOVMSKB,
20619 
20620   IX86_BUILTIN_MOVNTPS,
20621   IX86_BUILTIN_MOVNTQ,
20622 
20623   IX86_BUILTIN_LOADDQU,
20624   IX86_BUILTIN_STOREDQU,
20625 
20626   IX86_BUILTIN_PACKSSWB,
20627   IX86_BUILTIN_PACKSSDW,
20628   IX86_BUILTIN_PACKUSWB,
20629 
20630   IX86_BUILTIN_PADDB,
20631   IX86_BUILTIN_PADDW,
20632   IX86_BUILTIN_PADDD,
20633   IX86_BUILTIN_PADDQ,
20634   IX86_BUILTIN_PADDSB,
20635   IX86_BUILTIN_PADDSW,
20636   IX86_BUILTIN_PADDUSB,
20637   IX86_BUILTIN_PADDUSW,
20638   IX86_BUILTIN_PSUBB,
20639   IX86_BUILTIN_PSUBW,
20640   IX86_BUILTIN_PSUBD,
20641   IX86_BUILTIN_PSUBQ,
20642   IX86_BUILTIN_PSUBSB,
20643   IX86_BUILTIN_PSUBSW,
20644   IX86_BUILTIN_PSUBUSB,
20645   IX86_BUILTIN_PSUBUSW,
20646 
20647   IX86_BUILTIN_PAND,
20648   IX86_BUILTIN_PANDN,
20649   IX86_BUILTIN_POR,
20650   IX86_BUILTIN_PXOR,
20651 
20652   IX86_BUILTIN_PAVGB,
20653   IX86_BUILTIN_PAVGW,
20654 
20655   IX86_BUILTIN_PCMPEQB,
20656   IX86_BUILTIN_PCMPEQW,
20657   IX86_BUILTIN_PCMPEQD,
20658   IX86_BUILTIN_PCMPGTB,
20659   IX86_BUILTIN_PCMPGTW,
20660   IX86_BUILTIN_PCMPGTD,
20661 
20662   IX86_BUILTIN_PMADDWD,
20663 
20664   IX86_BUILTIN_PMAXSW,
20665   IX86_BUILTIN_PMAXUB,
20666   IX86_BUILTIN_PMINSW,
20667   IX86_BUILTIN_PMINUB,
20668 
20669   IX86_BUILTIN_PMULHUW,
20670   IX86_BUILTIN_PMULHW,
20671   IX86_BUILTIN_PMULLW,
20672 
20673   IX86_BUILTIN_PSADBW,
20674   IX86_BUILTIN_PSHUFW,
20675 
20676   IX86_BUILTIN_PSLLW,
20677   IX86_BUILTIN_PSLLD,
20678   IX86_BUILTIN_PSLLQ,
20679   IX86_BUILTIN_PSRAW,
20680   IX86_BUILTIN_PSRAD,
20681   IX86_BUILTIN_PSRLW,
20682   IX86_BUILTIN_PSRLD,
20683   IX86_BUILTIN_PSRLQ,
20684   IX86_BUILTIN_PSLLWI,
20685   IX86_BUILTIN_PSLLDI,
20686   IX86_BUILTIN_PSLLQI,
20687   IX86_BUILTIN_PSRAWI,
20688   IX86_BUILTIN_PSRADI,
20689   IX86_BUILTIN_PSRLWI,
20690   IX86_BUILTIN_PSRLDI,
20691   IX86_BUILTIN_PSRLQI,
20692 
20693   IX86_BUILTIN_PUNPCKHBW,
20694   IX86_BUILTIN_PUNPCKHWD,
20695   IX86_BUILTIN_PUNPCKHDQ,
20696   IX86_BUILTIN_PUNPCKLBW,
20697   IX86_BUILTIN_PUNPCKLWD,
20698   IX86_BUILTIN_PUNPCKLDQ,
20699 
20700   IX86_BUILTIN_SHUFPS,
20701 
20702   IX86_BUILTIN_RCPPS,
20703   IX86_BUILTIN_RCPSS,
20704   IX86_BUILTIN_RSQRTPS,
20705   IX86_BUILTIN_RSQRTPS_NR,
20706   IX86_BUILTIN_RSQRTSS,
20707   IX86_BUILTIN_RSQRTF,
20708   IX86_BUILTIN_SQRTPS,
20709   IX86_BUILTIN_SQRTPS_NR,
20710   IX86_BUILTIN_SQRTSS,
20711 
20712   IX86_BUILTIN_UNPCKHPS,
20713   IX86_BUILTIN_UNPCKLPS,
20714 
20715   IX86_BUILTIN_ANDPS,
20716   IX86_BUILTIN_ANDNPS,
20717   IX86_BUILTIN_ORPS,
20718   IX86_BUILTIN_XORPS,
20719 
20720   IX86_BUILTIN_EMMS,
20721   IX86_BUILTIN_LDMXCSR,
20722   IX86_BUILTIN_STMXCSR,
20723   IX86_BUILTIN_SFENCE,
20724 
20725   /* 3DNow! Original */
20726   IX86_BUILTIN_FEMMS,
20727   IX86_BUILTIN_PAVGUSB,
20728   IX86_BUILTIN_PF2ID,
20729   IX86_BUILTIN_PFACC,
20730   IX86_BUILTIN_PFADD,
20731   IX86_BUILTIN_PFCMPEQ,
20732   IX86_BUILTIN_PFCMPGE,
20733   IX86_BUILTIN_PFCMPGT,
20734   IX86_BUILTIN_PFMAX,
20735   IX86_BUILTIN_PFMIN,
20736   IX86_BUILTIN_PFMUL,
20737   IX86_BUILTIN_PFRCP,
20738   IX86_BUILTIN_PFRCPIT1,
20739   IX86_BUILTIN_PFRCPIT2,
20740   IX86_BUILTIN_PFRSQIT1,
20741   IX86_BUILTIN_PFRSQRT,
20742   IX86_BUILTIN_PFSUB,
20743   IX86_BUILTIN_PFSUBR,
20744   IX86_BUILTIN_PI2FD,
20745   IX86_BUILTIN_PMULHRW,
20746 
20747   /* 3DNow! Athlon Extensions */
20748   IX86_BUILTIN_PF2IW,
20749   IX86_BUILTIN_PFNACC,
20750   IX86_BUILTIN_PFPNACC,
20751   IX86_BUILTIN_PI2FW,
20752   IX86_BUILTIN_PSWAPDSI,
20753   IX86_BUILTIN_PSWAPDSF,
20754 
20755   /* SSE2 */
20756   IX86_BUILTIN_ADDPD,
20757   IX86_BUILTIN_ADDSD,
20758   IX86_BUILTIN_DIVPD,
20759   IX86_BUILTIN_DIVSD,
20760   IX86_BUILTIN_MULPD,
20761   IX86_BUILTIN_MULSD,
20762   IX86_BUILTIN_SUBPD,
20763   IX86_BUILTIN_SUBSD,
20764 
20765   IX86_BUILTIN_CMPEQPD,
20766   IX86_BUILTIN_CMPLTPD,
20767   IX86_BUILTIN_CMPLEPD,
20768   IX86_BUILTIN_CMPGTPD,
20769   IX86_BUILTIN_CMPGEPD,
20770   IX86_BUILTIN_CMPNEQPD,
20771   IX86_BUILTIN_CMPNLTPD,
20772   IX86_BUILTIN_CMPNLEPD,
20773   IX86_BUILTIN_CMPNGTPD,
20774   IX86_BUILTIN_CMPNGEPD,
20775   IX86_BUILTIN_CMPORDPD,
20776   IX86_BUILTIN_CMPUNORDPD,
20777   IX86_BUILTIN_CMPEQSD,
20778   IX86_BUILTIN_CMPLTSD,
20779   IX86_BUILTIN_CMPLESD,
20780   IX86_BUILTIN_CMPNEQSD,
20781   IX86_BUILTIN_CMPNLTSD,
20782   IX86_BUILTIN_CMPNLESD,
20783   IX86_BUILTIN_CMPORDSD,
20784   IX86_BUILTIN_CMPUNORDSD,
20785 
20786   IX86_BUILTIN_COMIEQSD,
20787   IX86_BUILTIN_COMILTSD,
20788   IX86_BUILTIN_COMILESD,
20789   IX86_BUILTIN_COMIGTSD,
20790   IX86_BUILTIN_COMIGESD,
20791   IX86_BUILTIN_COMINEQSD,
20792   IX86_BUILTIN_UCOMIEQSD,
20793   IX86_BUILTIN_UCOMILTSD,
20794   IX86_BUILTIN_UCOMILESD,
20795   IX86_BUILTIN_UCOMIGTSD,
20796   IX86_BUILTIN_UCOMIGESD,
20797   IX86_BUILTIN_UCOMINEQSD,
20798 
20799   IX86_BUILTIN_MAXPD,
20800   IX86_BUILTIN_MAXSD,
20801   IX86_BUILTIN_MINPD,
20802   IX86_BUILTIN_MINSD,
20803 
20804   IX86_BUILTIN_ANDPD,
20805   IX86_BUILTIN_ANDNPD,
20806   IX86_BUILTIN_ORPD,
20807   IX86_BUILTIN_XORPD,
20808 
20809   IX86_BUILTIN_SQRTPD,
20810   IX86_BUILTIN_SQRTSD,
20811 
20812   IX86_BUILTIN_UNPCKHPD,
20813   IX86_BUILTIN_UNPCKLPD,
20814 
20815   IX86_BUILTIN_SHUFPD,
20816 
20817   IX86_BUILTIN_LOADUPD,
20818   IX86_BUILTIN_STOREUPD,
20819   IX86_BUILTIN_MOVSD,
20820 
20821   IX86_BUILTIN_LOADHPD,
20822   IX86_BUILTIN_LOADLPD,
20823 
20824   IX86_BUILTIN_CVTDQ2PD,
20825   IX86_BUILTIN_CVTDQ2PS,
20826 
20827   IX86_BUILTIN_CVTPD2DQ,
20828   IX86_BUILTIN_CVTPD2PI,
20829   IX86_BUILTIN_CVTPD2PS,
20830   IX86_BUILTIN_CVTTPD2DQ,
20831   IX86_BUILTIN_CVTTPD2PI,
20832 
20833   IX86_BUILTIN_CVTPI2PD,
20834   IX86_BUILTIN_CVTSI2SD,
20835   IX86_BUILTIN_CVTSI642SD,
20836 
20837   IX86_BUILTIN_CVTSD2SI,
20838   IX86_BUILTIN_CVTSD2SI64,
20839   IX86_BUILTIN_CVTSD2SS,
20840   IX86_BUILTIN_CVTSS2SD,
20841   IX86_BUILTIN_CVTTSD2SI,
20842   IX86_BUILTIN_CVTTSD2SI64,
20843 
20844   IX86_BUILTIN_CVTPS2DQ,
20845   IX86_BUILTIN_CVTPS2PD,
20846   IX86_BUILTIN_CVTTPS2DQ,
20847 
20848   IX86_BUILTIN_MOVNTI,
20849   IX86_BUILTIN_MOVNTPD,
20850   IX86_BUILTIN_MOVNTDQ,
20851 
20852   IX86_BUILTIN_MOVQ128,
20853 
20854   /* SSE2 MMX */
20855   IX86_BUILTIN_MASKMOVDQU,
20856   IX86_BUILTIN_MOVMSKPD,
20857   IX86_BUILTIN_PMOVMSKB128,
20858 
20859   IX86_BUILTIN_PACKSSWB128,
20860   IX86_BUILTIN_PACKSSDW128,
20861   IX86_BUILTIN_PACKUSWB128,
20862 
20863   IX86_BUILTIN_PADDB128,
20864   IX86_BUILTIN_PADDW128,
20865   IX86_BUILTIN_PADDD128,
20866   IX86_BUILTIN_PADDQ128,
20867   IX86_BUILTIN_PADDSB128,
20868   IX86_BUILTIN_PADDSW128,
20869   IX86_BUILTIN_PADDUSB128,
20870   IX86_BUILTIN_PADDUSW128,
20871   IX86_BUILTIN_PSUBB128,
20872   IX86_BUILTIN_PSUBW128,
20873   IX86_BUILTIN_PSUBD128,
20874   IX86_BUILTIN_PSUBQ128,
20875   IX86_BUILTIN_PSUBSB128,
20876   IX86_BUILTIN_PSUBSW128,
20877   IX86_BUILTIN_PSUBUSB128,
20878   IX86_BUILTIN_PSUBUSW128,
20879 
20880   IX86_BUILTIN_PAND128,
20881   IX86_BUILTIN_PANDN128,
20882   IX86_BUILTIN_POR128,
20883   IX86_BUILTIN_PXOR128,
20884 
20885   IX86_BUILTIN_PAVGB128,
20886   IX86_BUILTIN_PAVGW128,
20887 
20888   IX86_BUILTIN_PCMPEQB128,
20889   IX86_BUILTIN_PCMPEQW128,
20890   IX86_BUILTIN_PCMPEQD128,
20891   IX86_BUILTIN_PCMPGTB128,
20892   IX86_BUILTIN_PCMPGTW128,
20893   IX86_BUILTIN_PCMPGTD128,
20894 
20895   IX86_BUILTIN_PMADDWD128,
20896 
20897   IX86_BUILTIN_PMAXSW128,
20898   IX86_BUILTIN_PMAXUB128,
20899   IX86_BUILTIN_PMINSW128,
20900   IX86_BUILTIN_PMINUB128,
20901 
20902   IX86_BUILTIN_PMULUDQ,
20903   IX86_BUILTIN_PMULUDQ128,
20904   IX86_BUILTIN_PMULHUW128,
20905   IX86_BUILTIN_PMULHW128,
20906   IX86_BUILTIN_PMULLW128,
20907 
20908   IX86_BUILTIN_PSADBW128,
20909   IX86_BUILTIN_PSHUFHW,
20910   IX86_BUILTIN_PSHUFLW,
20911   IX86_BUILTIN_PSHUFD,
20912 
20913   IX86_BUILTIN_PSLLDQI128,
20914   IX86_BUILTIN_PSLLWI128,
20915   IX86_BUILTIN_PSLLDI128,
20916   IX86_BUILTIN_PSLLQI128,
20917   IX86_BUILTIN_PSRAWI128,
20918   IX86_BUILTIN_PSRADI128,
20919   IX86_BUILTIN_PSRLDQI128,
20920   IX86_BUILTIN_PSRLWI128,
20921   IX86_BUILTIN_PSRLDI128,
20922   IX86_BUILTIN_PSRLQI128,
20923 
20924   IX86_BUILTIN_PSLLDQ128,
20925   IX86_BUILTIN_PSLLW128,
20926   IX86_BUILTIN_PSLLD128,
20927   IX86_BUILTIN_PSLLQ128,
20928   IX86_BUILTIN_PSRAW128,
20929   IX86_BUILTIN_PSRAD128,
20930   IX86_BUILTIN_PSRLW128,
20931   IX86_BUILTIN_PSRLD128,
20932   IX86_BUILTIN_PSRLQ128,
20933 
20934   IX86_BUILTIN_PUNPCKHBW128,
20935   IX86_BUILTIN_PUNPCKHWD128,
20936   IX86_BUILTIN_PUNPCKHDQ128,
20937   IX86_BUILTIN_PUNPCKHQDQ128,
20938   IX86_BUILTIN_PUNPCKLBW128,
20939   IX86_BUILTIN_PUNPCKLWD128,
20940   IX86_BUILTIN_PUNPCKLDQ128,
20941   IX86_BUILTIN_PUNPCKLQDQ128,
20942 
20943   IX86_BUILTIN_CLFLUSH,
20944   IX86_BUILTIN_MFENCE,
20945   IX86_BUILTIN_LFENCE,
20946 
20947   IX86_BUILTIN_BSRSI,
20948   IX86_BUILTIN_BSRDI,
20949   IX86_BUILTIN_RDPMC,
20950   IX86_BUILTIN_RDTSC,
20951   IX86_BUILTIN_RDTSCP,
20952   IX86_BUILTIN_ROLQI,
20953   IX86_BUILTIN_ROLHI,
20954   IX86_BUILTIN_RORQI,
20955   IX86_BUILTIN_RORHI,
20956 
20957   /* SSE3.  */
20958   IX86_BUILTIN_ADDSUBPS,
20959   IX86_BUILTIN_HADDPS,
20960   IX86_BUILTIN_HSUBPS,
20961   IX86_BUILTIN_MOVSHDUP,
20962   IX86_BUILTIN_MOVSLDUP,
20963   IX86_BUILTIN_ADDSUBPD,
20964   IX86_BUILTIN_HADDPD,
20965   IX86_BUILTIN_HSUBPD,
20966   IX86_BUILTIN_LDDQU,
20967 
20968   IX86_BUILTIN_MONITOR,
20969   IX86_BUILTIN_MWAIT,
20970 
20971   /* SSSE3.  */
20972   IX86_BUILTIN_PHADDW,
20973   IX86_BUILTIN_PHADDD,
20974   IX86_BUILTIN_PHADDSW,
20975   IX86_BUILTIN_PHSUBW,
20976   IX86_BUILTIN_PHSUBD,
20977   IX86_BUILTIN_PHSUBSW,
20978   IX86_BUILTIN_PMADDUBSW,
20979   IX86_BUILTIN_PMULHRSW,
20980   IX86_BUILTIN_PSHUFB,
20981   IX86_BUILTIN_PSIGNB,
20982   IX86_BUILTIN_PSIGNW,
20983   IX86_BUILTIN_PSIGND,
20984   IX86_BUILTIN_PALIGNR,
20985   IX86_BUILTIN_PABSB,
20986   IX86_BUILTIN_PABSW,
20987   IX86_BUILTIN_PABSD,
20988 
20989   IX86_BUILTIN_PHADDW128,
20990   IX86_BUILTIN_PHADDD128,
20991   IX86_BUILTIN_PHADDSW128,
20992   IX86_BUILTIN_PHSUBW128,
20993   IX86_BUILTIN_PHSUBD128,
20994   IX86_BUILTIN_PHSUBSW128,
20995   IX86_BUILTIN_PMADDUBSW128,
20996   IX86_BUILTIN_PMULHRSW128,
20997   IX86_BUILTIN_PSHUFB128,
20998   IX86_BUILTIN_PSIGNB128,
20999   IX86_BUILTIN_PSIGNW128,
21000   IX86_BUILTIN_PSIGND128,
21001   IX86_BUILTIN_PALIGNR128,
21002   IX86_BUILTIN_PABSB128,
21003   IX86_BUILTIN_PABSW128,
21004   IX86_BUILTIN_PABSD128,
21005 
21006   /* AMDFAM10 - SSE4A New Instructions.  */
21007   IX86_BUILTIN_MOVNTSD,
21008   IX86_BUILTIN_MOVNTSS,
21009   IX86_BUILTIN_EXTRQI,
21010   IX86_BUILTIN_EXTRQ,
21011   IX86_BUILTIN_INSERTQI,
21012   IX86_BUILTIN_INSERTQ,
21013 
21014   /* SSE4.1.  */
21015   IX86_BUILTIN_BLENDPD,
21016   IX86_BUILTIN_BLENDPS,
21017   IX86_BUILTIN_BLENDVPD,
21018   IX86_BUILTIN_BLENDVPS,
21019   IX86_BUILTIN_PBLENDVB128,
21020   IX86_BUILTIN_PBLENDW128,
21021 
21022   IX86_BUILTIN_DPPD,
21023   IX86_BUILTIN_DPPS,
21024 
21025   IX86_BUILTIN_INSERTPS128,
21026 
21027   IX86_BUILTIN_MOVNTDQA,
21028   IX86_BUILTIN_MPSADBW128,
21029   IX86_BUILTIN_PACKUSDW128,
21030   IX86_BUILTIN_PCMPEQQ,
21031   IX86_BUILTIN_PHMINPOSUW128,
21032 
21033   IX86_BUILTIN_PMAXSB128,
21034   IX86_BUILTIN_PMAXSD128,
21035   IX86_BUILTIN_PMAXUD128,
21036   IX86_BUILTIN_PMAXUW128,
21037 
21038   IX86_BUILTIN_PMINSB128,
21039   IX86_BUILTIN_PMINSD128,
21040   IX86_BUILTIN_PMINUD128,
21041   IX86_BUILTIN_PMINUW128,
21042 
21043   IX86_BUILTIN_PMOVSXBW128,
21044   IX86_BUILTIN_PMOVSXBD128,
21045   IX86_BUILTIN_PMOVSXBQ128,
21046   IX86_BUILTIN_PMOVSXWD128,
21047   IX86_BUILTIN_PMOVSXWQ128,
21048   IX86_BUILTIN_PMOVSXDQ128,
21049 
21050   IX86_BUILTIN_PMOVZXBW128,
21051   IX86_BUILTIN_PMOVZXBD128,
21052   IX86_BUILTIN_PMOVZXBQ128,
21053   IX86_BUILTIN_PMOVZXWD128,
21054   IX86_BUILTIN_PMOVZXWQ128,
21055   IX86_BUILTIN_PMOVZXDQ128,
21056 
21057   IX86_BUILTIN_PMULDQ128,
21058   IX86_BUILTIN_PMULLD128,
21059 
21060   IX86_BUILTIN_ROUNDPD,
21061   IX86_BUILTIN_ROUNDPS,
21062   IX86_BUILTIN_ROUNDSD,
21063   IX86_BUILTIN_ROUNDSS,
21064 
21065   IX86_BUILTIN_PTESTZ,
21066   IX86_BUILTIN_PTESTC,
21067   IX86_BUILTIN_PTESTNZC,
21068 
21069   IX86_BUILTIN_VEC_INIT_V2SI,
21070   IX86_BUILTIN_VEC_INIT_V4HI,
21071   IX86_BUILTIN_VEC_INIT_V8QI,
21072   IX86_BUILTIN_VEC_EXT_V2DF,
21073   IX86_BUILTIN_VEC_EXT_V2DI,
21074   IX86_BUILTIN_VEC_EXT_V4SF,
21075   IX86_BUILTIN_VEC_EXT_V4SI,
21076   IX86_BUILTIN_VEC_EXT_V8HI,
21077   IX86_BUILTIN_VEC_EXT_V2SI,
21078   IX86_BUILTIN_VEC_EXT_V4HI,
21079   IX86_BUILTIN_VEC_EXT_V16QI,
21080   IX86_BUILTIN_VEC_SET_V2DI,
21081   IX86_BUILTIN_VEC_SET_V4SF,
21082   IX86_BUILTIN_VEC_SET_V4SI,
21083   IX86_BUILTIN_VEC_SET_V8HI,
21084   IX86_BUILTIN_VEC_SET_V4HI,
21085   IX86_BUILTIN_VEC_SET_V16QI,
21086 
21087   IX86_BUILTIN_VEC_PACK_SFIX,
21088 
21089   /* SSE4.2.  */
21090   IX86_BUILTIN_CRC32QI,
21091   IX86_BUILTIN_CRC32HI,
21092   IX86_BUILTIN_CRC32SI,
21093   IX86_BUILTIN_CRC32DI,
21094 
21095   IX86_BUILTIN_PCMPESTRI128,
21096   IX86_BUILTIN_PCMPESTRM128,
21097   IX86_BUILTIN_PCMPESTRA128,
21098   IX86_BUILTIN_PCMPESTRC128,
21099   IX86_BUILTIN_PCMPESTRO128,
21100   IX86_BUILTIN_PCMPESTRS128,
21101   IX86_BUILTIN_PCMPESTRZ128,
21102   IX86_BUILTIN_PCMPISTRI128,
21103   IX86_BUILTIN_PCMPISTRM128,
21104   IX86_BUILTIN_PCMPISTRA128,
21105   IX86_BUILTIN_PCMPISTRC128,
21106   IX86_BUILTIN_PCMPISTRO128,
21107   IX86_BUILTIN_PCMPISTRS128,
21108   IX86_BUILTIN_PCMPISTRZ128,
21109 
21110   IX86_BUILTIN_PCMPGTQ,
21111 
21112   /* AES instructions */
21113   IX86_BUILTIN_AESENC128,
21114   IX86_BUILTIN_AESENCLAST128,
21115   IX86_BUILTIN_AESDEC128,
21116   IX86_BUILTIN_AESDECLAST128,
21117   IX86_BUILTIN_AESIMC128,
21118   IX86_BUILTIN_AESKEYGENASSIST128,
21119 
21120   /* PCLMUL instruction */
21121   IX86_BUILTIN_PCLMULQDQ128,
21122 
21123   /* AVX */
21124   IX86_BUILTIN_ADDPD256,
21125   IX86_BUILTIN_ADDPS256,
21126   IX86_BUILTIN_ADDSUBPD256,
21127   IX86_BUILTIN_ADDSUBPS256,
21128   IX86_BUILTIN_ANDPD256,
21129   IX86_BUILTIN_ANDPS256,
21130   IX86_BUILTIN_ANDNPD256,
21131   IX86_BUILTIN_ANDNPS256,
21132   IX86_BUILTIN_BLENDPD256,
21133   IX86_BUILTIN_BLENDPS256,
21134   IX86_BUILTIN_BLENDVPD256,
21135   IX86_BUILTIN_BLENDVPS256,
21136   IX86_BUILTIN_DIVPD256,
21137   IX86_BUILTIN_DIVPS256,
21138   IX86_BUILTIN_DPPS256,
21139   IX86_BUILTIN_HADDPD256,
21140   IX86_BUILTIN_HADDPS256,
21141   IX86_BUILTIN_HSUBPD256,
21142   IX86_BUILTIN_HSUBPS256,
21143   IX86_BUILTIN_MAXPD256,
21144   IX86_BUILTIN_MAXPS256,
21145   IX86_BUILTIN_MINPD256,
21146   IX86_BUILTIN_MINPS256,
21147   IX86_BUILTIN_MULPD256,
21148   IX86_BUILTIN_MULPS256,
21149   IX86_BUILTIN_ORPD256,
21150   IX86_BUILTIN_ORPS256,
21151   IX86_BUILTIN_SHUFPD256,
21152   IX86_BUILTIN_SHUFPS256,
21153   IX86_BUILTIN_SUBPD256,
21154   IX86_BUILTIN_SUBPS256,
21155   IX86_BUILTIN_XORPD256,
21156   IX86_BUILTIN_XORPS256,
21157   IX86_BUILTIN_CMPSD,
21158   IX86_BUILTIN_CMPSS,
21159   IX86_BUILTIN_CMPPD,
21160   IX86_BUILTIN_CMPPS,
21161   IX86_BUILTIN_CMPPD256,
21162   IX86_BUILTIN_CMPPS256,
21163   IX86_BUILTIN_CVTDQ2PD256,
21164   IX86_BUILTIN_CVTDQ2PS256,
21165   IX86_BUILTIN_CVTPD2PS256,
21166   IX86_BUILTIN_CVTPS2DQ256,
21167   IX86_BUILTIN_CVTPS2PD256,
21168   IX86_BUILTIN_CVTTPD2DQ256,
21169   IX86_BUILTIN_CVTPD2DQ256,
21170   IX86_BUILTIN_CVTTPS2DQ256,
21171   IX86_BUILTIN_EXTRACTF128PD256,
21172   IX86_BUILTIN_EXTRACTF128PS256,
21173   IX86_BUILTIN_EXTRACTF128SI256,
21174   IX86_BUILTIN_VZEROALL,
21175   IX86_BUILTIN_VZEROUPPER,
21176   IX86_BUILTIN_VPERMILVARPD,
21177   IX86_BUILTIN_VPERMILVARPS,
21178   IX86_BUILTIN_VPERMILVARPD256,
21179   IX86_BUILTIN_VPERMILVARPS256,
21180   IX86_BUILTIN_VPERMILPD,
21181   IX86_BUILTIN_VPERMILPS,
21182   IX86_BUILTIN_VPERMILPD256,
21183   IX86_BUILTIN_VPERMILPS256,
21184   IX86_BUILTIN_VPERMIL2PD,
21185   IX86_BUILTIN_VPERMIL2PS,
21186   IX86_BUILTIN_VPERMIL2PD256,
21187   IX86_BUILTIN_VPERMIL2PS256,
21188   IX86_BUILTIN_VPERM2F128PD256,
21189   IX86_BUILTIN_VPERM2F128PS256,
21190   IX86_BUILTIN_VPERM2F128SI256,
21191   IX86_BUILTIN_VBROADCASTSS,
21192   IX86_BUILTIN_VBROADCASTSD256,
21193   IX86_BUILTIN_VBROADCASTSS256,
21194   IX86_BUILTIN_VBROADCASTPD256,
21195   IX86_BUILTIN_VBROADCASTPS256,
21196   IX86_BUILTIN_VINSERTF128PD256,
21197   IX86_BUILTIN_VINSERTF128PS256,
21198   IX86_BUILTIN_VINSERTF128SI256,
21199   IX86_BUILTIN_LOADUPD256,
21200   IX86_BUILTIN_LOADUPS256,
21201   IX86_BUILTIN_STOREUPD256,
21202   IX86_BUILTIN_STOREUPS256,
21203   IX86_BUILTIN_LDDQU256,
21204   IX86_BUILTIN_MOVNTDQ256,
21205   IX86_BUILTIN_MOVNTPD256,
21206   IX86_BUILTIN_MOVNTPS256,
21207   IX86_BUILTIN_LOADDQU256,
21208   IX86_BUILTIN_STOREDQU256,
21209   IX86_BUILTIN_MASKLOADPD,
21210   IX86_BUILTIN_MASKLOADPS,
21211   IX86_BUILTIN_MASKSTOREPD,
21212   IX86_BUILTIN_MASKSTOREPS,
21213   IX86_BUILTIN_MASKLOADPD256,
21214   IX86_BUILTIN_MASKLOADPS256,
21215   IX86_BUILTIN_MASKSTOREPD256,
21216   IX86_BUILTIN_MASKSTOREPS256,
21217   IX86_BUILTIN_MOVSHDUP256,
21218   IX86_BUILTIN_MOVSLDUP256,
21219   IX86_BUILTIN_MOVDDUP256,
21220 
21221   IX86_BUILTIN_SQRTPD256,
21222   IX86_BUILTIN_SQRTPS256,
21223   IX86_BUILTIN_SQRTPS_NR256,
21224   IX86_BUILTIN_RSQRTPS256,
21225   IX86_BUILTIN_RSQRTPS_NR256,
21226 
21227   IX86_BUILTIN_RCPPS256,
21228 
21229   IX86_BUILTIN_ROUNDPD256,
21230   IX86_BUILTIN_ROUNDPS256,
21231 
21232   IX86_BUILTIN_UNPCKHPD256,
21233   IX86_BUILTIN_UNPCKLPD256,
21234   IX86_BUILTIN_UNPCKHPS256,
21235   IX86_BUILTIN_UNPCKLPS256,
21236 
21237   IX86_BUILTIN_SI256_SI,
21238   IX86_BUILTIN_PS256_PS,
21239   IX86_BUILTIN_PD256_PD,
21240   IX86_BUILTIN_SI_SI256,
21241   IX86_BUILTIN_PS_PS256,
21242   IX86_BUILTIN_PD_PD256,
21243 
21244   IX86_BUILTIN_VTESTZPD,
21245   IX86_BUILTIN_VTESTCPD,
21246   IX86_BUILTIN_VTESTNZCPD,
21247   IX86_BUILTIN_VTESTZPS,
21248   IX86_BUILTIN_VTESTCPS,
21249   IX86_BUILTIN_VTESTNZCPS,
21250   IX86_BUILTIN_VTESTZPD256,
21251   IX86_BUILTIN_VTESTCPD256,
21252   IX86_BUILTIN_VTESTNZCPD256,
21253   IX86_BUILTIN_VTESTZPS256,
21254   IX86_BUILTIN_VTESTCPS256,
21255   IX86_BUILTIN_VTESTNZCPS256,
21256   IX86_BUILTIN_PTESTZ256,
21257   IX86_BUILTIN_PTESTC256,
21258   IX86_BUILTIN_PTESTNZC256,
21259 
21260   IX86_BUILTIN_MOVMSKPD256,
21261   IX86_BUILTIN_MOVMSKPS256,
21262 
21263   /* TFmode support builtins.  */
21264   IX86_BUILTIN_INFQ,
21265   IX86_BUILTIN_HUGE_VALQ,
21266   IX86_BUILTIN_FABSQ,
21267   IX86_BUILTIN_COPYSIGNQ,
21268 
21269   /* Vectorizer support builtins.  */
21270   IX86_BUILTIN_CPYSGNPS,
21271   IX86_BUILTIN_CPYSGNPD,
21272 
21273   IX86_BUILTIN_CVTUDQ2PS,
21274 
21275   IX86_BUILTIN_VEC_PERM_V2DF,
21276   IX86_BUILTIN_VEC_PERM_V4SF,
21277   IX86_BUILTIN_VEC_PERM_V2DI,
21278   IX86_BUILTIN_VEC_PERM_V4SI,
21279   IX86_BUILTIN_VEC_PERM_V8HI,
21280   IX86_BUILTIN_VEC_PERM_V16QI,
21281   IX86_BUILTIN_VEC_PERM_V2DI_U,
21282   IX86_BUILTIN_VEC_PERM_V4SI_U,
21283   IX86_BUILTIN_VEC_PERM_V8HI_U,
21284   IX86_BUILTIN_VEC_PERM_V16QI_U,
21285   IX86_BUILTIN_VEC_PERM_V4DF,
21286   IX86_BUILTIN_VEC_PERM_V8SF,
21287 
21288   /* FMA4 and XOP instructions.  */
21289   IX86_BUILTIN_VFMADDSS,
21290   IX86_BUILTIN_VFMADDSD,
21291   IX86_BUILTIN_VFMADDPS,
21292   IX86_BUILTIN_VFMADDPD,
21293   IX86_BUILTIN_VFMSUBSS,
21294   IX86_BUILTIN_VFMSUBSD,
21295   IX86_BUILTIN_VFMSUBPS,
21296   IX86_BUILTIN_VFMSUBPD,
21297   IX86_BUILTIN_VFMADDSUBPS,
21298   IX86_BUILTIN_VFMADDSUBPD,
21299   IX86_BUILTIN_VFMSUBADDPS,
21300   IX86_BUILTIN_VFMSUBADDPD,
21301   IX86_BUILTIN_VFNMADDSS,
21302   IX86_BUILTIN_VFNMADDSD,
21303   IX86_BUILTIN_VFNMADDPS,
21304   IX86_BUILTIN_VFNMADDPD,
21305   IX86_BUILTIN_VFNMSUBSS,
21306   IX86_BUILTIN_VFNMSUBSD,
21307   IX86_BUILTIN_VFNMSUBPS,
21308   IX86_BUILTIN_VFNMSUBPD,
21309   IX86_BUILTIN_VFMADDPS256,
21310   IX86_BUILTIN_VFMADDPD256,
21311   IX86_BUILTIN_VFMSUBPS256,
21312   IX86_BUILTIN_VFMSUBPD256,
21313   IX86_BUILTIN_VFMADDSUBPS256,
21314   IX86_BUILTIN_VFMADDSUBPD256,
21315   IX86_BUILTIN_VFMSUBADDPS256,
21316   IX86_BUILTIN_VFMSUBADDPD256,
21317   IX86_BUILTIN_VFNMADDPS256,
21318   IX86_BUILTIN_VFNMADDPD256,
21319   IX86_BUILTIN_VFNMSUBPS256,
21320   IX86_BUILTIN_VFNMSUBPD256,
21321 
21322   IX86_BUILTIN_VPCMOV,
21323   IX86_BUILTIN_VPCMOV_V2DI,
21324   IX86_BUILTIN_VPCMOV_V4SI,
21325   IX86_BUILTIN_VPCMOV_V8HI,
21326   IX86_BUILTIN_VPCMOV_V16QI,
21327   IX86_BUILTIN_VPCMOV_V4SF,
21328   IX86_BUILTIN_VPCMOV_V2DF,
21329   IX86_BUILTIN_VPCMOV256,
21330   IX86_BUILTIN_VPCMOV_V4DI256,
21331   IX86_BUILTIN_VPCMOV_V8SI256,
21332   IX86_BUILTIN_VPCMOV_V16HI256,
21333   IX86_BUILTIN_VPCMOV_V32QI256,
21334   IX86_BUILTIN_VPCMOV_V8SF256,
21335   IX86_BUILTIN_VPCMOV_V4DF256,
21336 
21337   IX86_BUILTIN_VPPERM,
21338 
21339   IX86_BUILTIN_VPMACSSWW,
21340   IX86_BUILTIN_VPMACSWW,
21341   IX86_BUILTIN_VPMACSSWD,
21342   IX86_BUILTIN_VPMACSWD,
21343   IX86_BUILTIN_VPMACSSDD,
21344   IX86_BUILTIN_VPMACSDD,
21345   IX86_BUILTIN_VPMACSSDQL,
21346   IX86_BUILTIN_VPMACSSDQH,
21347   IX86_BUILTIN_VPMACSDQL,
21348   IX86_BUILTIN_VPMACSDQH,
21349   IX86_BUILTIN_VPMADCSSWD,
21350   IX86_BUILTIN_VPMADCSWD,
21351 
21352   IX86_BUILTIN_VPHADDBW,
21353   IX86_BUILTIN_VPHADDBD,
21354   IX86_BUILTIN_VPHADDBQ,
21355   IX86_BUILTIN_VPHADDWD,
21356   IX86_BUILTIN_VPHADDWQ,
21357   IX86_BUILTIN_VPHADDDQ,
21358   IX86_BUILTIN_VPHADDUBW,
21359   IX86_BUILTIN_VPHADDUBD,
21360   IX86_BUILTIN_VPHADDUBQ,
21361   IX86_BUILTIN_VPHADDUWD,
21362   IX86_BUILTIN_VPHADDUWQ,
21363   IX86_BUILTIN_VPHADDUDQ,
21364   IX86_BUILTIN_VPHSUBBW,
21365   IX86_BUILTIN_VPHSUBWD,
21366   IX86_BUILTIN_VPHSUBDQ,
21367 
21368   IX86_BUILTIN_VPROTB,
21369   IX86_BUILTIN_VPROTW,
21370   IX86_BUILTIN_VPROTD,
21371   IX86_BUILTIN_VPROTQ,
21372   IX86_BUILTIN_VPROTB_IMM,
21373   IX86_BUILTIN_VPROTW_IMM,
21374   IX86_BUILTIN_VPROTD_IMM,
21375   IX86_BUILTIN_VPROTQ_IMM,
21376 
21377   IX86_BUILTIN_VPSHLB,
21378   IX86_BUILTIN_VPSHLW,
21379   IX86_BUILTIN_VPSHLD,
21380   IX86_BUILTIN_VPSHLQ,
21381   IX86_BUILTIN_VPSHAB,
21382   IX86_BUILTIN_VPSHAW,
21383   IX86_BUILTIN_VPSHAD,
21384   IX86_BUILTIN_VPSHAQ,
21385 
21386   IX86_BUILTIN_VFRCZSS,
21387   IX86_BUILTIN_VFRCZSD,
21388   IX86_BUILTIN_VFRCZPS,
21389   IX86_BUILTIN_VFRCZPD,
21390   IX86_BUILTIN_VFRCZPS256,
21391   IX86_BUILTIN_VFRCZPD256,
21392 
21393   IX86_BUILTIN_VPCOMEQUB,
21394   IX86_BUILTIN_VPCOMNEUB,
21395   IX86_BUILTIN_VPCOMLTUB,
21396   IX86_BUILTIN_VPCOMLEUB,
21397   IX86_BUILTIN_VPCOMGTUB,
21398   IX86_BUILTIN_VPCOMGEUB,
21399   IX86_BUILTIN_VPCOMFALSEUB,
21400   IX86_BUILTIN_VPCOMTRUEUB,
21401 
21402   IX86_BUILTIN_VPCOMEQUW,
21403   IX86_BUILTIN_VPCOMNEUW,
21404   IX86_BUILTIN_VPCOMLTUW,
21405   IX86_BUILTIN_VPCOMLEUW,
21406   IX86_BUILTIN_VPCOMGTUW,
21407   IX86_BUILTIN_VPCOMGEUW,
21408   IX86_BUILTIN_VPCOMFALSEUW,
21409   IX86_BUILTIN_VPCOMTRUEUW,
21410 
21411   IX86_BUILTIN_VPCOMEQUD,
21412   IX86_BUILTIN_VPCOMNEUD,
21413   IX86_BUILTIN_VPCOMLTUD,
21414   IX86_BUILTIN_VPCOMLEUD,
21415   IX86_BUILTIN_VPCOMGTUD,
21416   IX86_BUILTIN_VPCOMGEUD,
21417   IX86_BUILTIN_VPCOMFALSEUD,
21418   IX86_BUILTIN_VPCOMTRUEUD,
21419 
21420   IX86_BUILTIN_VPCOMEQUQ,
21421   IX86_BUILTIN_VPCOMNEUQ,
21422   IX86_BUILTIN_VPCOMLTUQ,
21423   IX86_BUILTIN_VPCOMLEUQ,
21424   IX86_BUILTIN_VPCOMGTUQ,
21425   IX86_BUILTIN_VPCOMGEUQ,
21426   IX86_BUILTIN_VPCOMFALSEUQ,
21427   IX86_BUILTIN_VPCOMTRUEUQ,
21428 
21429   IX86_BUILTIN_VPCOMEQB,
21430   IX86_BUILTIN_VPCOMNEB,
21431   IX86_BUILTIN_VPCOMLTB,
21432   IX86_BUILTIN_VPCOMLEB,
21433   IX86_BUILTIN_VPCOMGTB,
21434   IX86_BUILTIN_VPCOMGEB,
21435   IX86_BUILTIN_VPCOMFALSEB,
21436   IX86_BUILTIN_VPCOMTRUEB,
21437 
21438   IX86_BUILTIN_VPCOMEQW,
21439   IX86_BUILTIN_VPCOMNEW,
21440   IX86_BUILTIN_VPCOMLTW,
21441   IX86_BUILTIN_VPCOMLEW,
21442   IX86_BUILTIN_VPCOMGTW,
21443   IX86_BUILTIN_VPCOMGEW,
21444   IX86_BUILTIN_VPCOMFALSEW,
21445   IX86_BUILTIN_VPCOMTRUEW,
21446 
21447   IX86_BUILTIN_VPCOMEQD,
21448   IX86_BUILTIN_VPCOMNED,
21449   IX86_BUILTIN_VPCOMLTD,
21450   IX86_BUILTIN_VPCOMLED,
21451   IX86_BUILTIN_VPCOMGTD,
21452   IX86_BUILTIN_VPCOMGED,
21453   IX86_BUILTIN_VPCOMFALSED,
21454   IX86_BUILTIN_VPCOMTRUED,
21455 
21456   IX86_BUILTIN_VPCOMEQQ,
21457   IX86_BUILTIN_VPCOMNEQ,
21458   IX86_BUILTIN_VPCOMLTQ,
21459   IX86_BUILTIN_VPCOMLEQ,
21460   IX86_BUILTIN_VPCOMGTQ,
21461   IX86_BUILTIN_VPCOMGEQ,
21462   IX86_BUILTIN_VPCOMFALSEQ,
21463   IX86_BUILTIN_VPCOMTRUEQ,
21464 
21465   /* LWP instructions.  */
21466   IX86_BUILTIN_LLWPCB,
21467   IX86_BUILTIN_SLWPCB,
21468   IX86_BUILTIN_LWPVAL32,
21469   IX86_BUILTIN_LWPVAL64,
21470   IX86_BUILTIN_LWPINS32,
21471   IX86_BUILTIN_LWPINS64,
21472 
21473   IX86_BUILTIN_CLZS,
21474 
21475   IX86_BUILTIN_MAX
21476 };
21477 
21478 /* Table for the ix86 builtin decls.  */
21479 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
21480 
21481 /* Table of all of the builtin functions that are possible with different ISAs
21482    but are waiting to be built until a function is declared to use that
21483    ISA.  */
21484 struct builtin_isa {
21485   const char *name;		/* function name */
21486   enum ix86_builtin_func_type tcode; /* type to use in the declaration */
21487   int isa;			/* isa_flags this builtin is defined for */
21488   bool const_p;			/* true if the declaration is constant */
21489   bool set_and_not_built_p;	/* true if the builtin is deferred (decl not yet built) */
21490 };
21491 
21492 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
21493 
21494 
21495 /* Add an ix86 target builtin function with CODE, NAME and TCODE.  Save the MASK
21496    of which isa_flags to use in the ix86_builtins_isa array.  Store the
21497    function decl in the ix86_builtins array.  Return the function decl, or
21498    NULL_TREE if the builtin was not added.
21499 
21500    If the front end has a special hook for builtin functions, delay adding
21501    builtin functions that aren't in the current ISA until the ISA is changed
21502    with function-specific optimization.  Doing so can save about 300K for the
21503    default compiler.  When the builtin is expanded, check at that time whether
21504    it is valid.
21505 
21506    If the front end doesn't have a special hook, record all builtins, even
21507    those whose instruction set isn't in the current ISA, in case the user uses
21508    function-specific options for a different ISA; that way we don't get scope
21509    errors if a builtin is added in the middle of a function scope.  */
21510 
21511 static inline tree
21512 def_builtin (int mask, const char *name, enum ix86_builtin_func_type tcode,
21513 	     enum ix86_builtins code)
21514 {
21515   tree decl = NULL_TREE;
21516 
21517   if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
21518     {
21519       ix86_builtins_isa[(int) code].isa = mask;
21520 
21521       mask &= ~OPTION_MASK_ISA_64BIT;
21522       if (mask == 0
21523 	  || (mask & ix86_isa_flags) != 0
21524 	  || (lang_hooks.builtin_function
21525 	      == lang_hooks.builtin_function_ext_scope))
21526 
21527 	{
21528 	  tree type = ix86_get_builtin_func_type (tcode);
21529 	  decl = add_builtin_function (name, type, code, BUILT_IN_MD,
21530 				       NULL, NULL_TREE);
21531 	  ix86_builtins[(int) code] = decl;
21532 	  ix86_builtins_isa[(int) code].set_and_not_built_p = false;
21533 	}
21534       else
21535 	{
21536 	  ix86_builtins[(int) code] = NULL_TREE;
21537 	  ix86_builtins_isa[(int) code].tcode = tcode;
21538 	  ix86_builtins_isa[(int) code].name = name;
21539 	  ix86_builtins_isa[(int) code].const_p = false;
21540 	  ix86_builtins_isa[(int) code].set_and_not_built_p = true;
21541 	}
21542     }
21543 
21544   return decl;
21545 }
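
/* A worked illustration of the two paths above (a sketch, not part of the
   builtin machinery itself; every name used here appears elsewhere in this
   file).  With a front end that does not use builtin_function_ext_scope and
   with AVX not yet in ix86_isa_flags, a call such as

     def_builtin (OPTION_MASK_ISA_AVX, "__builtin_ia32_vzeroall",
		  VOID_FTYPE_VOID, IX86_BUILTIN_VZEROALL);

   returns NULL_TREE and merely records the name and type in
   ix86_builtins_isa with set_and_not_built_p set; ix86_add_new_builtins
   builds the decl later, once the AVX bit is enabled.  If AVX is already
   enabled (or the ext_scope hook is in use), the decl is created
   immediately via add_builtin_function.  */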
21546 
21547 /* Like def_builtin, but also marks the function decl "const".  */
21548 
21549 static inline tree
21550 def_builtin_const (int mask, const char *name,
21551 		   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
21552 {
21553   tree decl = def_builtin (mask, name, tcode, code);
21554   if (decl)
21555     TREE_READONLY (decl) = 1;
21556   else
21557     ix86_builtins_isa[(int) code].const_p = true;
21558 
21559   return decl;
21560 }
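
/* Note that when the decl was deferred above (def_builtin returned
   NULL_TREE), the const_p flag recorded here is applied later by
   ix86_add_new_builtins, which sets TREE_READONLY on the decl it
   eventually builds.  */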
21561 
21562 /* Add any new builtin functions for a given ISA that may not have been
21563    declared.  This saves a bit of space compared to adding all of the
21564    declarations to the tree up front, whether or not they are ever used.  */
21565 
21566 static void
21567 ix86_add_new_builtins (int isa)
21568 {
21569   int i;
21570 
21571   for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
21572     {
21573       if ((ix86_builtins_isa[i].isa & isa) != 0
21574 	  && ix86_builtins_isa[i].set_and_not_built_p)
21575 	{
21576 	  tree decl, type;
21577 
21578 	  /* Don't define the builtin again.  */
21579 	  ix86_builtins_isa[i].set_and_not_built_p = false;
21580 
21581 	  type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
21582 	  decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
21583 						 type, i, BUILT_IN_MD, NULL,
21584 						 NULL_TREE);
21585 
21586 	  ix86_builtins[i] = decl;
21587 	  if (ix86_builtins_isa[i].const_p)
21588 	    TREE_READONLY (decl) = 1;
21589 	}
21590     }
21591 }
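
/* Illustrative sketch of the effect, using names defined above: if
   IX86_BUILTIN_VZEROALL was deferred by def_builtin with
   isa = OPTION_MASK_ISA_AVX, then

     ix86_add_new_builtins (OPTION_MASK_ISA_AVX);

   finds the entry still marked set_and_not_built_p, builds the decl with
   add_builtin_function_ext_scope, stores it in ix86_builtins and marks it
   TREE_READONLY if const_p was recorded by def_builtin_const.  */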
21592 
21593 /* Bits for builtin_description.flag.  */
21594 
21595 /* Set when we don't support the comparison natively, and should
21596    swap the comparison operands in order to support it.  */
21597 #define BUILTIN_DESC_SWAP_OPERANDS	1
21598 
21599 struct builtin_description
21600 {
21601   const unsigned int mask;
21602   const enum insn_code icode;
21603   const char *const name;
21604   const enum ix86_builtins code;
21605   const enum rtx_code comparison;
21606   const int flag;
21607 };
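
/* A reading guide for the bdesc_* tables below, using the first bdesc_comi
   row as the example:

     { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq",
       IX86_BUILTIN_COMIEQSS, UNEQ, 0 }

   MASK is the ISA option bit the builtin requires, ICODE the insn pattern
   used to expand it, NAME the user-visible __builtin_ia32_* identifier,
   CODE the enum ix86_builtins value, and COMPARISON the rtx_code to use
   (UNKNOWN when none applies).  FLAG is table specific: BUILTIN_DESC_*
   bits in bdesc_comi, a CC mode in the pcmpestr/pcmpistr tables, and the
   (int)-cast ix86_builtin_func_type of the signature in the argument
   tables.  */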
21608 
21609 static const struct builtin_description bdesc_comi[] =
21610 {
21611   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
21612   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
21613   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
21614   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
21615   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
21616   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
21617   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
21618   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
21619   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
21620   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
21621   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
21622   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
21623   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
21624   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
21625   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
21626   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
21627   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
21628   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
21629   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
21630   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
21631   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
21632   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
21633   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
21634   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
21635 };
21636 
21637 static const struct builtin_description bdesc_pcmpestr[] =
21638 {
21639   /* SSE4.2 */
21640   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
21641   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
21642   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
21643   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
21644   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
21645   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
21646   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
21647 };
21648 
21649 static const struct builtin_description bdesc_pcmpistr[] =
21650 {
21651   /* SSE4.2 */
21652   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
21653   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
21654   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
21655   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
21656   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
21657   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
21658   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
21659 };
21660 
21661 /* Special builtins with variable number of arguments.  */
21662 static const struct builtin_description bdesc_special_args[] =
21663 {
21664   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
21665   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
21666 
21667   /* MMX */
21668   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
21669 
21670   /* 3DNow! */
21671   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
21672 
21673   /* SSE */
21674   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
21675   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
21676   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
21677 
21678   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
21679   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
21680   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
21681   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
21682 
21683   /* SSE or 3DNow!A  */
21684   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
21685   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
21686 
21687   /* SSE2 */
21688   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
21689   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
21690   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
21691   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
21692   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
21693   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
21694   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
21695   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
21696   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
21697 
21698   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
21699   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
21700 
21701   /* SSE3 */
21702   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
21703 
21704   /* SSE4.1 */
21705   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
21706 
21707   /* SSE4A */
21708   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
21709   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
21710 
21711   /* AVX */
21712   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
21713   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
21714 
21715   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
21716   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
21717   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
21718   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
21719   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
21720 
21721   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
21722   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
21723   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
21724   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
21725   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
21726   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
21727   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
21728 
21729   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
21730   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
21731   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
21732 
21733   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
21734   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
21735   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
21736   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
21737   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
21738   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
21739   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
21740   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
21741 
21742   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
21743   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
21744   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
21745   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
21746   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
21747   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
21748 
21749 };
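
/* Broadly, the "special" table above covers builtins whose expansion has a
   side effect or takes/returns a pointer (loads, stores, fences, the LWP
   operations), while bdesc_args below covers pure value-to-value
   operations; this is a descriptive observation about the entries, not a
   rule enforced by the table format.  */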
21750 
21751 /* Builtins with variable number of arguments.  */
21752 static const struct builtin_description bdesc_args[] =
21753 {
21754   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
21755   { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
21756   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
21757   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
21758   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
21759   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
21760   { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
21761 
21762   /* MMX */
21763   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21764   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21765   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21766   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21767   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21768   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21769 
21770   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21771   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21772   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21773   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21774   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21775   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21776   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21777   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21778 
21779   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21780   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21781 
21782   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21783   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21784   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21785   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21786 
21787   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21788   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21789   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21790   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21791   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21792   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21793 
21794   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21795   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21796   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21797   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21798   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21799   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
21800 
21801   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
21802   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
21803   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
21804 
21805   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
21806 
21807   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
21808   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
21809   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
21810   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
21811   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
21812   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
21813 
21814   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
21815   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
21816   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
21817   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
21818   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
21819   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
21820 
21821   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
21822   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
21823   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
21824   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
21825 
21826   /* 3DNow! */
21827   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
21828   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
21829   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
21830   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
21831 
21832   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21833   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21834   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21835   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
21836   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
21837   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
21838   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21839   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21840   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21841   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21842   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21843   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21844   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21845   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21846   { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21847 
21848   /* 3DNow!A */
21849   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
21850   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
21851   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
21852   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
21853   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21854   { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
21855 
21856   /* SSE */
21857   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
21858   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
21859   { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
21860   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
21861   { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
21862   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
21863   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
21864   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
21865   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
21866   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
21867   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
21868   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
21869 
21870   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
21871 
21872   { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21873   { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21874   { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21875   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21876   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3,  "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21877   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3,  "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21878   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3,  "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21879   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3,  "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21880 
21881   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
21882   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
21883   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
21884   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
21885   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
21886   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
21887   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
21888   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
21889   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
21890   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
21891   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
21892   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
21893   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
21894   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
21895   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
21896   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
21897   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
21898   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
21899   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
21900   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
21901   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
21902   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
21903 
21904   { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21905   { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21906   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21907   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21908 
21909   { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21910   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3,  "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21911   { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21912   { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3,  "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21913 
21914   { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3,  "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21915 
21916   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21917   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21918   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21919   { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21920   { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
21921 
21922   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
21923   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
21924   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
21925 
21926   { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
21927 
21928   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
21929   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
21930   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
21931 
21932   /* SSE MMX or 3DNow!A */
21933   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21934   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21935   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21936 
21937   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21938   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21939   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
21940   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
21941 
21942   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
21943   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
21944 
21945   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
21946 
21947   /* SSE2 */
21948   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
21949 
21950   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2df", IX86_BUILTIN_VEC_PERM_V2DF, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI },
21951   { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4sf", IX86_BUILTIN_VEC_PERM_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI },
21952   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di", IX86_BUILTIN_VEC_PERM_V2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI },
21953   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si", IX86_BUILTIN_VEC_PERM_V4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI },
21954   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi", IX86_BUILTIN_VEC_PERM_V8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI },
21955   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi", IX86_BUILTIN_VEC_PERM_V16QI, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
21956   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di_u", IX86_BUILTIN_VEC_PERM_V2DI_U, UNKNOWN, (int) V2UDI_FTYPE_V2UDI_V2UDI_V2UDI },
21957   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si_u", IX86_BUILTIN_VEC_PERM_V4SI_U, UNKNOWN, (int) V4USI_FTYPE_V4USI_V4USI_V4USI },
21958   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi_u", IX86_BUILTIN_VEC_PERM_V8HI_U, UNKNOWN, (int) V8UHI_FTYPE_V8UHI_V8UHI_V8UHI },
21959   { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi_u", IX86_BUILTIN_VEC_PERM_V16QI_U, UNKNOWN, (int) V16UQI_FTYPE_V16UQI_V16UQI_V16UQI },
21960   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4df", IX86_BUILTIN_VEC_PERM_V4DF, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI },
21961   { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8sf", IX86_BUILTIN_VEC_PERM_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI },
21962 
21963   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF  },
21964   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
21965   { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
21966   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
21967   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2ps, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
21968   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtudq2ps, "__builtin_ia32_cvtudq2ps", IX86_BUILTIN_CVTUDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
21969 
21970   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
21971   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
21972   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
21973   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
21974   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
21975 
21976   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
21977 
21978   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
21979   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
21980   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
21981   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
21982 
21983   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
21984   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
21985   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttps2dq, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
21986 
21987   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21988   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21989   { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21990   { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21991   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3,  "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21992   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3,  "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21993   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3,  "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21994   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3,  "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
21995 
21996   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
21997   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
21998   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
21999   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
22000   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
22001   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
22002   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
22003   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
22004   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
22005   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
22006   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
22007   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
22008   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
22009   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
22010   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
22011   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
22012   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
22013   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
22014   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
22015   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
22016 
22017   { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22018   { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22019   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22020   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22021 
22022   { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22023   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3,  "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22024   { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22025   { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3,  "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22026 
22027   { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3,  "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22028 
22029   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22030   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22031   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22032 
22033   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
22034 
22035   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22036   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22037   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22038   { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22039   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22040   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22041   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22042   { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22043 
22044   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22045   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22046   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22047   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22048   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22049   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22050   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22051   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22052 
22053   { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22054   { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22055 
22056   { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22057   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22058   { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22059   { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22060 
22061   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22062   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22063 
22064   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22065   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22066   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
22067   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22068   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22069   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
22070 
22071   { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22072   { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22073   { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22074   { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22075 
22076   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22077   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI  },
22078   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN,  (int) V4SI_FTYPE_V4SI_V4SI },
22079   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22080   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22081   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22082   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22083   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22084 
22085   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
22086   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
22087   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
22088 
22089   { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22090   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
22091 
22092   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
22093   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
22094 
22095   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
22096 
22097   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
22098   { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
22099   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
22100   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
22101 
22102   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
22103   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
22104   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
22105   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
22106   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
22107   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
22108   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
22109 
22110   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
22111   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
22112   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
22113   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
22114   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
22115   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
22116   { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
22117 
22118   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
22119   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
22120   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
22121   { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
22122 
22123   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
22124   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
22125   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
22126 
22127   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
22128 
22129   { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
22130   { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
22131 
22132   { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
22133 
22134   /* SSE2 MMX */
22135   { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
22136   { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
22137 
22138   /* SSE3 */
22139   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
22140   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
22141 
22142   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
22143   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22144   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
22145   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22146   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
22147   { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
22148 
22149   /* SSSE3 */
22150   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
22151   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
22152   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
22153   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
22154   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
22155   { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
22156 
22157   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22158   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
22159   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22160   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
22161   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22162   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
22163   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22164   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
22165   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22166   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
22167   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22168   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
22169   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
22170   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
22171   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22172   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
22173   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22174   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
22175   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22176   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
22177   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22178   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
22179   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22180   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
22181 
22182   /* SSSE3 palignr.  */
22183   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
22184   { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
22185 
22186   /* SSE4.1 */
22187   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
22188   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
22189   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
22190   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
22191   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
22192   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
22193   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
22194   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
22195   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
22196   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
22197 
22198   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
22199   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
22200   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
22201   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
22202   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
22203   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
22204   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
22205   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
22206   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
22207   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
22208   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
22209   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
22210   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
22211 
22212   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
22213   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22214   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22215   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22216   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22217   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22218   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
22219   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22220   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22221   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
22222   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
22223   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
22224 
22225   /* SSE4.1 round and ptest */
22226   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
22227   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
22228   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
22229   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
22230 
22231   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
22232   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
22233   { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
22234 
22235   /* SSE4.2 */
22236   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22237   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
22238   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
22239   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
22240   { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
22241 
22242   /* SSE4A */
22243   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
22244   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
22245   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
22246   { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22247 
22248   /* AES */
22249   { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
22250   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
22251 
22252   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22253   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22254   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22255   { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
22256 
22257   /* PCLMUL */
22258   { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
22259 
22260   /* AVX */
22261   { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22262   { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22263   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22264   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22265   { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22266   { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22267   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22268   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22269   { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22270   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22271   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22272   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22273   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22274   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22275   { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22276   { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22277   { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22278   { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22279   { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22280   { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22281   { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22282   { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22283   { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22284   { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22285   { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22286   { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22287 
22288   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
22289   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
22290   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
22291   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
22292 
22293   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
22294   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
22295   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
22296   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
22297   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
22298   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
22299   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
22300   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpsdv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
22301   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpssv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
22302   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppdv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
22303   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppsv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
22304   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppdv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
22305   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppsv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
22306   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
22307   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
22308   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
22309   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2pd256, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
22310   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2ps256, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
22311   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
22312   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
22313   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
22314   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
22315   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
22316   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
22317   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
22318   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
22319   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
22320   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
22321   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
22322   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
22323   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
22324   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
22325   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
22326   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
22327 
22328   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
22329   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
22330   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
22331 
22332   { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
22333   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
22334   { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
22335   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
22336   { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
22337 
22338   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
22339 
22340   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
22341   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
22342 
22343   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,  "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22344   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,  "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
22345   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256,  "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22346   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256,  "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
22347 
22348   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
22349   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
22350   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
22351   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si_si256, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
22352   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps_ps256, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
22353   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd_pd256, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
22354 
22355   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
22356   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
22357   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
22358   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
22359   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
22360   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
22361   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
22362   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
22363   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
22364   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
22365   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
22366   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
22367   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
22368   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
22369   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
22370 
22371   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF  },
22372   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
22373 
22374   { OPTION_MASK_ISA_ABM, CODE_FOR_clzhi2_abm,   "__builtin_clzs",   IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16 },
22375 };
22376 
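/* Each initializer in the table above follows the builtin_description
   layout used throughout this file: the ISA option mask the builtin
   requires, the insn_code of the pattern that expands it, the
   user-visible builtin name (0 when the entry is not registered from
   this table), the IX86_BUILTIN_* enumerator, an rtx comparison code
   for compare-style entries (UNKNOWN otherwise), and the function-type
   enumerator stored as an int.  The table is walked when the builtins
   are registered (see ix86_init_mmx_sse_builtins) and again when a
   call to one of them is expanded (see ix86_expand_args_builtin).  */

#if 0
/* Illustrative sketch only, not part of the build: a minimal example of
   walking a builtin_description table the way the consumers above do.
   The helper below is hypothetical; it merely counts the entries whose
   ISA mask matches, but the real consumers read the same fields to
   register and expand each builtin.  */
static size_t
example_count_entries_for_isa (const struct builtin_description *table,
                               size_t count, unsigned int isa_mask)
{
  size_t i, matches = 0;

  for (i = 0; i < count; i++)
    if ((table[i].mask & isa_mask) != 0)
      matches++;

  return matches;
}
#endif
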
22377 /* FMA4 and XOP.  */
22378 #define MULTI_ARG_4_DF2_DI_I	V2DF_FTYPE_V2DF_V2DF_V2DI_INT
22379 #define MULTI_ARG_4_DF2_DI_I1	V4DF_FTYPE_V4DF_V4DF_V4DI_INT
22380 #define MULTI_ARG_4_SF2_SI_I	V4SF_FTYPE_V4SF_V4SF_V4SI_INT
22381 #define MULTI_ARG_4_SF2_SI_I1	V8SF_FTYPE_V8SF_V8SF_V8SI_INT
22382 #define MULTI_ARG_3_SF		V4SF_FTYPE_V4SF_V4SF_V4SF
22383 #define MULTI_ARG_3_DF		V2DF_FTYPE_V2DF_V2DF_V2DF
22384 #define MULTI_ARG_3_SF2		V8SF_FTYPE_V8SF_V8SF_V8SF
22385 #define MULTI_ARG_3_DF2		V4DF_FTYPE_V4DF_V4DF_V4DF
22386 #define MULTI_ARG_3_DI		V2DI_FTYPE_V2DI_V2DI_V2DI
22387 #define MULTI_ARG_3_SI		V4SI_FTYPE_V4SI_V4SI_V4SI
22388 #define MULTI_ARG_3_SI_DI	V4SI_FTYPE_V4SI_V4SI_V2DI
22389 #define MULTI_ARG_3_HI		V8HI_FTYPE_V8HI_V8HI_V8HI
22390 #define MULTI_ARG_3_HI_SI	V8HI_FTYPE_V8HI_V8HI_V4SI
22391 #define MULTI_ARG_3_QI		V16QI_FTYPE_V16QI_V16QI_V16QI
22392 #define MULTI_ARG_3_DI2		V4DI_FTYPE_V4DI_V4DI_V4DI
22393 #define MULTI_ARG_3_SI2		V8SI_FTYPE_V8SI_V8SI_V8SI
22394 #define MULTI_ARG_3_HI2		V16HI_FTYPE_V16HI_V16HI_V16HI
22395 #define MULTI_ARG_3_QI2		V32QI_FTYPE_V32QI_V32QI_V32QI
22396 #define MULTI_ARG_2_SF		V4SF_FTYPE_V4SF_V4SF
22397 #define MULTI_ARG_2_DF		V2DF_FTYPE_V2DF_V2DF
22398 #define MULTI_ARG_2_DI		V2DI_FTYPE_V2DI_V2DI
22399 #define MULTI_ARG_2_SI		V4SI_FTYPE_V4SI_V4SI
22400 #define MULTI_ARG_2_HI		V8HI_FTYPE_V8HI_V8HI
22401 #define MULTI_ARG_2_QI		V16QI_FTYPE_V16QI_V16QI
22402 #define MULTI_ARG_2_DI_IMM	V2DI_FTYPE_V2DI_SI
22403 #define MULTI_ARG_2_SI_IMM	V4SI_FTYPE_V4SI_SI
22404 #define MULTI_ARG_2_HI_IMM	V8HI_FTYPE_V8HI_SI
22405 #define MULTI_ARG_2_QI_IMM	V16QI_FTYPE_V16QI_SI
22406 #define MULTI_ARG_2_DI_CMP	V2DI_FTYPE_V2DI_V2DI_CMP
22407 #define MULTI_ARG_2_SI_CMP	V4SI_FTYPE_V4SI_V4SI_CMP
22408 #define MULTI_ARG_2_HI_CMP	V8HI_FTYPE_V8HI_V8HI_CMP
22409 #define MULTI_ARG_2_QI_CMP	V16QI_FTYPE_V16QI_V16QI_CMP
22410 #define MULTI_ARG_2_SF_TF	V4SF_FTYPE_V4SF_V4SF_TF
22411 #define MULTI_ARG_2_DF_TF	V2DF_FTYPE_V2DF_V2DF_TF
22412 #define MULTI_ARG_2_DI_TF	V2DI_FTYPE_V2DI_V2DI_TF
22413 #define MULTI_ARG_2_SI_TF	V4SI_FTYPE_V4SI_V4SI_TF
22414 #define MULTI_ARG_2_HI_TF	V8HI_FTYPE_V8HI_V8HI_TF
22415 #define MULTI_ARG_2_QI_TF	V16QI_FTYPE_V16QI_V16QI_TF
22416 #define MULTI_ARG_1_SF		V4SF_FTYPE_V4SF
22417 #define MULTI_ARG_1_DF		V2DF_FTYPE_V2DF
22418 #define MULTI_ARG_1_SF2		V8SF_FTYPE_V8SF
22419 #define MULTI_ARG_1_DF2		V4DF_FTYPE_V4DF
22420 #define MULTI_ARG_1_DI		V2DI_FTYPE_V2DI
22421 #define MULTI_ARG_1_SI		V4SI_FTYPE_V4SI
22422 #define MULTI_ARG_1_HI		V8HI_FTYPE_V8HI
22423 #define MULTI_ARG_1_QI		V16QI_FTYPE_V16QI
22424 #define MULTI_ARG_1_SI_DI	V2DI_FTYPE_V4SI
22425 #define MULTI_ARG_1_HI_DI	V2DI_FTYPE_V8HI
22426 #define MULTI_ARG_1_HI_SI	V4SI_FTYPE_V8HI
22427 #define MULTI_ARG_1_QI_DI	V2DI_FTYPE_V16QI
22428 #define MULTI_ARG_1_QI_SI	V4SI_FTYPE_V16QI
22429 #define MULTI_ARG_1_QI_HI	V8HI_FTYPE_V16QI
22430 
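/* The MULTI_ARG_* names above are short aliases for the shared
   *_FTYPE_* enumerators (for instance MULTI_ARG_3_DF is just
   V2DF_FTYPE_V2DF_V2DF_V2DF), kept brief so the FMA4/XOP table below
   stays readable.  The _IMM, _CMP and _TF suffixes mark
   immediate-operand and comparison forms that the expander treats
   specially, and the names ending in 2 are the 256-bit variants.
   Calls to these builtins are expanded through
   ix86_expand_multi_arg_builtin.  */

#if 0
/* Illustrative sketch only, not part of the build: a user-level view of
   one bdesc_multi_arg entry.  __builtin_ia32_vfmaddpd is declared with
   MULTI_ARG_3_DF (three V2DF operands), so with -mfma4 it can be called
   on vectors of two doubles; the typedef and wrapper names here are
   hypothetical.  */
typedef double example_v2df __attribute__ ((vector_size (16)));

static example_v2df
example_fma4_madd (example_v2df a, example_v2df b, example_v2df c)
{
  /* a * b + c via the FMA4 vfmaddpd pattern.  */
  return __builtin_ia32_vfmaddpd (a, b, c);
}
#endif
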
22431 static const struct builtin_description bdesc_multi_arg[] =
22432 {
22433   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv4sf4,     "__builtin_ia32_vfmaddss",    IX86_BUILTIN_VFMADDSS,    UNKNOWN,      (int)MULTI_ARG_3_SF },
22434   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv2df4,     "__builtin_ia32_vfmaddsd",    IX86_BUILTIN_VFMADDSD,    UNKNOWN,      (int)MULTI_ARG_3_DF },
22435   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4sf4,       "__builtin_ia32_vfmaddps",    IX86_BUILTIN_VFMADDPS,    UNKNOWN,      (int)MULTI_ARG_3_SF },
22436   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv2df4,       "__builtin_ia32_vfmaddpd",    IX86_BUILTIN_VFMADDPD,    UNKNOWN,      (int)MULTI_ARG_3_DF },
22437   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv4sf4,     "__builtin_ia32_vfmsubss",    IX86_BUILTIN_VFMSUBSS,    UNKNOWN,      (int)MULTI_ARG_3_SF },
22438   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv2df4,     "__builtin_ia32_vfmsubsd",    IX86_BUILTIN_VFMSUBSD,    UNKNOWN,      (int)MULTI_ARG_3_DF },
22439   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4sf4,       "__builtin_ia32_vfmsubps",    IX86_BUILTIN_VFMSUBPS,    UNKNOWN,      (int)MULTI_ARG_3_SF },
22440   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv2df4,       "__builtin_ia32_vfmsubpd",    IX86_BUILTIN_VFMSUBPD,    UNKNOWN,      (int)MULTI_ARG_3_DF },
22441 
22442   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv4sf4,    "__builtin_ia32_vfnmaddss",   IX86_BUILTIN_VFNMADDSS,   UNKNOWN,      (int)MULTI_ARG_3_SF },
22443   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv2df4,    "__builtin_ia32_vfnmaddsd",   IX86_BUILTIN_VFNMADDSD,   UNKNOWN,      (int)MULTI_ARG_3_DF },
22444   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4sf4,      "__builtin_ia32_vfnmaddps",   IX86_BUILTIN_VFNMADDPS,   UNKNOWN,      (int)MULTI_ARG_3_SF },
22445   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv2df4,      "__builtin_ia32_vfnmaddpd",   IX86_BUILTIN_VFNMADDPD,   UNKNOWN,      (int)MULTI_ARG_3_DF },
22446   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv4sf4,    "__builtin_ia32_vfnmsubss",   IX86_BUILTIN_VFNMSUBSS,   UNKNOWN,      (int)MULTI_ARG_3_SF },
22447   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv2df4,    "__builtin_ia32_vfnmsubsd",   IX86_BUILTIN_VFNMSUBSD,   UNKNOWN,      (int)MULTI_ARG_3_DF },
22448   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4sf4,      "__builtin_ia32_vfnmsubps",   IX86_BUILTIN_VFNMSUBPS,   UNKNOWN,      (int)MULTI_ARG_3_SF },
22449   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv2df4,      "__builtin_ia32_vfnmsubpd",   IX86_BUILTIN_VFNMSUBPD,   UNKNOWN,      (int)MULTI_ARG_3_DF },
22450 
22451   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4sf4,	   "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,    UNKNOWN,      (int)MULTI_ARG_3_SF },
22452   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv2df4,	   "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,    UNKNOWN,      (int)MULTI_ARG_3_DF },
22453   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4sf4,	   "__builtin_ia32_vfmsubaddps", IX86_BUILTIN_VFMSUBADDPS,    UNKNOWN,      (int)MULTI_ARG_3_SF },
22454   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv2df4,	   "__builtin_ia32_vfmsubaddpd", IX86_BUILTIN_VFMSUBADDPD,    UNKNOWN,      (int)MULTI_ARG_3_DF },
22455 
22456   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv8sf4256,       "__builtin_ia32_vfmaddps256",    IX86_BUILTIN_VFMADDPS256,    UNKNOWN,      (int)MULTI_ARG_3_SF2 },
22457   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4df4256,       "__builtin_ia32_vfmaddpd256",    IX86_BUILTIN_VFMADDPD256,    UNKNOWN,      (int)MULTI_ARG_3_DF2 },
22458   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv8sf4256,       "__builtin_ia32_vfmsubps256",    IX86_BUILTIN_VFMSUBPS256,    UNKNOWN,      (int)MULTI_ARG_3_SF2 },
22459   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4df4256,       "__builtin_ia32_vfmsubpd256",    IX86_BUILTIN_VFMSUBPD256,    UNKNOWN,      (int)MULTI_ARG_3_DF2 },
22460 
22461   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv8sf4256,      "__builtin_ia32_vfnmaddps256",   IX86_BUILTIN_VFNMADDPS256,   UNKNOWN,      (int)MULTI_ARG_3_SF2 },
22462   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4df4256,      "__builtin_ia32_vfnmaddpd256",   IX86_BUILTIN_VFNMADDPD256,   UNKNOWN,      (int)MULTI_ARG_3_DF2 },
22463   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv8sf4256,      "__builtin_ia32_vfnmsubps256",   IX86_BUILTIN_VFNMSUBPS256,   UNKNOWN,      (int)MULTI_ARG_3_SF2 },
22464   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4df4256,      "__builtin_ia32_vfnmsubpd256",   IX86_BUILTIN_VFNMSUBPD256,   UNKNOWN,      (int)MULTI_ARG_3_DF2 },
22465 
22466   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv8sf4,	   "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,    UNKNOWN,      (int)MULTI_ARG_3_SF2 },
22467   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4df4,	   "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,    UNKNOWN,      (int)MULTI_ARG_3_DF2 },
22468   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv8sf4,	   "__builtin_ia32_vfmsubaddps256", IX86_BUILTIN_VFMSUBADDPS256,    UNKNOWN,      (int)MULTI_ARG_3_SF2 },
22469   { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4df4,	   "__builtin_ia32_vfmsubaddpd256", IX86_BUILTIN_VFMSUBADDPD256,    UNKNOWN,      (int)MULTI_ARG_3_DF2 },
22470 
22471   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov",      IX86_BUILTIN_VPCMOV,	 UNKNOWN,      (int)MULTI_ARG_3_DI },
22472   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN,      (int)MULTI_ARG_3_DI },
22473   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si,        "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN,      (int)MULTI_ARG_3_SI },
22474   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi,        "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN,      (int)MULTI_ARG_3_HI },
22475   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi,       "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN,    (int)MULTI_ARG_3_QI },
22476   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df,        "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN,      (int)MULTI_ARG_3_DF },
22477   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf,        "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN,      (int)MULTI_ARG_3_SF },
22478 
22479   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov256",       IX86_BUILTIN_VPCMOV256,       UNKNOWN,      (int)MULTI_ARG_3_DI2 },
22480   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov_v4di256",  IX86_BUILTIN_VPCMOV_V4DI256,  UNKNOWN,      (int)MULTI_ARG_3_DI2 },
22481   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256,        "__builtin_ia32_vpcmov_v8si256",  IX86_BUILTIN_VPCMOV_V8SI256,  UNKNOWN,      (int)MULTI_ARG_3_SI2 },
22482   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256,       "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN,      (int)MULTI_ARG_3_HI2 },
22483   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256,       "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN,      (int)MULTI_ARG_3_QI2 },
22484   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256,        "__builtin_ia32_vpcmov_v4df256",  IX86_BUILTIN_VPCMOV_V4DF256,  UNKNOWN,      (int)MULTI_ARG_3_DF2 },
22485   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256,        "__builtin_ia32_vpcmov_v8sf256",  IX86_BUILTIN_VPCMOV_V8SF256,  UNKNOWN,      (int)MULTI_ARG_3_SF2 },
22486 
22487   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm,             "__builtin_ia32_vpperm",      IX86_BUILTIN_VPPERM,      UNKNOWN,      (int)MULTI_ARG_3_QI },
22488 
22489   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww,          "__builtin_ia32_vpmacssww",   IX86_BUILTIN_VPMACSSWW,   UNKNOWN,      (int)MULTI_ARG_3_HI },
22490   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww,           "__builtin_ia32_vpmacsww",    IX86_BUILTIN_VPMACSWW,    UNKNOWN,      (int)MULTI_ARG_3_HI },
22491   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd,          "__builtin_ia32_vpmacsswd",   IX86_BUILTIN_VPMACSSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
22492   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd,           "__builtin_ia32_vpmacswd",    IX86_BUILTIN_VPMACSWD,    UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
22493   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd,          "__builtin_ia32_vpmacssdd",   IX86_BUILTIN_VPMACSSDD,   UNKNOWN,      (int)MULTI_ARG_3_SI },
22494   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd,           "__builtin_ia32_vpmacsdd",    IX86_BUILTIN_VPMACSDD,    UNKNOWN,      (int)MULTI_ARG_3_SI },
22495   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql,         "__builtin_ia32_vpmacssdql",  IX86_BUILTIN_VPMACSSDQL,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
22496   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh,         "__builtin_ia32_vpmacssdqh",  IX86_BUILTIN_VPMACSSDQH,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
22497   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql,          "__builtin_ia32_vpmacsdql",   IX86_BUILTIN_VPMACSDQL,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
22498   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh,          "__builtin_ia32_vpmacsdqh",   IX86_BUILTIN_VPMACSDQH,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
22499   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd,         "__builtin_ia32_vpmadcsswd",  IX86_BUILTIN_VPMADCSSWD,  UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
22500   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd,          "__builtin_ia32_vpmadcswd",   IX86_BUILTIN_VPMADCSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
22501 
22502   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3,        "__builtin_ia32_vprotq",      IX86_BUILTIN_VPROTQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
22503   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3,        "__builtin_ia32_vprotd",      IX86_BUILTIN_VPROTD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
22504   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3,        "__builtin_ia32_vprotw",      IX86_BUILTIN_VPROTW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
22505   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3,       "__builtin_ia32_vprotb",      IX86_BUILTIN_VPROTB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
22506   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3,         "__builtin_ia32_vprotqi",     IX86_BUILTIN_VPROTQ_IMM,  UNKNOWN,      (int)MULTI_ARG_2_DI_IMM },
22507   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3,         "__builtin_ia32_vprotdi",     IX86_BUILTIN_VPROTD_IMM,  UNKNOWN,      (int)MULTI_ARG_2_SI_IMM },
22508   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3,         "__builtin_ia32_vprotwi",     IX86_BUILTIN_VPROTW_IMM,  UNKNOWN,      (int)MULTI_ARG_2_HI_IMM },
22509   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3,        "__builtin_ia32_vprotbi",     IX86_BUILTIN_VPROTB_IMM,  UNKNOWN,      (int)MULTI_ARG_2_QI_IMM },
22510   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv2di3,         "__builtin_ia32_vpshaq",      IX86_BUILTIN_VPSHAQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
22511   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv4si3,         "__builtin_ia32_vpshad",      IX86_BUILTIN_VPSHAD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
22512   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv8hi3,         "__builtin_ia32_vpshaw",      IX86_BUILTIN_VPSHAW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
22513   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv16qi3,        "__builtin_ia32_vpshab",      IX86_BUILTIN_VPSHAB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
22514   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv2di3,         "__builtin_ia32_vpshlq",      IX86_BUILTIN_VPSHLQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
22515   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv4si3,         "__builtin_ia32_vpshld",      IX86_BUILTIN_VPSHLD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
22516   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv8hi3,         "__builtin_ia32_vpshlw",      IX86_BUILTIN_VPSHLW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
22517   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv16qi3,        "__builtin_ia32_vpshlb",      IX86_BUILTIN_VPSHLB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
22518 
22519   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2,       "__builtin_ia32_vfrczss",     IX86_BUILTIN_VFRCZSS,     UNKNOWN,      (int)MULTI_ARG_2_SF },
22520   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2,       "__builtin_ia32_vfrczsd",     IX86_BUILTIN_VFRCZSD,     UNKNOWN,      (int)MULTI_ARG_2_DF },
22521   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2,         "__builtin_ia32_vfrczps",     IX86_BUILTIN_VFRCZPS,     UNKNOWN,      (int)MULTI_ARG_1_SF },
22522   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2,         "__builtin_ia32_vfrczpd",     IX86_BUILTIN_VFRCZPD,     UNKNOWN,      (int)MULTI_ARG_1_DF },
22523   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2256,         "__builtin_ia32_vfrczps256",  IX86_BUILTIN_VFRCZPS256,  UNKNOWN,      (int)MULTI_ARG_1_SF2 },
22524   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2256,         "__builtin_ia32_vfrczpd256",  IX86_BUILTIN_VFRCZPD256,  UNKNOWN,      (int)MULTI_ARG_1_DF2 },
22525 
22526   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw,           "__builtin_ia32_vphaddbw",    IX86_BUILTIN_VPHADDBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
22527   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd,           "__builtin_ia32_vphaddbd",    IX86_BUILTIN_VPHADDBD,    UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
22528   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq,           "__builtin_ia32_vphaddbq",    IX86_BUILTIN_VPHADDBQ,    UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
22529   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd,           "__builtin_ia32_vphaddwd",    IX86_BUILTIN_VPHADDWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
22530   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq,           "__builtin_ia32_vphaddwq",    IX86_BUILTIN_VPHADDWQ,    UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
22531   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq,           "__builtin_ia32_vphadddq",    IX86_BUILTIN_VPHADDDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
22532   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw,          "__builtin_ia32_vphaddubw",   IX86_BUILTIN_VPHADDUBW,   UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
22533   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd,          "__builtin_ia32_vphaddubd",   IX86_BUILTIN_VPHADDUBD,   UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
22534   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq,          "__builtin_ia32_vphaddubq",   IX86_BUILTIN_VPHADDUBQ,   UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
22535   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd,          "__builtin_ia32_vphadduwd",   IX86_BUILTIN_VPHADDUWD,   UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
22536   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq,          "__builtin_ia32_vphadduwq",   IX86_BUILTIN_VPHADDUWQ,   UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
22537   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq,          "__builtin_ia32_vphaddudq",   IX86_BUILTIN_VPHADDUDQ,   UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
22538   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw,           "__builtin_ia32_vphsubbw",    IX86_BUILTIN_VPHSUBBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
22539   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd,           "__builtin_ia32_vphsubwd",    IX86_BUILTIN_VPHSUBWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
22540   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq,           "__builtin_ia32_vphsubdq",    IX86_BUILTIN_VPHSUBDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
22541 
22542   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomeqb",    IX86_BUILTIN_VPCOMEQB,    EQ,           (int)MULTI_ARG_2_QI_CMP },
22543   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneb",    IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
22544   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneqb",   IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
22545   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomltb",    IX86_BUILTIN_VPCOMLTB,    LT,           (int)MULTI_ARG_2_QI_CMP },
22546   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomleb",    IX86_BUILTIN_VPCOMLEB,    LE,           (int)MULTI_ARG_2_QI_CMP },
22547   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgtb",    IX86_BUILTIN_VPCOMGTB,    GT,           (int)MULTI_ARG_2_QI_CMP },
22548   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgeb",    IX86_BUILTIN_VPCOMGEB,    GE,           (int)MULTI_ARG_2_QI_CMP },
22549 
22550   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomeqw",    IX86_BUILTIN_VPCOMEQW,    EQ,           (int)MULTI_ARG_2_HI_CMP },
22551   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomnew",    IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
22552   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomneqw",   IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
22553   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomltw",    IX86_BUILTIN_VPCOMLTW,    LT,           (int)MULTI_ARG_2_HI_CMP },
22554   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomlew",    IX86_BUILTIN_VPCOMLEW,    LE,           (int)MULTI_ARG_2_HI_CMP },
22555   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgtw",    IX86_BUILTIN_VPCOMGTW,    GT,           (int)MULTI_ARG_2_HI_CMP },
22556   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgew",    IX86_BUILTIN_VPCOMGEW,    GE,           (int)MULTI_ARG_2_HI_CMP },
22557 
22558   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomeqd",    IX86_BUILTIN_VPCOMEQD,    EQ,           (int)MULTI_ARG_2_SI_CMP },
22559   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomned",    IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
22560   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomneqd",   IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
22561   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomltd",    IX86_BUILTIN_VPCOMLTD,    LT,           (int)MULTI_ARG_2_SI_CMP },
22562   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomled",    IX86_BUILTIN_VPCOMLED,    LE,           (int)MULTI_ARG_2_SI_CMP },
22563   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomgtd",    IX86_BUILTIN_VPCOMGTD,    GT,           (int)MULTI_ARG_2_SI_CMP },
22564   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomged",    IX86_BUILTIN_VPCOMGED,    GE,           (int)MULTI_ARG_2_SI_CMP },
22565 
22566   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomeqq",    IX86_BUILTIN_VPCOMEQQ,    EQ,           (int)MULTI_ARG_2_DI_CMP },
22567   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneq",    IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
22568   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneqq",   IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
22569   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomltq",    IX86_BUILTIN_VPCOMLTQ,    LT,           (int)MULTI_ARG_2_DI_CMP },
22570   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomleq",    IX86_BUILTIN_VPCOMLEQ,    LE,           (int)MULTI_ARG_2_DI_CMP },
22571   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgtq",    IX86_BUILTIN_VPCOMGTQ,    GT,           (int)MULTI_ARG_2_DI_CMP },
22572   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgeq",    IX86_BUILTIN_VPCOMGEQ,    GE,           (int)MULTI_ARG_2_DI_CMP },
22573 
22574   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb",   IX86_BUILTIN_VPCOMEQUB,   EQ,           (int)MULTI_ARG_2_QI_CMP },
22575   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub",   IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
22576   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb",  IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
22577   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub",   IX86_BUILTIN_VPCOMLTUB,   LTU,          (int)MULTI_ARG_2_QI_CMP },
22578   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub",   IX86_BUILTIN_VPCOMLEUB,   LEU,          (int)MULTI_ARG_2_QI_CMP },
22579   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub",   IX86_BUILTIN_VPCOMGTUB,   GTU,          (int)MULTI_ARG_2_QI_CMP },
22580   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub",   IX86_BUILTIN_VPCOMGEUB,   GEU,          (int)MULTI_ARG_2_QI_CMP },
22581 
22582   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw",   IX86_BUILTIN_VPCOMEQUW,   EQ,           (int)MULTI_ARG_2_HI_CMP },
22583   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw",   IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
22584   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw",  IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
22585   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomltuw",   IX86_BUILTIN_VPCOMLTUW,   LTU,          (int)MULTI_ARG_2_HI_CMP },
22586   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomleuw",   IX86_BUILTIN_VPCOMLEUW,   LEU,          (int)MULTI_ARG_2_HI_CMP },
22587   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgtuw",   IX86_BUILTIN_VPCOMGTUW,   GTU,          (int)MULTI_ARG_2_HI_CMP },
22588   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgeuw",   IX86_BUILTIN_VPCOMGEUW,   GEU,          (int)MULTI_ARG_2_HI_CMP },
22589 
22590   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd",   IX86_BUILTIN_VPCOMEQUD,   EQ,           (int)MULTI_ARG_2_SI_CMP },
22591   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud",   IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
22592   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd",  IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
22593   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomltud",   IX86_BUILTIN_VPCOMLTUD,   LTU,          (int)MULTI_ARG_2_SI_CMP },
22594   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomleud",   IX86_BUILTIN_VPCOMLEUD,   LEU,          (int)MULTI_ARG_2_SI_CMP },
22595   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgtud",   IX86_BUILTIN_VPCOMGTUD,   GTU,          (int)MULTI_ARG_2_SI_CMP },
22596   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgeud",   IX86_BUILTIN_VPCOMGEUD,   GEU,          (int)MULTI_ARG_2_SI_CMP },
22597 
22598   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq",   IX86_BUILTIN_VPCOMEQUQ,   EQ,           (int)MULTI_ARG_2_DI_CMP },
22599   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq",   IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
22600   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq",  IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
22601   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomltuq",   IX86_BUILTIN_VPCOMLTUQ,   LTU,          (int)MULTI_ARG_2_DI_CMP },
22602   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomleuq",   IX86_BUILTIN_VPCOMLEUQ,   LEU,          (int)MULTI_ARG_2_DI_CMP },
22603   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgtuq",   IX86_BUILTIN_VPCOMGTUQ,   GTU,          (int)MULTI_ARG_2_DI_CMP },
22604   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgeuq",   IX86_BUILTIN_VPCOMGEUQ,   GEU,          (int)MULTI_ARG_2_DI_CMP },
22605 
22606   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
22607   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
22608   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
22609   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
22610   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
22611   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
22612   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
22613   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
22614 
22615   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueb",  IX86_BUILTIN_VPCOMTRUEB,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
22616   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtruew",  IX86_BUILTIN_VPCOMTRUEW,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
22617   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrued",  IX86_BUILTIN_VPCOMTRUED,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
22618   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueq",  IX86_BUILTIN_VPCOMTRUEQ,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
22619   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
22620   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
22621   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
22622   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
22623 
22624   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3,     "__builtin_ia32_vpermil2pd",  IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
22625   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3,     "__builtin_ia32_vpermil2ps",  IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
22626   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3,     "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
22627   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3,     "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
22628 
22629 };
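
/* A minimal usage sketch (illustrative only) of one entry in the table
   above: the IX86_BUILTIN_VPCMOV entry registers __builtin_ia32_vpcmov
   with three V2DI operands (MULTI_ARG_3_DI), so a call like the
   following, compiled with -mxop, is expanded through the multi-arg
   expander later in this file.  The vector typedef is assumed here for
   the sketch.

     typedef long long v2di __attribute__ ((__vector_size__ (16)));

     v2di
     select_bits (v2di a, v2di b, v2di mask)
     {
       // XOP VPCMOV: bitwise (a & mask) | (b & ~mask).
       return __builtin_ia32_vpcmov (a, b, mask);
     }
*/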
22630 
22631 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
22632    not in the current target ISA, so that the user can compile particular
22633    modules with target-specific options that differ from the command-line
22634    options.  */
22635 static void
22636 ix86_init_mmx_sse_builtins (void)
22637 {
22638   const struct builtin_description * d;
22639   enum ix86_builtin_func_type ftype;
22640   size_t i;
22641 
22642   /* Add all special builtins with a variable number of operands.  */
22643   for (i = 0, d = bdesc_special_args;
22644        i < ARRAY_SIZE (bdesc_special_args);
22645        i++, d++)
22646     {
22647       if (d->name == 0)
22648 	continue;
22649 
22650       ftype = (enum ix86_builtin_func_type) d->flag;
22651       def_builtin (d->mask, d->name, ftype, d->code);
22652     }
22653 
22654   /* Add all builtins with a variable number of operands.  */
22655   for (i = 0, d = bdesc_args;
22656        i < ARRAY_SIZE (bdesc_args);
22657        i++, d++)
22658     {
22659       if (d->name == 0)
22660 	continue;
22661 
22662       ftype = (enum ix86_builtin_func_type) d->flag;
22663       def_builtin_const (d->mask, d->name, ftype, d->code);
22664     }
22665 
22666   /* pcmpestr[im] insns.  */
22667   for (i = 0, d = bdesc_pcmpestr;
22668        i < ARRAY_SIZE (bdesc_pcmpestr);
22669        i++, d++)
22670     {
22671       if (d->code == IX86_BUILTIN_PCMPESTRM128)
22672 	ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
22673       else
22674 	ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
22675       def_builtin_const (d->mask, d->name, ftype, d->code);
22676     }
22677 
22678   /* pcmpistr[im] insns.  */
22679   for (i = 0, d = bdesc_pcmpistr;
22680        i < ARRAY_SIZE (bdesc_pcmpistr);
22681        i++, d++)
22682     {
22683       if (d->code == IX86_BUILTIN_PCMPISTRM128)
22684 	ftype = V16QI_FTYPE_V16QI_V16QI_INT;
22685       else
22686 	ftype = INT_FTYPE_V16QI_V16QI_INT;
22687       def_builtin_const (d->mask, d->name, ftype, d->code);
22688     }
22689 
22690   /* comi/ucomi insns.  */
22691   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
22692     {
22693       if (d->mask == OPTION_MASK_ISA_SSE2)
22694 	ftype = INT_FTYPE_V2DF_V2DF;
22695       else
22696 	ftype = INT_FTYPE_V4SF_V4SF;
22697       def_builtin_const (d->mask, d->name, ftype, d->code);
22698     }
22699 
22700   /* SSE */
22701   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
22702 	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
22703   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
22704 	       UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
22705 
22706   /* SSE or 3DNow!A */
22707   def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
22708 	       "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
22709 	       IX86_BUILTIN_MASKMOVQ);
22710 
22711   /* SSE2 */
22712   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
22713 	       VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
22714 
22715   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
22716 	       VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
22717   x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
22718 			    VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
22719 
22720   /* SSE3.  */
22721   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
22722 	       VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
22723   def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
22724 	       VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
22725 
22726   /* AES */
22727   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
22728 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
22729   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
22730 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
22731   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
22732 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
22733   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
22734 		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
22735   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
22736 		     V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
22737   def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
22738 		     V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
22739 
22740   /* PCLMUL */
22741   def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
22742 		     V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
22743 
22744   /* MMX access to the vec_init patterns.  */
22745   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
22746 		     V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
22747 
22748   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
22749 		     V4HI_FTYPE_HI_HI_HI_HI,
22750 		     IX86_BUILTIN_VEC_INIT_V4HI);
22751 
22752   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
22753 		     V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
22754 		     IX86_BUILTIN_VEC_INIT_V8QI);
22755 
22756   /* Access to the vec_extract patterns.  */
22757   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
22758 		     DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
22759   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
22760 		     DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
22761   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
22762 		     FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
22763   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
22764 		     SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
22765   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
22766 		     HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
22767 
22768   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
22769 		     "__builtin_ia32_vec_ext_v4hi",
22770 		     HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
22771 
22772   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
22773 		     SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
22774 
22775   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
22776 		     QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
22777 
22778   /* Access to the vec_set patterns.  */
22779   def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
22780 		     "__builtin_ia32_vec_set_v2di",
22781 		     V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
22782 
22783   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
22784 		     V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
22785 
22786   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
22787 		     V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
22788 
22789   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
22790 		     V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
22791 
22792   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
22793 		     "__builtin_ia32_vec_set_v4hi",
22794 		     V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
22795 
22796   def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
22797 		     V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
22798 
22799   /* Add FMA4 and XOP multi-argument builtins.  */
22800   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
22801     {
22802       if (d->name == 0)
22803 	continue;
22804 
22805       ftype = (enum ix86_builtin_func_type) d->flag;
22806       def_builtin_const (d->mask, d->name, ftype, d->code);
22807     }
22808 }
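
/* A minimal usage sketch (illustrative only) of one builtin registered
   above: __builtin_ia32_vec_ext_v4sf takes a V4SF vector and a constant
   lane index (assumes -msse; the vector typedef is assumed for the
   sketch).

     typedef float v4sf __attribute__ ((__vector_size__ (16)));

     float
     lane0 (v4sf v)
     {
       return __builtin_ia32_vec_ext_v4sf (v, 0);
     }
*/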
22809 
22810 /* Internal method for ix86_init_builtins.  */
22811 
22812 static void
22813 ix86_init_builtins_va_builtins_abi (void)
22814 {
22815   tree ms_va_ref, sysv_va_ref;
22816   tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
22817   tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
22818   tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
22819   tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
22820 
22821   if (!TARGET_64BIT)
22822     return;
22823   fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
22824   fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
22825   ms_va_ref = build_reference_type (ms_va_list_type_node);
22826   sysv_va_ref =
22827     build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
22828 
22829   fnvoid_va_end_ms =
22830     build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
22831   fnvoid_va_start_ms =
22832     build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
22833   fnvoid_va_end_sysv =
22834     build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
22835   fnvoid_va_start_sysv =
22836     build_varargs_function_type_list (void_type_node, sysv_va_ref,
22837     				       NULL_TREE);
22838   fnvoid_va_copy_ms =
22839     build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
22840     			      NULL_TREE);
22841   fnvoid_va_copy_sysv =
22842     build_function_type_list (void_type_node, sysv_va_ref,
22843     			      sysv_va_ref, NULL_TREE);
22844 
22845   add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
22846   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
22847   add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
22848   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
22849   add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
22850 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
22851   add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
22852   			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
22853   add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
22854   			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
22855   add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
22856 			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
22857 }
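
/* A minimal usage sketch (illustrative only) of the ms_abi varargs
   builtins registered above, on a 64-bit target; the __builtin_ms_va_list
   type name is assumed to be the one GCC provides for the ms_abi
   va_list.

     int __attribute__ ((ms_abi))
     sum_ms (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;

       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }
*/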
22858 
22859 static void
22860 ix86_init_builtin_types (void)
22861 {
22862   tree float128_type_node, float80_type_node;
22863 
22864   /* The __float80 type.  */
22865   float80_type_node = long_double_type_node;
22866   if (TYPE_MODE (float80_type_node) != XFmode)
22867     {
22868       /* long double does not have XFmode; build a distinct 80-bit type.  */
22869       float80_type_node = make_node (REAL_TYPE);
22870 
22871       TYPE_PRECISION (float80_type_node) = 80;
22872       layout_type (float80_type_node);
22873     }
22874   (*lang_hooks.types.register_builtin_type) (float80_type_node, "__float80");
22875 
22876   /* The __float128 type.  */
22877   float128_type_node = make_node (REAL_TYPE);
22878   TYPE_PRECISION (float128_type_node) = 128;
22879   layout_type (float128_type_node);
22880   (*lang_hooks.types.register_builtin_type) (float128_type_node, "__float128");
22881 
22882   /* This macro is built by i386-builtin-types.awk.  */
22883   DEFINE_BUILTIN_PRIMITIVE_TYPES;
22884 }
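
/* A minimal usage sketch (illustrative only) of the two types registered
   above; the 'Q' suffix is assumed to be GCC's __float128 literal suffix
   on this target.

     __float80  ext  = 1.0L;   // 80-bit extended precision
     __float128 quad = 1.0Q;   // 128-bit IEEE quad precision
*/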
22885 
22886 static void
22887 ix86_init_builtins (void)
22888 {
22889   tree t;
22890 
22891   ix86_init_builtin_types ();
22892 
22893   /* TFmode support builtins.  */
22894   def_builtin_const (0, "__builtin_infq",
22895 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
22896   def_builtin_const (0, "__builtin_huge_valq",
22897 		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
22898 
22899   /* We will expand them to a normal call if SSE2 isn't available, since
22900      they are used by libgcc.  */
22901   t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
22902   t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
22903 			    BUILT_IN_MD, "__fabstf2", NULL_TREE);
22904   TREE_READONLY (t) = 1;
22905   ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
22906 
22907   t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
22908   t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
22909 			    BUILT_IN_MD, "__copysigntf3", NULL_TREE);
22910   TREE_READONLY (t) = 1;
22911   ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
22912 
22913   ix86_init_mmx_sse_builtins ();
22914 
22915   if (TARGET_64BIT)
22916     ix86_init_builtins_va_builtins_abi ();
22917 }
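
/* A minimal usage sketch (illustrative only) of the TFmode builtins
   defined above; when SSE2 is unavailable these fall back to the libgcc
   routines named in the definitions.

     __float128
     magnitude_with_sign (__float128 x, __float128 s)
     {
       // |x| carrying the sign of s.
       return __builtin_copysignq (__builtin_fabsq (x), s);
     }
*/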
22918 
22919 /* Return the ix86 builtin for CODE.  */
22920 
22921 static tree
22922 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
22923 {
22924   if (code >= IX86_BUILTIN_MAX)
22925     return error_mark_node;
22926 
22927   return ix86_builtins[code];
22928 }
22929 
22930 /* Errors in the source file can cause expand_expr to return const0_rtx
22931    where we expect a vector.  To avoid crashing, use one of the vector
22932    clear instructions.  */
22933 static rtx
22934 safe_vector_operand (rtx x, enum machine_mode mode)
22935 {
22936   if (x == const0_rtx)
22937     x = CONST0_RTX (mode);
22938   return x;
22939 }
22940 
22941 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
22942 
22943 static rtx
22944 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
22945 {
22946   rtx pat;
22947   tree arg0 = CALL_EXPR_ARG (exp, 0);
22948   tree arg1 = CALL_EXPR_ARG (exp, 1);
22949   rtx op0 = expand_normal (arg0);
22950   rtx op1 = expand_normal (arg1);
22951   enum machine_mode tmode = insn_data[icode].operand[0].mode;
22952   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
22953   enum machine_mode mode1 = insn_data[icode].operand[2].mode;
22954 
22955   if (VECTOR_MODE_P (mode0))
22956     op0 = safe_vector_operand (op0, mode0);
22957   if (VECTOR_MODE_P (mode1))
22958     op1 = safe_vector_operand (op1, mode1);
22959 
22960   if (optimize || !target
22961       || GET_MODE (target) != tmode
22962       || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
22963     target = gen_reg_rtx (tmode);
22964 
22965   if (GET_MODE (op1) == SImode && mode1 == TImode)
22966     {
22967       rtx x = gen_reg_rtx (V4SImode);
22968       emit_insn (gen_sse2_loadd (x, op1));
22969       op1 = gen_lowpart (TImode, x);
22970     }
22971 
22972   if (!(*insn_data[icode].operand[1].predicate) (op0, mode0))
22973     op0 = copy_to_mode_reg (mode0, op0);
22974   if (!(*insn_data[icode].operand[2].predicate) (op1, mode1))
22975     op1 = copy_to_mode_reg (mode1, op1);
22976 
22977   pat = GEN_FCN (icode) (target, op0, op1);
22978   if (! pat)
22979     return 0;
22980 
22981   emit_insn (pat);
22982 
22983   return target;
22984 }
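
/* A minimal usage sketch (illustrative only) of a binop builtin of the
   kind expanded above; _mm_add_ps in xmmintrin.h is a thin wrapper
   around __builtin_ia32_addps (assumes -msse).

     #include <xmmintrin.h>

     __m128
     add4 (__m128 a, __m128 b)
     {
       return (__m128) __builtin_ia32_addps ((__v4sf) a, (__v4sf) b);
     }
*/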
22985 
22986 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
22987 
22988 static rtx
22989 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
22990 			       enum ix86_builtin_func_type m_type,
22991 			       enum rtx_code sub_code)
22992 {
22993   rtx pat;
22994   int i;
22995   int nargs;
22996   bool comparison_p = false;
22997   bool tf_p = false;
22998   bool last_arg_constant = false;
22999   int num_memory = 0;
23000   struct {
23001     rtx op;
23002     enum machine_mode mode;
23003   } args[4];
23004 
23005   enum machine_mode tmode = insn_data[icode].operand[0].mode;
23006 
23007   switch (m_type)
23008     {
23009     case MULTI_ARG_4_DF2_DI_I:
23010     case MULTI_ARG_4_DF2_DI_I1:
23011     case MULTI_ARG_4_SF2_SI_I:
23012     case MULTI_ARG_4_SF2_SI_I1:
23013       nargs = 4;
23014       last_arg_constant = true;
23015       break;
23016 
23017     case MULTI_ARG_3_SF:
23018     case MULTI_ARG_3_DF:
23019     case MULTI_ARG_3_SF2:
23020     case MULTI_ARG_3_DF2:
23021     case MULTI_ARG_3_DI:
23022     case MULTI_ARG_3_SI:
23023     case MULTI_ARG_3_SI_DI:
23024     case MULTI_ARG_3_HI:
23025     case MULTI_ARG_3_HI_SI:
23026     case MULTI_ARG_3_QI:
23027     case MULTI_ARG_3_DI2:
23028     case MULTI_ARG_3_SI2:
23029     case MULTI_ARG_3_HI2:
23030     case MULTI_ARG_3_QI2:
23031       nargs = 3;
23032       break;
23033 
23034     case MULTI_ARG_2_SF:
23035     case MULTI_ARG_2_DF:
23036     case MULTI_ARG_2_DI:
23037     case MULTI_ARG_2_SI:
23038     case MULTI_ARG_2_HI:
23039     case MULTI_ARG_2_QI:
23040       nargs = 2;
23041       break;
23042 
23043     case MULTI_ARG_2_DI_IMM:
23044     case MULTI_ARG_2_SI_IMM:
23045     case MULTI_ARG_2_HI_IMM:
23046     case MULTI_ARG_2_QI_IMM:
23047       nargs = 2;
23048       last_arg_constant = true;
23049       break;
23050 
23051     case MULTI_ARG_1_SF:
23052     case MULTI_ARG_1_DF:
23053     case MULTI_ARG_1_SF2:
23054     case MULTI_ARG_1_DF2:
23055     case MULTI_ARG_1_DI:
23056     case MULTI_ARG_1_SI:
23057     case MULTI_ARG_1_HI:
23058     case MULTI_ARG_1_QI:
23059     case MULTI_ARG_1_SI_DI:
23060     case MULTI_ARG_1_HI_DI:
23061     case MULTI_ARG_1_HI_SI:
23062     case MULTI_ARG_1_QI_DI:
23063     case MULTI_ARG_1_QI_SI:
23064     case MULTI_ARG_1_QI_HI:
23065       nargs = 1;
23066       break;
23067 
23068     case MULTI_ARG_2_DI_CMP:
23069     case MULTI_ARG_2_SI_CMP:
23070     case MULTI_ARG_2_HI_CMP:
23071     case MULTI_ARG_2_QI_CMP:
23072       nargs = 2;
23073       comparison_p = true;
23074       break;
23075 
23076     case MULTI_ARG_2_SF_TF:
23077     case MULTI_ARG_2_DF_TF:
23078     case MULTI_ARG_2_DI_TF:
23079     case MULTI_ARG_2_SI_TF:
23080     case MULTI_ARG_2_HI_TF:
23081     case MULTI_ARG_2_QI_TF:
23082       nargs = 2;
23083       tf_p = true;
23084       break;
23085 
23086     default:
23087       gcc_unreachable ();
23088     }
23089 
23090   if (optimize || !target
23091       || GET_MODE (target) != tmode
23092       || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
23093     target = gen_reg_rtx (tmode);
23094 
23095   gcc_assert (nargs <= 4);
23096 
23097   for (i = 0; i < nargs; i++)
23098     {
23099       tree arg = CALL_EXPR_ARG (exp, i);
23100       rtx op = expand_normal (arg);
23101       int adjust = (comparison_p) ? 1 : 0;
23102       enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
23103 
23104       if (last_arg_constant && i == nargs-1)
23105 	{
23106 	  if (!CONST_INT_P (op))
23107 	    {
23108 	      error ("last argument must be an immediate");
23109 	      return gen_reg_rtx (tmode);
23110 	    }
23111 	}
23112       else
23113 	{
23114 	  if (VECTOR_MODE_P (mode))
23115 	    op = safe_vector_operand (op, mode);
23116 
23117 	  /* If we aren't optimizing, only allow one memory operand to be
23118 	     generated.  */
23119 	  if (memory_operand (op, mode))
23120 	    num_memory++;
23121 
23122 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
23123 
23124 	  if (optimize
23125 	      || ! (*insn_data[icode].operand[i+adjust+1].predicate) (op, mode)
23126 	      || num_memory > 1)
23127 	    op = force_reg (mode, op);
23128 	}
23129 
23130       args[i].op = op;
23131       args[i].mode = mode;
23132     }
23133 
23134   switch (nargs)
23135     {
23136     case 1:
23137       pat = GEN_FCN (icode) (target, args[0].op);
23138       break;
23139 
23140     case 2:
23141       if (tf_p)
23142 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
23143 			       GEN_INT ((int)sub_code));
23144       else if (! comparison_p)
23145 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
23146       else
23147 	{
23148 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
23149 				       args[0].op,
23150 				       args[1].op);
23151 
23152 	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
23153 	}
23154       break;
23155 
23156     case 3:
23157       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
23158       break;
23159 
23160     case 4:
23161       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
23162       break;
23163 
23164     default:
23165       gcc_unreachable ();
23166     }
23167 
23168   if (! pat)
23169     return 0;
23170 
23171   emit_insn (pat);
23172   return target;
23173 }
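
/* A minimal usage sketch (illustrative only) of the last_arg_constant
   path above: the *_IMM forms such as __builtin_ia32_vprotdi require a
   literal rotate count (assumes -mxop; the vector typedef is assumed
   for the sketch).

     typedef int v4si __attribute__ ((__vector_size__ (16)));

     v4si
     rot8 (v4si x)
     {
       return __builtin_ia32_vprotdi (x, 8);   // OK: literal immediate
     }

   Passing a non-constant count instead of the literal 8 triggers the
   "last argument must be an immediate" error emitted above.  */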
23174 
23175 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
23176    insns with vec_merge.  */
23177 
23178 static rtx
23179 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
23180 				    rtx target)
23181 {
23182   rtx pat;
23183   tree arg0 = CALL_EXPR_ARG (exp, 0);
23184   rtx op1, op0 = expand_normal (arg0);
23185   enum machine_mode tmode = insn_data[icode].operand[0].mode;
23186   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
23187 
23188   if (optimize || !target
23189       || GET_MODE (target) != tmode
23190       || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
23191     target = gen_reg_rtx (tmode);
23192 
23193   if (VECTOR_MODE_P (mode0))
23194     op0 = safe_vector_operand (op0, mode0);
23195 
23196   if ((optimize && !register_operand (op0, mode0))
23197       || ! (*insn_data[icode].operand[1].predicate) (op0, mode0))
23198     op0 = copy_to_mode_reg (mode0, op0);
23199 
23200   op1 = op0;
23201   if (! (*insn_data[icode].operand[2].predicate) (op1, mode0))
23202     op1 = copy_to_mode_reg (mode0, op1);
23203 
23204   pat = GEN_FCN (icode) (target, op0, op1);
23205   if (! pat)
23206     return 0;
23207   emit_insn (pat);
23208   return target;
23209 }
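
/* A minimal usage sketch (illustrative only) of a vec_merge scalar unop
   of the kind expanded above; _mm_sqrt_ss is assumed to wrap
   __builtin_ia32_sqrtss, computing sqrt of lane 0 and passing the upper
   lanes through (assumes -msse).

     #include <xmmintrin.h>

     __m128
     sqrt_lane0 (__m128 v)
     {
       return _mm_sqrt_ss (v);
     }
*/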
23210 
23211 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
23212 
23213 static rtx
23214 ix86_expand_sse_compare (const struct builtin_description *d,
23215 			 tree exp, rtx target, bool swap)
23216 {
23217   rtx pat;
23218   tree arg0 = CALL_EXPR_ARG (exp, 0);
23219   tree arg1 = CALL_EXPR_ARG (exp, 1);
23220   rtx op0 = expand_normal (arg0);
23221   rtx op1 = expand_normal (arg1);
23222   rtx op2;
23223   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
23224   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
23225   enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
23226   enum rtx_code comparison = d->comparison;
23227 
23228   if (VECTOR_MODE_P (mode0))
23229     op0 = safe_vector_operand (op0, mode0);
23230   if (VECTOR_MODE_P (mode1))
23231     op1 = safe_vector_operand (op1, mode1);
23232 
23233   /* Swap operands if we have a comparison that isn't available in
23234      hardware.  */
23235   if (swap)
23236     {
23237       rtx tmp = gen_reg_rtx (mode1);
23238       emit_move_insn (tmp, op1);
23239       op1 = op0;
23240       op0 = tmp;
23241     }
23242 
23243   if (optimize || !target
23244       || GET_MODE (target) != tmode
23245       || ! (*insn_data[d->icode].operand[0].predicate) (target, tmode))
23246     target = gen_reg_rtx (tmode);
23247 
23248   if ((optimize && !register_operand (op0, mode0))
23249       || ! (*insn_data[d->icode].operand[1].predicate) (op0, mode0))
23250     op0 = copy_to_mode_reg (mode0, op0);
23251   if ((optimize && !register_operand (op1, mode1))
23252       || ! (*insn_data[d->icode].operand[2].predicate) (op1, mode1))
23253     op1 = copy_to_mode_reg (mode1, op1);
23254 
23255   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
23256   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
23257   if (! pat)
23258     return 0;
23259   emit_insn (pat);
23260   return target;
23261 }
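
/* A minimal usage sketch (illustrative only) of the user-visible shape
   of an SSE mask comparison like the ones built here; _mm_cmplt_ps
   wraps __builtin_ia32_cmpltps and yields an all-ones lane wherever the
   comparison holds (assumes -msse).

     #include <xmmintrin.h>

     __m128
     mask_lt (__m128 a, __m128 b)
     {
       return _mm_cmplt_ps (a, b);
     }
*/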
23262 
23263 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
23264 
23265 static rtx
23266 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
23267 		      rtx target)
23268 {
23269   rtx pat;
23270   tree arg0 = CALL_EXPR_ARG (exp, 0);
23271   tree arg1 = CALL_EXPR_ARG (exp, 1);
23272   rtx op0 = expand_normal (arg0);
23273   rtx op1 = expand_normal (arg1);
23274   enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
23275   enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
23276   enum rtx_code comparison = d->comparison;
23277 
23278   if (VECTOR_MODE_P (mode0))
23279     op0 = safe_vector_operand (op0, mode0);
23280   if (VECTOR_MODE_P (mode1))
23281     op1 = safe_vector_operand (op1, mode1);
23282 
23283   /* Swap operands if we have a comparison that isn't available in
23284      hardware.  */
23285   if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
23286     {
23287       rtx tmp = op1;
23288       op1 = op0;
23289       op0 = tmp;
23290     }
23291 
23292   target = gen_reg_rtx (SImode);
23293   emit_move_insn (target, const0_rtx);
23294   target = gen_rtx_SUBREG (QImode, target, 0);
23295 
23296   if ((optimize && !register_operand (op0, mode0))
23297       || !(*insn_data[d->icode].operand[0].predicate) (op0, mode0))
23298     op0 = copy_to_mode_reg (mode0, op0);
23299   if ((optimize && !register_operand (op1, mode1))
23300       || !(*insn_data[d->icode].operand[1].predicate) (op1, mode1))
23301     op1 = copy_to_mode_reg (mode1, op1);
23302 
23303   pat = GEN_FCN (d->icode) (op0, op1);
23304   if (! pat)
23305     return 0;
23306   emit_insn (pat);
23307   emit_insn (gen_rtx_SET (VOIDmode,
23308 			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
23309 			  gen_rtx_fmt_ee (comparison, QImode,
23310 					  SET_DEST (pat),
23311 					  const0_rtx)));
23312 
23313   return SUBREG_REG (target);
23314 }
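
/* A minimal usage sketch (illustrative only) of a comi builtin expanded
   above, which returns a 0/1 flag from a scalar compare; _mm_comilt_ss
   wraps __builtin_ia32_comilt (assumes -msse).

     #include <xmmintrin.h>

     int
     scalar_lt (__m128 a, __m128 b)
     {
       return _mm_comilt_ss (a, b);   // 1 iff a[0] < b[0]
     }
*/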
23315 
23316 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
23317 
23318 static rtx
23319 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
23320 		       rtx target)
23321 {
23322   rtx pat;
23323   tree arg0 = CALL_EXPR_ARG (exp, 0);
23324   tree arg1 = CALL_EXPR_ARG (exp, 1);
23325   rtx op0 = expand_normal (arg0);
23326   rtx op1 = expand_normal (arg1);
23327   enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
23328   enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
23329   enum rtx_code comparison = d->comparison;
23330 
23331   if (VECTOR_MODE_P (mode0))
23332     op0 = safe_vector_operand (op0, mode0);
23333   if (VECTOR_MODE_P (mode1))
23334     op1 = safe_vector_operand (op1, mode1);
23335 
23336   target = gen_reg_rtx (SImode);
23337   emit_move_insn (target, const0_rtx);
23338   target = gen_rtx_SUBREG (QImode, target, 0);
23339 
23340   if ((optimize && !register_operand (op0, mode0))
23341       || !(*insn_data[d->icode].operand[0].predicate) (op0, mode0))
23342     op0 = copy_to_mode_reg (mode0, op0);
23343   if ((optimize && !register_operand (op1, mode1))
23344       || !(*insn_data[d->icode].operand[1].predicate) (op1, mode1))
23345     op1 = copy_to_mode_reg (mode1, op1);
23346 
23347   pat = GEN_FCN (d->icode) (op0, op1);
23348   if (! pat)
23349     return 0;
23350   emit_insn (pat);
23351   emit_insn (gen_rtx_SET (VOIDmode,
23352 			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
23353 			  gen_rtx_fmt_ee (comparison, QImode,
23354 					  SET_DEST (pat),
23355 					  const0_rtx)));
23356 
23357   return SUBREG_REG (target);
23358 }
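
/* A minimal usage sketch (illustrative only) of a ptest builtin expanded
   above, which sets the flags and yields a 0/1 result; _mm_testz_si128
   wraps __builtin_ia32_ptestz128 (assumes -msse4.1).

     #include <smmintrin.h>

     int
     all_zero (__m128i v)
     {
       return _mm_testz_si128 (v, v);   // 1 iff every bit of v is zero
     }
*/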
23359 
23360 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
23361 
23362 static rtx
23363 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
23364 			  tree exp, rtx target)
23365 {
23366   rtx pat;
23367   tree arg0 = CALL_EXPR_ARG (exp, 0);
23368   tree arg1 = CALL_EXPR_ARG (exp, 1);
23369   tree arg2 = CALL_EXPR_ARG (exp, 2);
23370   tree arg3 = CALL_EXPR_ARG (exp, 3);
23371   tree arg4 = CALL_EXPR_ARG (exp, 4);
23372   rtx scratch0, scratch1;
23373   rtx op0 = expand_normal (arg0);
23374   rtx op1 = expand_normal (arg1);
23375   rtx op2 = expand_normal (arg2);
23376   rtx op3 = expand_normal (arg3);
23377   rtx op4 = expand_normal (arg4);
23378   enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
23379 
23380   tmode0 = insn_data[d->icode].operand[0].mode;
23381   tmode1 = insn_data[d->icode].operand[1].mode;
23382   modev2 = insn_data[d->icode].operand[2].mode;
23383   modei3 = insn_data[d->icode].operand[3].mode;
23384   modev4 = insn_data[d->icode].operand[4].mode;
23385   modei5 = insn_data[d->icode].operand[5].mode;
23386   modeimm = insn_data[d->icode].operand[6].mode;
23387 
23388   if (VECTOR_MODE_P (modev2))
23389     op0 = safe_vector_operand (op0, modev2);
23390   if (VECTOR_MODE_P (modev4))
23391     op2 = safe_vector_operand (op2, modev4);
23392 
23393   if (! (*insn_data[d->icode].operand[2].predicate) (op0, modev2))
23394     op0 = copy_to_mode_reg (modev2, op0);
23395   if (! (*insn_data[d->icode].operand[3].predicate) (op1, modei3))
23396     op1 = copy_to_mode_reg (modei3, op1);
23397   if ((optimize && !register_operand (op2, modev4))
23398       || !(*insn_data[d->icode].operand[4].predicate) (op2, modev4))
23399     op2 = copy_to_mode_reg (modev4, op2);
23400   if (! (*insn_data[d->icode].operand[5].predicate) (op3, modei5))
23401     op3 = copy_to_mode_reg (modei5, op3);
23402 
23403   if (! (*insn_data[d->icode].operand[6].predicate) (op4, modeimm))
23404     {
23405 	      error ("the fifth argument must be an 8-bit immediate");
23406       return const0_rtx;
23407     }
23408 
23409   if (d->code == IX86_BUILTIN_PCMPESTRI128)
23410     {
23411       if (optimize || !target
23412 	  || GET_MODE (target) != tmode0
23413 	  || ! (*insn_data[d->icode].operand[0].predicate) (target, tmode0))
23414 	target = gen_reg_rtx (tmode0);
23415 
23416       scratch1 = gen_reg_rtx (tmode1);
23417 
23418       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
23419     }
23420   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
23421     {
23422       if (optimize || !target
23423 	  || GET_MODE (target) != tmode1
23424 	  || ! (*insn_data[d->icode].operand[1].predicate) (target, tmode1))
23425 	target = gen_reg_rtx (tmode1);
23426 
23427       scratch0 = gen_reg_rtx (tmode0);
23428 
23429       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
23430     }
23431   else
23432     {
23433       gcc_assert (d->flag);
23434 
23435       scratch0 = gen_reg_rtx (tmode0);
23436       scratch1 = gen_reg_rtx (tmode1);
23437 
23438       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
23439     }
23440 
23441   if (! pat)
23442     return 0;
23443 
23444   emit_insn (pat);
23445 
23446   if (d->flag)
23447     {
23448       target = gen_reg_rtx (SImode);
23449       emit_move_insn (target, const0_rtx);
23450       target = gen_rtx_SUBREG (QImode, target, 0);
23451 
23452       emit_insn
23453 	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
23454 		      gen_rtx_fmt_ee (EQ, QImode,
23455 				      gen_rtx_REG ((enum machine_mode) d->flag,
23456 						   FLAGS_REG),
23457 				      const0_rtx)));
23458       return SUBREG_REG (target);
23459     }
23460   else
23461     return target;
23462 }
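
/* A minimal usage sketch (illustrative only) of a pcmpestri call with
   explicit string lengths, matching the five operands handled above;
   _mm_cmpestri wraps __builtin_ia32_pcmpestri128 (assumes -msse4.2).

     #include <nmmintrin.h>

     int
     find_any (__m128i hay, int hay_len, __m128i set, int set_len)
     {
       return _mm_cmpestri (set, set_len, hay, hay_len,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
     }
*/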
23463 
23464 
23465 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
23466 
23467 static rtx
23468 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
23469 			  tree exp, rtx target)
23470 {
23471   rtx pat;
23472   tree arg0 = CALL_EXPR_ARG (exp, 0);
23473   tree arg1 = CALL_EXPR_ARG (exp, 1);
23474   tree arg2 = CALL_EXPR_ARG (exp, 2);
23475   rtx scratch0, scratch1;
23476   rtx op0 = expand_normal (arg0);
23477   rtx op1 = expand_normal (arg1);
23478   rtx op2 = expand_normal (arg2);
23479   enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
23480 
23481   tmode0 = insn_data[d->icode].operand[0].mode;
23482   tmode1 = insn_data[d->icode].operand[1].mode;
23483   modev2 = insn_data[d->icode].operand[2].mode;
23484   modev3 = insn_data[d->icode].operand[3].mode;
23485   modeimm = insn_data[d->icode].operand[4].mode;
23486 
23487   if (VECTOR_MODE_P (modev2))
23488     op0 = safe_vector_operand (op0, modev2);
23489   if (VECTOR_MODE_P (modev3))
23490     op1 = safe_vector_operand (op1, modev3);
23491 
23492   if (! (*insn_data[d->icode].operand[2].predicate) (op0, modev2))
23493     op0 = copy_to_mode_reg (modev2, op0);
23494   if ((optimize && !register_operand (op1, modev3))
23495       || !(*insn_data[d->icode].operand[3].predicate) (op1, modev3))
23496     op1 = copy_to_mode_reg (modev3, op1);
23497 
23498   if (! (*insn_data[d->icode].operand[4].predicate) (op2, modeimm))
23499     {
23500 	      error ("the third argument must be an 8-bit immediate");
23501       return const0_rtx;
23502     }
23503 
23504   if (d->code == IX86_BUILTIN_PCMPISTRI128)
23505     {
23506       if (optimize || !target
23507 	  || GET_MODE (target) != tmode0
23508 	  || ! (*insn_data[d->icode].operand[0].predicate) (target, tmode0))
23509 	target = gen_reg_rtx (tmode0);
23510 
23511       scratch1 = gen_reg_rtx (tmode1);
23512 
23513       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
23514     }
23515   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
23516     {
23517       if (optimize || !target
23518 	  || GET_MODE (target) != tmode1
23519 	  || ! (*insn_data[d->icode].operand[1].predicate) (target, tmode1))
23520 	target = gen_reg_rtx (tmode1);
23521 
23522       scratch0 = gen_reg_rtx (tmode0);
23523 
23524       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
23525     }
23526   else
23527     {
23528       gcc_assert (d->flag);
23529 
23530       scratch0 = gen_reg_rtx (tmode0);
23531       scratch1 = gen_reg_rtx (tmode1);
23532 
23533       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
23534     }
23535 
23536   if (! pat)
23537     return 0;
23538 
23539   emit_insn (pat);
23540 
23541   if (d->flag)
23542     {
23543       target = gen_reg_rtx (SImode);
23544       emit_move_insn (target, const0_rtx);
23545       target = gen_rtx_SUBREG (QImode, target, 0);
23546 
23547       emit_insn
23548 	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
23549 		      gen_rtx_fmt_ee (EQ, QImode,
23550 				      gen_rtx_REG ((enum machine_mode) d->flag,
23551 						   FLAGS_REG),
23552 				      const0_rtx)));
23553       return SUBREG_REG (target);
23554     }
23555   else
23556     return target;
23557 }
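
/* A minimal usage sketch (illustrative only) of the implicit-length
   variant handled above; _mm_cmpistri wraps __builtin_ia32_pcmpistri128
   (assumes -msse4.2).

     #include <nmmintrin.h>

     int
     first_mismatch (__m128i a, __m128i b)
     {
       return _mm_cmpistri (a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH
                                  | _SIDD_NEGATIVE_POLARITY);
     }
*/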
23558 
23559 /* Subroutine of ix86_expand_builtin to take care of insns with a
23560    variable number of operands.  */
23561 
23562 static rtx
23563 ix86_expand_args_builtin (const struct builtin_description *d,
23564 			  tree exp, rtx target)
23565 {
23566   rtx pat, real_target;
23567   unsigned int i, nargs;
23568   unsigned int nargs_constant = 0;
23569   int num_memory = 0;
23570   struct
23571     {
23572       rtx op;
23573       enum machine_mode mode;
23574     } args[4];
23575   bool last_arg_count = false;
23576   enum insn_code icode = d->icode;
23577   const struct insn_data *insn_p = &insn_data[icode];
23578   enum machine_mode tmode = insn_p->operand[0].mode;
23579   enum machine_mode rmode = VOIDmode;
23580   bool swap = false;
23581   enum rtx_code comparison = d->comparison;
23582 
23583   switch ((enum ix86_builtin_func_type) d->flag)
23584     {
23585     case INT_FTYPE_V8SF_V8SF_PTEST:
23586     case INT_FTYPE_V4DI_V4DI_PTEST:
23587     case INT_FTYPE_V4DF_V4DF_PTEST:
23588     case INT_FTYPE_V4SF_V4SF_PTEST:
23589     case INT_FTYPE_V2DI_V2DI_PTEST:
23590     case INT_FTYPE_V2DF_V2DF_PTEST:
23591       return ix86_expand_sse_ptest (d, exp, target);
23592     case FLOAT128_FTYPE_FLOAT128:
23593     case FLOAT_FTYPE_FLOAT:
23594     case INT_FTYPE_INT:
23595     case UINT64_FTYPE_INT:
23596     case UINT16_FTYPE_UINT16:
23597     case INT64_FTYPE_INT64:
23598     case INT64_FTYPE_V4SF:
23599     case INT64_FTYPE_V2DF:
23600     case INT_FTYPE_V16QI:
23601     case INT_FTYPE_V8QI:
23602     case INT_FTYPE_V8SF:
23603     case INT_FTYPE_V4DF:
23604     case INT_FTYPE_V4SF:
23605     case INT_FTYPE_V2DF:
23606     case V16QI_FTYPE_V16QI:
23607     case V8SI_FTYPE_V8SF:
23608     case V8SI_FTYPE_V4SI:
23609     case V8HI_FTYPE_V8HI:
23610     case V8HI_FTYPE_V16QI:
23611     case V8QI_FTYPE_V8QI:
23612     case V8SF_FTYPE_V8SF:
23613     case V8SF_FTYPE_V8SI:
23614     case V8SF_FTYPE_V4SF:
23615     case V4SI_FTYPE_V4SI:
23616     case V4SI_FTYPE_V16QI:
23617     case V4SI_FTYPE_V4SF:
23618     case V4SI_FTYPE_V8SI:
23619     case V4SI_FTYPE_V8HI:
23620     case V4SI_FTYPE_V4DF:
23621     case V4SI_FTYPE_V2DF:
23622     case V4HI_FTYPE_V4HI:
23623     case V4DF_FTYPE_V4DF:
23624     case V4DF_FTYPE_V4SI:
23625     case V4DF_FTYPE_V4SF:
23626     case V4DF_FTYPE_V2DF:
23627     case V4SF_FTYPE_V4SF:
23628     case V4SF_FTYPE_V4SI:
23629     case V4SF_FTYPE_V8SF:
23630     case V4SF_FTYPE_V4DF:
23631     case V4SF_FTYPE_V2DF:
23632     case V2DI_FTYPE_V2DI:
23633     case V2DI_FTYPE_V16QI:
23634     case V2DI_FTYPE_V8HI:
23635     case V2DI_FTYPE_V4SI:
23636     case V2DF_FTYPE_V2DF:
23637     case V2DF_FTYPE_V4SI:
23638     case V2DF_FTYPE_V4DF:
23639     case V2DF_FTYPE_V4SF:
23640     case V2DF_FTYPE_V2SI:
23641     case V2SI_FTYPE_V2SI:
23642     case V2SI_FTYPE_V4SF:
23643     case V2SI_FTYPE_V2SF:
23644     case V2SI_FTYPE_V2DF:
23645     case V2SF_FTYPE_V2SF:
23646     case V2SF_FTYPE_V2SI:
23647       nargs = 1;
23648       break;
23649     case V4SF_FTYPE_V4SF_VEC_MERGE:
23650     case V2DF_FTYPE_V2DF_VEC_MERGE:
23651       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
23652     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
23653     case V16QI_FTYPE_V16QI_V16QI:
23654     case V16QI_FTYPE_V8HI_V8HI:
23655     case V8QI_FTYPE_V8QI_V8QI:
23656     case V8QI_FTYPE_V4HI_V4HI:
23657     case V8HI_FTYPE_V8HI_V8HI:
23658     case V8HI_FTYPE_V16QI_V16QI:
23659     case V8HI_FTYPE_V4SI_V4SI:
23660     case V8SF_FTYPE_V8SF_V8SF:
23661     case V8SF_FTYPE_V8SF_V8SI:
23662     case V4SI_FTYPE_V4SI_V4SI:
23663     case V4SI_FTYPE_V8HI_V8HI:
23664     case V4SI_FTYPE_V4SF_V4SF:
23665     case V4SI_FTYPE_V2DF_V2DF:
23666     case V4HI_FTYPE_V4HI_V4HI:
23667     case V4HI_FTYPE_V8QI_V8QI:
23668     case V4HI_FTYPE_V2SI_V2SI:
23669     case V4DF_FTYPE_V4DF_V4DF:
23670     case V4DF_FTYPE_V4DF_V4DI:
23671     case V4SF_FTYPE_V4SF_V4SF:
23672     case V4SF_FTYPE_V4SF_V4SI:
23673     case V4SF_FTYPE_V4SF_V2SI:
23674     case V4SF_FTYPE_V4SF_V2DF:
23675     case V4SF_FTYPE_V4SF_DI:
23676     case V4SF_FTYPE_V4SF_SI:
23677     case V2DI_FTYPE_V2DI_V2DI:
23678     case V2DI_FTYPE_V16QI_V16QI:
23679     case V2DI_FTYPE_V4SI_V4SI:
23680     case V2DI_FTYPE_V2DI_V16QI:
23681     case V2DI_FTYPE_V2DF_V2DF:
23682     case V2SI_FTYPE_V2SI_V2SI:
23683     case V2SI_FTYPE_V4HI_V4HI:
23684     case V2SI_FTYPE_V2SF_V2SF:
23685     case V2DF_FTYPE_V2DF_V2DF:
23686     case V2DF_FTYPE_V2DF_V4SF:
23687     case V2DF_FTYPE_V2DF_V2DI:
23688     case V2DF_FTYPE_V2DF_DI:
23689     case V2DF_FTYPE_V2DF_SI:
23690     case V2SF_FTYPE_V2SF_V2SF:
23691     case V1DI_FTYPE_V1DI_V1DI:
23692     case V1DI_FTYPE_V8QI_V8QI:
23693     case V1DI_FTYPE_V2SI_V2SI:
23694       if (comparison == UNKNOWN)
23695 	return ix86_expand_binop_builtin (icode, exp, target);
23696       nargs = 2;
23697       break;
23698     case V4SF_FTYPE_V4SF_V4SF_SWAP:
23699     case V2DF_FTYPE_V2DF_V2DF_SWAP:
23700       gcc_assert (comparison != UNKNOWN);
23701       nargs = 2;
23702       swap = true;
23703       break;
23704     case V8HI_FTYPE_V8HI_V8HI_COUNT:
23705     case V8HI_FTYPE_V8HI_SI_COUNT:
23706     case V4SI_FTYPE_V4SI_V4SI_COUNT:
23707     case V4SI_FTYPE_V4SI_SI_COUNT:
23708     case V4HI_FTYPE_V4HI_V4HI_COUNT:
23709     case V4HI_FTYPE_V4HI_SI_COUNT:
23710     case V2DI_FTYPE_V2DI_V2DI_COUNT:
23711     case V2DI_FTYPE_V2DI_SI_COUNT:
23712     case V2SI_FTYPE_V2SI_V2SI_COUNT:
23713     case V2SI_FTYPE_V2SI_SI_COUNT:
23714     case V1DI_FTYPE_V1DI_V1DI_COUNT:
23715     case V1DI_FTYPE_V1DI_SI_COUNT:
23716       nargs = 2;
23717       last_arg_count = true;
23718       break;
23719     case UINT64_FTYPE_UINT64_UINT64:
23720     case UINT_FTYPE_UINT_UINT:
23721     case UINT_FTYPE_UINT_USHORT:
23722     case UINT_FTYPE_UINT_UCHAR:
23723     case UINT16_FTYPE_UINT16_INT:
23724     case UINT8_FTYPE_UINT8_INT:
23725       nargs = 2;
23726       break;
23727     case V2DI_FTYPE_V2DI_INT_CONVERT:
23728       nargs = 2;
23729       rmode = V1TImode;
23730       nargs_constant = 1;
23731       break;
23732     case V8HI_FTYPE_V8HI_INT:
23733     case V8SF_FTYPE_V8SF_INT:
23734     case V4SI_FTYPE_V4SI_INT:
23735     case V4SI_FTYPE_V8SI_INT:
23736     case V4HI_FTYPE_V4HI_INT:
23737     case V4DF_FTYPE_V4DF_INT:
23738     case V4SF_FTYPE_V4SF_INT:
23739     case V4SF_FTYPE_V8SF_INT:
23740     case V2DI_FTYPE_V2DI_INT:
23741     case V2DF_FTYPE_V2DF_INT:
23742     case V2DF_FTYPE_V4DF_INT:
23743       nargs = 2;
23744       nargs_constant = 1;
23745       break;
23746     case V16QI_FTYPE_V16QI_V16QI_V16QI:
23747     case V8SF_FTYPE_V8SF_V8SF_V8SF:
23748     case V4DF_FTYPE_V4DF_V4DF_V4DF:
23749     case V4SF_FTYPE_V4SF_V4SF_V4SF:
23750     case V2DF_FTYPE_V2DF_V2DF_V2DF:
23751       nargs = 3;
23752       break;
23753     case V16QI_FTYPE_V16QI_V16QI_INT:
23754     case V8HI_FTYPE_V8HI_V8HI_INT:
23755     case V8SI_FTYPE_V8SI_V8SI_INT:
23756     case V8SI_FTYPE_V8SI_V4SI_INT:
23757     case V8SF_FTYPE_V8SF_V8SF_INT:
23758     case V8SF_FTYPE_V8SF_V4SF_INT:
23759     case V4SI_FTYPE_V4SI_V4SI_INT:
23760     case V4DF_FTYPE_V4DF_V4DF_INT:
23761     case V4DF_FTYPE_V4DF_V2DF_INT:
23762     case V4SF_FTYPE_V4SF_V4SF_INT:
23763     case V2DI_FTYPE_V2DI_V2DI_INT:
23764     case V2DF_FTYPE_V2DF_V2DF_INT:
23765       nargs = 3;
23766       nargs_constant = 1;
23767       break;
23768     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
23769       nargs = 3;
23770       rmode = V2DImode;
23771       nargs_constant = 1;
23772       break;
23773     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
23774       nargs = 3;
23775       rmode = DImode;
23776       nargs_constant = 1;
23777       break;
23778     case V2DI_FTYPE_V2DI_UINT_UINT:
23779       nargs = 3;
23780       nargs_constant = 2;
23781       break;
23782     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
23783     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
23784     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
23785     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
23786       nargs = 4;
23787       nargs_constant = 1;
23788       break;
23789     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
23790       nargs = 4;
23791       nargs_constant = 2;
23792       break;
23793     default:
23794       gcc_unreachable ();
23795     }
23796 
23797   gcc_assert (nargs <= ARRAY_SIZE (args));
23798 
23799   if (comparison != UNKNOWN)
23800     {
23801       gcc_assert (nargs == 2);
23802       return ix86_expand_sse_compare (d, exp, target, swap);
23803     }
23804 
23805   if (rmode == VOIDmode || rmode == tmode)
23806     {
23807       if (optimize
23808 	  || target == 0
23809 	  || GET_MODE (target) != tmode
23810 	  || ! (*insn_p->operand[0].predicate) (target, tmode))
23811 	target = gen_reg_rtx (tmode);
23812       real_target = target;
23813     }
23814   else
23815     {
23816       target = gen_reg_rtx (rmode);
23817       real_target = simplify_gen_subreg (tmode, target, rmode, 0);
23818     }
23819 
23820   for (i = 0; i < nargs; i++)
23821     {
23822       tree arg = CALL_EXPR_ARG (exp, i);
23823       rtx op = expand_normal (arg);
23824       enum machine_mode mode = insn_p->operand[i + 1].mode;
23825       bool match = (*insn_p->operand[i + 1].predicate) (op, mode);
23826 
23827       if (last_arg_count && (i + 1) == nargs)
23828 	{
23829 	  /* SIMD shift insns take either an 8-bit immediate or a
23830 	     register as the shift count, but builtin functions take an
23831 	     int.  If the count doesn't match, we put it in a register.  */
23832 	  if (!match)
23833 	    {
23834 	      op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
23835 	      if (!(*insn_p->operand[i + 1].predicate) (op, mode))
23836 		op = copy_to_reg (op);
23837 	    }
23838 	}
23839       else if ((nargs - i) <= nargs_constant)
23840 	{
23841 	  if (!match)
23842 	    switch (icode)
23843 	      {
23844 	      case CODE_FOR_sse4_1_roundpd:
23845 	      case CODE_FOR_sse4_1_roundps:
23846 	      case CODE_FOR_sse4_1_roundsd:
23847 	      case CODE_FOR_sse4_1_roundss:
23848 	      case CODE_FOR_sse4_1_blendps:
23849 	      case CODE_FOR_avx_blendpd256:
23850 	      case CODE_FOR_avx_vpermilv4df:
23851 	      case CODE_FOR_avx_roundpd256:
23852 	      case CODE_FOR_avx_roundps256:
23853 		error ("the last argument must be a 4-bit immediate");
23854 		return const0_rtx;
23855 
23856 	      case CODE_FOR_sse4_1_blendpd:
23857 	      case CODE_FOR_avx_vpermilv2df:
23858 	      case CODE_FOR_xop_vpermil2v2df3:
23859 	      case CODE_FOR_xop_vpermil2v4sf3:
23860 	      case CODE_FOR_xop_vpermil2v4df3:
23861 	      case CODE_FOR_xop_vpermil2v8sf3:
23862 		error ("the last argument must be a 2-bit immediate");
23863 		return const0_rtx;
23864 
23865 	      case CODE_FOR_avx_vextractf128v4df:
23866 	      case CODE_FOR_avx_vextractf128v8sf:
23867 	      case CODE_FOR_avx_vextractf128v8si:
23868 	      case CODE_FOR_avx_vinsertf128v4df:
23869 	      case CODE_FOR_avx_vinsertf128v8sf:
23870 	      case CODE_FOR_avx_vinsertf128v8si:
23871 		error ("the last argument must be a 1-bit immediate");
23872 		return const0_rtx;
23873 
23874 	      case CODE_FOR_avx_cmpsdv2df3:
23875 	      case CODE_FOR_avx_cmpssv4sf3:
23876 	      case CODE_FOR_avx_cmppdv2df3:
23877 	      case CODE_FOR_avx_cmppsv4sf3:
23878 	      case CODE_FOR_avx_cmppdv4df3:
23879 	      case CODE_FOR_avx_cmppsv8sf3:
23880 		error ("the last argument must be a 5-bit immediate");
23881 		return const0_rtx;
23882 
23883 	      default:
23884 		switch (nargs_constant)
23885 		  {
23886 		  case 2:
23887 		    if ((nargs - i) == nargs_constant)
23888 		      {
23889 			error ("the next to last argument must be an 8-bit immediate");
23890 			break;
23891 		      }
		    /* FALLTHRU */
23892 		  case 1:
23893 		    error ("the last argument must be an 8-bit immediate");
23894 		    break;
23895 		  default:
23896 		    gcc_unreachable ();
23897 		  }
23898 		return const0_rtx;
23899 	      }
23900 	}
23901       else
23902 	{
23903 	  if (VECTOR_MODE_P (mode))
23904 	    op = safe_vector_operand (op, mode);
23905 
23906 	  /* If we aren't optimizing, only allow one memory operand to
23907 	     be generated.  */
23908 	  if (memory_operand (op, mode))
23909 	    num_memory++;
23910 
23911 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
23912 	    {
23913 	      if (optimize || !match || num_memory > 1)
23914 		op = copy_to_mode_reg (mode, op);
23915 	    }
23916 	  else
23917 	    {
23918 	      op = copy_to_reg (op);
23919 	      op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
23920 	    }
23921 	}
23922 
23923       args[i].op = op;
23924       args[i].mode = mode;
23925     }
23926 
23927   switch (nargs)
23928     {
23929     case 1:
23930       pat = GEN_FCN (icode) (real_target, args[0].op);
23931       break;
23932     case 2:
23933       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
23934       break;
23935     case 3:
23936       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
23937 			     args[2].op);
23938       break;
23939     case 4:
23940       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
23941 			     args[2].op, args[3].op);
23942       break;
23943     default:
23944       gcc_unreachable ();
23945     }
23946 
23947   if (! pat)
23948     return 0;
23949 
23950   emit_insn (pat);
23951   return target;
23952 }
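
/* As an example of the COUNT handling above: a descriptor such as
   V8HI_FTYPE_V8HI_SI_COUNT sets nargs = 2 and last_arg_count = true, which
   fits shift-style builtins whose callers look roughly like

     __m128i f (__m128i x, int n) { return _mm_slli_epi16 (x, n); }

   (the intrinsic shown is only a plausible caller).  The int count is used
   directly when it already satisfies the insn's operand predicate, and is
   otherwise narrowed to SImode and copied into a register.  */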
23953 
23954 /* Subroutine of ix86_expand_builtin to take care of special insns
23955    with a variable number of operands.  */
23956 
23957 static rtx
23958 ix86_expand_special_args_builtin (const struct builtin_description *d,
23959 				    tree exp, rtx target)
23960 {
23961   tree arg;
23962   rtx pat, op;
23963   unsigned int i, nargs, arg_adjust, memory;
23964   struct
23965     {
23966       rtx op;
23967       enum machine_mode mode;
23968     } args[3];
23969   enum insn_code icode = d->icode;
23970   bool last_arg_constant = false;
23971   const struct insn_data *insn_p = &insn_data[icode];
23972   enum machine_mode tmode = insn_p->operand[0].mode;
23973   enum { load, store } klass;
23974 
23975   switch ((enum ix86_builtin_func_type) d->flag)
23976     {
23977     case VOID_FTYPE_VOID:
23978       emit_insn (GEN_FCN (icode) (target));
23979       return 0;
23980     case UINT64_FTYPE_VOID:
23981       nargs = 0;
23982       klass = load;
23983       memory = 0;
23984       break;
23985     case UINT64_FTYPE_PUNSIGNED:
23986     case V2DI_FTYPE_PV2DI:
23987     case V32QI_FTYPE_PCCHAR:
23988     case V16QI_FTYPE_PCCHAR:
23989     case V8SF_FTYPE_PCV4SF:
23990     case V8SF_FTYPE_PCFLOAT:
23991     case V4SF_FTYPE_PCFLOAT:
23992     case V4DF_FTYPE_PCV2DF:
23993     case V4DF_FTYPE_PCDOUBLE:
23994     case V2DF_FTYPE_PCDOUBLE:
23995     case VOID_FTYPE_PVOID:
23996       nargs = 1;
23997       klass = load;
23998       memory = 0;
23999       break;
24000     case VOID_FTYPE_PV2SF_V4SF:
24001     case VOID_FTYPE_PV4DI_V4DI:
24002     case VOID_FTYPE_PV2DI_V2DI:
24003     case VOID_FTYPE_PCHAR_V32QI:
24004     case VOID_FTYPE_PCHAR_V16QI:
24005     case VOID_FTYPE_PFLOAT_V8SF:
24006     case VOID_FTYPE_PFLOAT_V4SF:
24007     case VOID_FTYPE_PDOUBLE_V4DF:
24008     case VOID_FTYPE_PDOUBLE_V2DF:
24009     case VOID_FTYPE_PULONGLONG_ULONGLONG:
24010     case VOID_FTYPE_PINT_INT:
24011       nargs = 1;
24012       klass = store;
24013       /* Reserve memory operand for target.  */
24014       memory = ARRAY_SIZE (args);
24015       break;
24016     case V4SF_FTYPE_V4SF_PCV2SF:
24017     case V2DF_FTYPE_V2DF_PCDOUBLE:
24018       nargs = 2;
24019       klass = load;
24020       memory = 1;
24021       break;
24022     case V8SF_FTYPE_PCV8SF_V8SI:
24023     case V4DF_FTYPE_PCV4DF_V4DI:
24024     case V4SF_FTYPE_PCV4SF_V4SI:
24025     case V2DF_FTYPE_PCV2DF_V2DI:
24026       nargs = 2;
24027       klass = load;
24028       memory = 0;
24029       break;
24030     case VOID_FTYPE_PV8SF_V8SI_V8SF:
24031     case VOID_FTYPE_PV4DF_V4DI_V4DF:
24032     case VOID_FTYPE_PV4SF_V4SI_V4SF:
24033     case VOID_FTYPE_PV2DF_V2DI_V2DF:
24034       nargs = 2;
24035       klass = store;
24036       /* Reserve memory operand for target.  */
24037       memory = ARRAY_SIZE (args);
24038       break;
24039     case VOID_FTYPE_UINT_UINT_UINT:
24040     case VOID_FTYPE_UINT64_UINT_UINT:
24041     case UCHAR_FTYPE_UINT_UINT_UINT:
24042     case UCHAR_FTYPE_UINT64_UINT_UINT:
24043       nargs = 3;
24044       klass = load;
24045       memory = ARRAY_SIZE (args);
24046       last_arg_constant = true;
24047       break;
24048     default:
24049       gcc_unreachable ();
24050     }
24051 
24052   gcc_assert (nargs <= ARRAY_SIZE (args));
24053 
24054   if (klass == store)
24055     {
24056       arg = CALL_EXPR_ARG (exp, 0);
24057       op = expand_normal (arg);
24058       gcc_assert (target == 0);
24059       target = gen_rtx_MEM (tmode, copy_to_mode_reg (Pmode, op));
24060       arg_adjust = 1;
24061     }
24062   else
24063     {
24064       arg_adjust = 0;
24065       if (optimize
24066 	  || target == 0
24067 	  || !register_operand (target, tmode)
24068 	  || GET_MODE (target) != tmode)
24069 	target = gen_reg_rtx (tmode);
24070     }
24071 
24072   for (i = 0; i < nargs; i++)
24073     {
24074       enum machine_mode mode = insn_p->operand[i + 1].mode;
24075       bool match;
24076 
24077       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
24078       op = expand_normal (arg);
24079       match = (*insn_p->operand[i + 1].predicate) (op, mode);
24080 
24081       if (last_arg_constant && (i + 1) == nargs)
24082 	{
24083 	  if (!match)
24084 	    {
24085 	      if (icode == CODE_FOR_lwp_lwpvalsi3
24086 		  || icode == CODE_FOR_lwp_lwpinssi3
24087 		  || icode == CODE_FOR_lwp_lwpvaldi3
24088 		  || icode == CODE_FOR_lwp_lwpinsdi3)
24089 		error ("the last argument must be a 32-bit immediate");
24090 	      else
24091 		error ("the last argument must be an 8-bit immediate");
24092 	      return const0_rtx;
24093 	    }
24094 	}
24095       else
24096 	{
24097 	  if (i == memory)
24098 	    {
24099 	      /* This must be the memory operand.  */
24100 	      op = gen_rtx_MEM (mode, copy_to_mode_reg (Pmode, op));
24101 	      gcc_assert (GET_MODE (op) == mode
24102 			  || GET_MODE (op) == VOIDmode);
24103 	    }
24104 	  else
24105 	    {
24106 	      /* This must be a register.  */
24107 	      if (VECTOR_MODE_P (mode))
24108 		op = safe_vector_operand (op, mode);
24109 
24110 	      gcc_assert (GET_MODE (op) == mode
24111 			  || GET_MODE (op) == VOIDmode);
24112 	      op = copy_to_mode_reg (mode, op);
24113 	    }
24114 	}
24115 
24116       args[i].op = op;
24117       args[i].mode = mode;
24118     }
24119 
24120   switch (nargs)
24121     {
24122     case 0:
24123       pat = GEN_FCN (icode) (target);
24124       break;
24125     case 1:
24126       pat = GEN_FCN (icode) (target, args[0].op);
24127       break;
24128     case 2:
24129       pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
24130       break;
24131     case 3:
24132       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
24133       break;
24134     default:
24135       gcc_unreachable ();
24136     }
24137 
24138   if (! pat)
24139     return 0;
24140   emit_insn (pat);
24141   return klass == store ? 0 : target;
24142 }
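
/* As an example of the store class above: descriptors such as
   VOID_FTYPE_PFLOAT_V4SF take the destination pointer as the first call
   argument, roughly the shape used by an unaligned store intrinsic like

     void f (float *p, __m128 v) { _mm_storeu_ps (p, v); }

   (the intrinsic shown is only a plausible caller).  The pointer is
   expanded first, wrapped in a MEM of the insn's operand 0 mode to become
   TARGET, and arg_adjust = 1 skips it when the remaining operands are
   collected.  */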
24143 
24144 /* Return the integer constant in ARG.  Constrain it to be in the range
24145    of the subparts of VEC_TYPE; issue an error if not.  */
24146 
24147 static int
24148 get_element_number (tree vec_type, tree arg)
24149 {
24150   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
24151 
24152   if (!host_integerp (arg, 1)
24153       || (elt = tree_low_cst (arg, 1), elt > max))
24154     {
24155       error ("selector must be an integer constant in the range 0..%wi", max);
24156       return 0;
24157     }
24158 
24159   return elt;
24160 }
24161 
24162 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
24163    ix86_expand_vector_init.  We DO have language-level syntax for this, in
24164    the form of  (type){ init-list }.  Except that since we can't place emms
24165    instructions from inside the compiler, we can't allow the use of MMX
24166    registers unless the user explicitly asks for it.  So we do *not* define
24167    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
24168    we have builtins invoked by mmintrin.h that give us license to emit
24169    these sorts of instructions.  */
24170 
24171 static rtx
24172 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
24173 {
24174   enum machine_mode tmode = TYPE_MODE (type);
24175   enum machine_mode inner_mode = GET_MODE_INNER (tmode);
24176   int i, n_elt = GET_MODE_NUNITS (tmode);
24177   rtvec v = rtvec_alloc (n_elt);
24178 
24179   gcc_assert (VECTOR_MODE_P (tmode));
24180   gcc_assert (call_expr_nargs (exp) == n_elt);
24181 
24182   for (i = 0; i < n_elt; ++i)
24183     {
24184       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
24185       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
24186     }
24187 
24188   if (!target || !register_operand (target, tmode))
24189     target = gen_reg_rtx (tmode);
24190 
24191   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
24192   return target;
24193 }
24194 
24195 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
24196    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
24197    had a language-level syntax for referencing vector elements.  */
24198 
24199 static rtx
24200 ix86_expand_vec_ext_builtin (tree exp, rtx target)
24201 {
24202   enum machine_mode tmode, mode0;
24203   tree arg0, arg1;
24204   int elt;
24205   rtx op0;
24206 
24207   arg0 = CALL_EXPR_ARG (exp, 0);
24208   arg1 = CALL_EXPR_ARG (exp, 1);
24209 
24210   op0 = expand_normal (arg0);
24211   elt = get_element_number (TREE_TYPE (arg0), arg1);
24212 
24213   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
24214   mode0 = TYPE_MODE (TREE_TYPE (arg0));
24215   gcc_assert (VECTOR_MODE_P (mode0));
24216 
24217   op0 = force_reg (mode0, op0);
24218 
24219   if (optimize || !target || !register_operand (target, tmode))
24220     target = gen_reg_rtx (tmode);
24221 
24222   ix86_expand_vector_extract (true, target, op0, elt);
24223 
24224   return target;
24225 }
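
/* As an example: an extraction such as

     int f (__m128i x) { return _mm_extract_epi16 (x, 3); }

   plausibly reaches this function via __builtin_ia32_vec_ext_v8hi; the
   selector 3 is checked by get_element_number against the 8 subparts of
   the V8HI argument before ix86_expand_vector_extract emits the insn.  */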
24226 
24227 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
24228    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
24229    a language-level syntax for referencing vector elements.  */
24230 
24231 static rtx
24232 ix86_expand_vec_set_builtin (tree exp)
24233 {
24234   enum machine_mode tmode, mode1;
24235   tree arg0, arg1, arg2;
24236   int elt;
24237   rtx op0, op1, target;
24238 
24239   arg0 = CALL_EXPR_ARG (exp, 0);
24240   arg1 = CALL_EXPR_ARG (exp, 1);
24241   arg2 = CALL_EXPR_ARG (exp, 2);
24242 
24243   tmode = TYPE_MODE (TREE_TYPE (arg0));
24244   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
24245   gcc_assert (VECTOR_MODE_P (tmode));
24246 
24247   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
24248   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
24249   elt = get_element_number (TREE_TYPE (arg0), arg2);
24250 
24251   if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
24252     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
24253 
24254   op0 = force_reg (tmode, op0);
24255   op1 = force_reg (mode1, op1);
24256 
24257   /* OP0 is the source of these builtin functions and shouldn't be
24258      modified.  Create a copy, use it and return it as target.  */
24259   target = gen_reg_rtx (tmode);
24260   emit_move_insn (target, op0);
24261   ix86_expand_vector_set (true, target, op1, elt);
24262 
24263   return target;
24264 }
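
/* As an example: an insertion such as

     __m128i f (__m128i x, int v) { return _mm_insert_epi16 (x, v, 3); }

   plausibly reaches this function via __builtin_ia32_vec_set_v8hi.  OP0 is
   first copied into a fresh register, and that copy is both modified and
   returned, so the source vector itself stays unchanged.  */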
24265 
24266 /* Expand an expression EXP that calls a built-in function,
24267    with result going to TARGET if that's convenient
24268    (and in mode MODE if that's convenient).
24269    SUBTARGET may be used as the target for computing one of EXP's operands.
24270    IGNORE is nonzero if the value is to be ignored.  */
24271 
24272 static rtx
24273 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
24274 		     enum machine_mode mode ATTRIBUTE_UNUSED,
24275 		     int ignore ATTRIBUTE_UNUSED)
24276 {
24277   const struct builtin_description *d;
24278   size_t i;
24279   enum insn_code icode;
24280   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
24281   tree arg0, arg1, arg2;
24282   rtx op0, op1, op2, pat;
24283   enum machine_mode mode0, mode1, mode2;
24284   unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
24285 
24286   /* Determine whether the builtin function is available under the current ISA.
24287      Originally the builtin was not created if it wasn't applicable to the
24288      current ISA based on the command line switches.  With function specific
24289      options, we need to check in the context of the function making the call
24290      whether it is supported.  */
24291   if (ix86_builtins_isa[fcode].isa
24292       && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
24293     {
24294       char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
24295 				       NULL, NULL, false);
24296 
24297       if (!opts)
24298 	error ("%qE needs unknown isa option", fndecl);
24299       else
24300 	{
24301 	  gcc_assert (opts != NULL);
24302 	  error ("%qE needs isa option %s", fndecl, opts);
24303 	  free (opts);
24304 	}
24305       return const0_rtx;
24306     }
24307 
24308   switch (fcode)
24309     {
24310     case IX86_BUILTIN_MASKMOVQ:
24311     case IX86_BUILTIN_MASKMOVDQU:
24312       icode = (fcode == IX86_BUILTIN_MASKMOVQ
24313 	       ? CODE_FOR_mmx_maskmovq
24314 	       : CODE_FOR_sse2_maskmovdqu);
24315       /* Note the arg order is different from the operand order.  */
24316       arg1 = CALL_EXPR_ARG (exp, 0);
24317       arg2 = CALL_EXPR_ARG (exp, 1);
24318       arg0 = CALL_EXPR_ARG (exp, 2);
24319       op0 = expand_normal (arg0);
24320       op1 = expand_normal (arg1);
24321       op2 = expand_normal (arg2);
24322       mode0 = insn_data[icode].operand[0].mode;
24323       mode1 = insn_data[icode].operand[1].mode;
24324       mode2 = insn_data[icode].operand[2].mode;
24325 
24326       op0 = force_reg (Pmode, op0);
24327       op0 = gen_rtx_MEM (mode1, op0);
24328 
24329       if (! (*insn_data[icode].operand[0].predicate) (op0, mode0))
24330 	op0 = copy_to_mode_reg (mode0, op0);
24331       if (! (*insn_data[icode].operand[1].predicate) (op1, mode1))
24332 	op1 = copy_to_mode_reg (mode1, op1);
24333       if (! (*insn_data[icode].operand[2].predicate) (op2, mode2))
24334 	op2 = copy_to_mode_reg (mode2, op2);
24335       pat = GEN_FCN (icode) (op0, op1, op2);
24336       if (! pat)
24337 	return 0;
24338       emit_insn (pat);
24339       return 0;
24340 
24341     case IX86_BUILTIN_LDMXCSR:
24342       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
24343       target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
24344       emit_move_insn (target, op0);
24345       emit_insn (gen_sse_ldmxcsr (target));
24346       return 0;
24347 
24348     case IX86_BUILTIN_STMXCSR:
24349       target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
24350       emit_insn (gen_sse_stmxcsr (target));
24351       return copy_to_mode_reg (SImode, target);
24352 
24353     case IX86_BUILTIN_CLFLUSH:
24354       arg0 = CALL_EXPR_ARG (exp, 0);
24355       op0 = expand_normal (arg0);
24356       icode = CODE_FOR_sse2_clflush;
24357       if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode))
24358 	op0 = copy_to_mode_reg (Pmode, op0);
24359 
24360       emit_insn (gen_sse2_clflush (op0));
24361       return 0;
24362 
24363     case IX86_BUILTIN_MONITOR:
24364       arg0 = CALL_EXPR_ARG (exp, 0);
24365       arg1 = CALL_EXPR_ARG (exp, 1);
24366       arg2 = CALL_EXPR_ARG (exp, 2);
24367       op0 = expand_normal (arg0);
24368       op1 = expand_normal (arg1);
24369       op2 = expand_normal (arg2);
24370       if (!REG_P (op0))
24371 	op0 = copy_to_mode_reg (Pmode, op0);
24372       if (!REG_P (op1))
24373 	op1 = copy_to_mode_reg (SImode, op1);
24374       if (!REG_P (op2))
24375 	op2 = copy_to_mode_reg (SImode, op2);
24376       emit_insn ((*ix86_gen_monitor) (op0, op1, op2));
24377       return 0;
24378 
24379     case IX86_BUILTIN_MWAIT:
24380       arg0 = CALL_EXPR_ARG (exp, 0);
24381       arg1 = CALL_EXPR_ARG (exp, 1);
24382       op0 = expand_normal (arg0);
24383       op1 = expand_normal (arg1);
24384       if (!REG_P (op0))
24385 	op0 = copy_to_mode_reg (SImode, op0);
24386       if (!REG_P (op1))
24387 	op1 = copy_to_mode_reg (SImode, op1);
24388       emit_insn (gen_sse3_mwait (op0, op1));
24389       return 0;
24390 
24391     case IX86_BUILTIN_VEC_INIT_V2SI:
24392     case IX86_BUILTIN_VEC_INIT_V4HI:
24393     case IX86_BUILTIN_VEC_INIT_V8QI:
24394       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
24395 
24396     case IX86_BUILTIN_VEC_EXT_V2DF:
24397     case IX86_BUILTIN_VEC_EXT_V2DI:
24398     case IX86_BUILTIN_VEC_EXT_V4SF:
24399     case IX86_BUILTIN_VEC_EXT_V4SI:
24400     case IX86_BUILTIN_VEC_EXT_V8HI:
24401     case IX86_BUILTIN_VEC_EXT_V2SI:
24402     case IX86_BUILTIN_VEC_EXT_V4HI:
24403     case IX86_BUILTIN_VEC_EXT_V16QI:
24404       return ix86_expand_vec_ext_builtin (exp, target);
24405 
24406     case IX86_BUILTIN_VEC_SET_V2DI:
24407     case IX86_BUILTIN_VEC_SET_V4SF:
24408     case IX86_BUILTIN_VEC_SET_V4SI:
24409     case IX86_BUILTIN_VEC_SET_V8HI:
24410     case IX86_BUILTIN_VEC_SET_V4HI:
24411     case IX86_BUILTIN_VEC_SET_V16QI:
24412       return ix86_expand_vec_set_builtin (exp);
24413 
24414     case IX86_BUILTIN_VEC_PERM_V2DF:
24415     case IX86_BUILTIN_VEC_PERM_V4SF:
24416     case IX86_BUILTIN_VEC_PERM_V2DI:
24417     case IX86_BUILTIN_VEC_PERM_V4SI:
24418     case IX86_BUILTIN_VEC_PERM_V8HI:
24419     case IX86_BUILTIN_VEC_PERM_V16QI:
24420     case IX86_BUILTIN_VEC_PERM_V2DI_U:
24421     case IX86_BUILTIN_VEC_PERM_V4SI_U:
24422     case IX86_BUILTIN_VEC_PERM_V8HI_U:
24423     case IX86_BUILTIN_VEC_PERM_V16QI_U:
24424     case IX86_BUILTIN_VEC_PERM_V4DF:
24425     case IX86_BUILTIN_VEC_PERM_V8SF:
24426       return ix86_expand_vec_perm_builtin (exp);
24427 
24428     case IX86_BUILTIN_INFQ:
24429     case IX86_BUILTIN_HUGE_VALQ:
24430       {
24431 	REAL_VALUE_TYPE inf;
24432 	rtx tmp;
24433 
24434 	real_inf (&inf);
24435 	tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
24436 
24437 	tmp = validize_mem (force_const_mem (mode, tmp));
24438 
24439 	if (target == 0)
24440 	  target = gen_reg_rtx (mode);
24441 
24442 	emit_move_insn (target, tmp);
24443 	return target;
24444       }
24445 
24446     case IX86_BUILTIN_LLWPCB:
24447       arg0 = CALL_EXPR_ARG (exp, 0);
24448       op0 = expand_normal (arg0);
24449       icode = CODE_FOR_lwp_llwpcb;
24450       if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode))
24451 	op0 = copy_to_mode_reg (Pmode, op0);
24452       emit_insn (gen_lwp_llwpcb (op0));
24453       return 0;
24454 
24455     case IX86_BUILTIN_SLWPCB:
24456       icode = CODE_FOR_lwp_slwpcb;
24457       if (!target
24458 	  || ! (*insn_data[icode].operand[0].predicate) (target, Pmode))
24459 	target = gen_reg_rtx (Pmode);
24460       emit_insn (gen_lwp_slwpcb (target));
24461       return target;
24462 
24463     default:
24464       break;
24465     }
24466 
24467   for (i = 0, d = bdesc_special_args;
24468        i < ARRAY_SIZE (bdesc_special_args);
24469        i++, d++)
24470     if (d->code == fcode)
24471       return ix86_expand_special_args_builtin (d, exp, target);
24472 
24473   for (i = 0, d = bdesc_args;
24474        i < ARRAY_SIZE (bdesc_args);
24475        i++, d++)
24476     if (d->code == fcode)
24477       switch (fcode)
24478 	{
24479 	case IX86_BUILTIN_FABSQ:
24480 	case IX86_BUILTIN_COPYSIGNQ:
24481 	  if (!TARGET_SSE2)
24482 	    /* Emit a normal call if SSE2 isn't available.  */
24483 	    return expand_call (exp, target, ignore);
	  /* FALLTHRU */
24484 	default:
24485 	  return ix86_expand_args_builtin (d, exp, target);
24486 	}
24487 
24488   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
24489     if (d->code == fcode)
24490       return ix86_expand_sse_comi (d, exp, target);
24491 
24492   for (i = 0, d = bdesc_pcmpestr;
24493        i < ARRAY_SIZE (bdesc_pcmpestr);
24494        i++, d++)
24495     if (d->code == fcode)
24496       return ix86_expand_sse_pcmpestr (d, exp, target);
24497 
24498   for (i = 0, d = bdesc_pcmpistr;
24499        i < ARRAY_SIZE (bdesc_pcmpistr);
24500        i++, d++)
24501     if (d->code == fcode)
24502       return ix86_expand_sse_pcmpistr (d, exp, target);
24503 
24504   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
24505     if (d->code == fcode)
24506       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
24507 					    (enum ix86_builtin_func_type)
24508 					    d->flag, d->comparison);
24509 
24510   gcc_unreachable ();
24511 }
24512 
24513 /* Returns a function decl for a vectorized version of the builtin function
24514    FNDECL, taking a vector of type TYPE_IN and returning a vector of type
24515    TYPE_OUT, or NULL_TREE if such a version is not available.  */
24516 
24517 static tree
24518 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
24519 				  tree type_in)
24520 {
24521   enum machine_mode in_mode, out_mode;
24522   int in_n, out_n;
24523   enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
24524 
24525   if (TREE_CODE (type_out) != VECTOR_TYPE
24526       || TREE_CODE (type_in) != VECTOR_TYPE
24527       || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
24528     return NULL_TREE;
24529 
24530   out_mode = TYPE_MODE (TREE_TYPE (type_out));
24531   out_n = TYPE_VECTOR_SUBPARTS (type_out);
24532   in_mode = TYPE_MODE (TREE_TYPE (type_in));
24533   in_n = TYPE_VECTOR_SUBPARTS (type_in);
24534 
24535   switch (fn)
24536     {
24537     case BUILT_IN_SQRT:
24538       if (out_mode == DFmode && out_n == 2
24539 	  && in_mode == DFmode && in_n == 2)
24540 	return ix86_builtins[IX86_BUILTIN_SQRTPD];
24541       break;
24542 
24543     case BUILT_IN_SQRTF:
24544       if (out_mode == SFmode && out_n == 4
24545 	  && in_mode == SFmode && in_n == 4)
24546 	return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
24547       break;
24548 
24549     case BUILT_IN_LRINT:
24550       if (out_mode == SImode && out_n == 4
24551 	  && in_mode == DFmode && in_n == 2)
24552 	return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
24553       break;
24554 
24555     case BUILT_IN_LRINTF:
24556       if (out_mode == SImode && out_n == 4
24557 	  && in_mode == SFmode && in_n == 4)
24558 	return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
24559       break;
24560 
24561     case BUILT_IN_COPYSIGN:
24562       if (out_mode == DFmode && out_n == 2
24563 	  && in_mode == DFmode && in_n == 2)
24564 	return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
24565       break;
24566 
24567     case BUILT_IN_COPYSIGNF:
24568       if (out_mode == SFmode && out_n == 4
24569 	  && in_mode == SFmode && in_n == 4)
24570 	return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
24571       break;
24572 
24573     default:
24574       ;
24575     }
24576 
24577   /* Dispatch to a handler for a vectorization library.  */
24578   if (ix86_veclib_handler)
24579     return (*ix86_veclib_handler) ((enum built_in_function) fn, type_out,
24580 				   type_in);
24581 
24582   return NULL_TREE;
24583 }
24584 
24585 /* Handler for an SVML-style interface to
24586    a library with vectorized intrinsics.  */
24587 
24588 static tree
24589 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
24590 {
24591   char name[20];
24592   tree fntype, new_fndecl, args;
24593   unsigned arity;
24594   const char *bname;
24595   enum machine_mode el_mode, in_mode;
24596   int n, in_n;
24597 
24598   /* The SVML is suitable for unsafe math only.  */
24599   if (!flag_unsafe_math_optimizations)
24600     return NULL_TREE;
24601 
24602   el_mode = TYPE_MODE (TREE_TYPE (type_out));
24603   n = TYPE_VECTOR_SUBPARTS (type_out);
24604   in_mode = TYPE_MODE (TREE_TYPE (type_in));
24605   in_n = TYPE_VECTOR_SUBPARTS (type_in);
24606   if (el_mode != in_mode
24607       || n != in_n)
24608     return NULL_TREE;
24609 
24610   switch (fn)
24611     {
24612     case BUILT_IN_EXP:
24613     case BUILT_IN_LOG:
24614     case BUILT_IN_LOG10:
24615     case BUILT_IN_POW:
24616     case BUILT_IN_TANH:
24617     case BUILT_IN_TAN:
24618     case BUILT_IN_ATAN:
24619     case BUILT_IN_ATAN2:
24620     case BUILT_IN_ATANH:
24621     case BUILT_IN_CBRT:
24622     case BUILT_IN_SINH:
24623     case BUILT_IN_SIN:
24624     case BUILT_IN_ASINH:
24625     case BUILT_IN_ASIN:
24626     case BUILT_IN_COSH:
24627     case BUILT_IN_COS:
24628     case BUILT_IN_ACOSH:
24629     case BUILT_IN_ACOS:
24630       if (el_mode != DFmode || n != 2)
24631 	return NULL_TREE;
24632       break;
24633 
24634     case BUILT_IN_EXPF:
24635     case BUILT_IN_LOGF:
24636     case BUILT_IN_LOG10F:
24637     case BUILT_IN_POWF:
24638     case BUILT_IN_TANHF:
24639     case BUILT_IN_TANF:
24640     case BUILT_IN_ATANF:
24641     case BUILT_IN_ATAN2F:
24642     case BUILT_IN_ATANHF:
24643     case BUILT_IN_CBRTF:
24644     case BUILT_IN_SINHF:
24645     case BUILT_IN_SINF:
24646     case BUILT_IN_ASINHF:
24647     case BUILT_IN_ASINF:
24648     case BUILT_IN_COSHF:
24649     case BUILT_IN_COSF:
24650     case BUILT_IN_ACOSHF:
24651     case BUILT_IN_ACOSF:
24652       if (el_mode != SFmode || n != 4)
24653 	return NULL_TREE;
24654       break;
24655 
24656     default:
24657       return NULL_TREE;
24658     }
24659 
24660   bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
24661 
24662   if (fn == BUILT_IN_LOGF)
24663     strcpy (name, "vmlsLn4");
24664   else if (fn == BUILT_IN_LOG)
24665     strcpy (name, "vmldLn2");
24666   else if (n == 4)
24667     {
24668       sprintf (name, "vmls%s", bname+10);
24669       name[strlen (name)-1] = '4';
24670     }
24671   else
24672     sprintf (name, "vmld%s2", bname+10);
24673 
24674   /* Convert to uppercase. */
24675   name[4] &= ~0x20;
24676 
24677   arity = 0;
24678   for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
24679        args = TREE_CHAIN (args))
24680     arity++;
24681 
24682   if (arity == 1)
24683     fntype = build_function_type_list (type_out, type_in, NULL);
24684   else
24685     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
24686 
24687   /* Build a function declaration for the vectorized function.  */
24688   new_fndecl = build_decl (BUILTINS_LOCATION,
24689 			   FUNCTION_DECL, get_identifier (name), fntype);
24690   TREE_PUBLIC (new_fndecl) = 1;
24691   DECL_EXTERNAL (new_fndecl) = 1;
24692   DECL_IS_NOVOPS (new_fndecl) = 1;
24693   TREE_READONLY (new_fndecl) = 1;
24694 
24695   return new_fndecl;
24696 }
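
/* Example of the name mangling above: for BUILT_IN_SINF with four SFmode
   elements, bname is "__builtin_sinf", so the code first forms "vmlssinf",
   rewrites the trailing character to get "vmlssin4", and upper-cases
   name[4] to yield "vmlsSin4".  The DFmode BUILT_IN_SIN case yields
   "vmldSin2" the same way, while the log functions are special-cased to
   "vmlsLn4" and "vmldLn2".  */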
24697 
24698 /* Handler for an ACML-style interface to
24699    a library with vectorized intrinsics.  */
24700 
24701 static tree
24702 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
24703 {
24704   char name[20] = "__vr.._";
24705   tree fntype, new_fndecl, args;
24706   unsigned arity;
24707   const char *bname;
24708   enum machine_mode el_mode, in_mode;
24709   int n, in_n;
24710 
24711   /* The ACML is 64-bit only and suitable for unsafe math only, as
24712      it does not correctly support parts of IEEE arithmetic with the
24713      required precision, such as denormals.  */
24714   if (!TARGET_64BIT
24715       || !flag_unsafe_math_optimizations)
24716     return NULL_TREE;
24717 
24718   el_mode = TYPE_MODE (TREE_TYPE (type_out));
24719   n = TYPE_VECTOR_SUBPARTS (type_out);
24720   in_mode = TYPE_MODE (TREE_TYPE (type_in));
24721   in_n = TYPE_VECTOR_SUBPARTS (type_in);
24722   if (el_mode != in_mode
24723       || n != in_n)
24724     return NULL_TREE;
24725 
24726   switch (fn)
24727     {
24728     case BUILT_IN_SIN:
24729     case BUILT_IN_COS:
24730     case BUILT_IN_EXP:
24731     case BUILT_IN_LOG:
24732     case BUILT_IN_LOG2:
24733     case BUILT_IN_LOG10:
24734       name[4] = 'd';
24735       name[5] = '2';
24736       if (el_mode != DFmode
24737 	  || n != 2)
24738 	return NULL_TREE;
24739       break;
24740 
24741     case BUILT_IN_SINF:
24742     case BUILT_IN_COSF:
24743     case BUILT_IN_EXPF:
24744     case BUILT_IN_POWF:
24745     case BUILT_IN_LOGF:
24746     case BUILT_IN_LOG2F:
24747     case BUILT_IN_LOG10F:
24748       name[4] = 's';
24749       name[5] = '4';
24750       if (el_mode != SFmode
24751 	  || n != 4)
24752 	return NULL_TREE;
24753       break;
24754 
24755     default:
24756       return NULL_TREE;
24757     }
24758 
24759   bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
24760   sprintf (name + 7, "%s", bname+10);
24761 
24762   arity = 0;
24763   for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
24764        args = TREE_CHAIN (args))
24765     arity++;
24766 
24767   if (arity == 1)
24768     fntype = build_function_type_list (type_out, type_in, NULL);
24769   else
24770     fntype = build_function_type_list (type_out, type_in, type_in, NULL);
24771 
24772   /* Build a function declaration for the vectorized function.  */
24773   new_fndecl = build_decl (BUILTINS_LOCATION,
24774 			   FUNCTION_DECL, get_identifier (name), fntype);
24775   TREE_PUBLIC (new_fndecl) = 1;
24776   DECL_EXTERNAL (new_fndecl) = 1;
24777   DECL_IS_NOVOPS (new_fndecl) = 1;
24778   TREE_READONLY (new_fndecl) = 1;
24779 
24780   return new_fndecl;
24781 }
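
/* Example of the name mangling above: the template "__vr.._" has name[4]
   and name[5] filled in with the element kind and vector width, and the
   builtin's name minus its "__builtin_" prefix is appended at offset 7.
   BUILT_IN_SIN therefore becomes "__vrd2_sin" and BUILT_IN_SINF becomes
   "__vrs4_sinf".  */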
24782 
24783 
24784 /* Returns a decl of a function that implements conversion of an integer vector
24785    into a floating-point vector, or vice-versa. TYPE is the type of the integer
24786    side of the conversion.
24787    Return NULL_TREE if it is not available.  */
24788 
24789 static tree
24790 ix86_vectorize_builtin_conversion (unsigned int code, tree type)
24791 {
24792   if (! (TARGET_SSE2 && TREE_CODE (type) == VECTOR_TYPE))
24793     return NULL_TREE;
24794 
24795   switch (code)
24796     {
24797     case FLOAT_EXPR:
24798       switch (TYPE_MODE (type))
24799 	{
24800 	case V4SImode:
24801 	  return TYPE_UNSIGNED (type)
24802 	    ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS]
24803 	    : ix86_builtins[IX86_BUILTIN_CVTDQ2PS];
24804 	default:
24805 	  return NULL_TREE;
24806 	}
24807 
24808     case FIX_TRUNC_EXPR:
24809       switch (TYPE_MODE (type))
24810 	{
24811 	case V4SImode:
24812 	  return TYPE_UNSIGNED (type)
24813 	    ? NULL_TREE
24814 	    : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ];
24815 	default:
24816 	  return NULL_TREE;
24817 	}
24818     default:
24819       return NULL_TREE;
24820 
24821     }
24822 }
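
/* As an example: when the vectorizer widens a loop such as

     void f (float *d, const int *s, int n)
     { int i; for (i = 0; i < n; i++) d[i] = (float) s[i]; }

   to V4SImode, the FLOAT_EXPR case above hands back the CVTDQ2PS builtin
   (or CVTUDQ2PS for unsigned sources), and FIX_TRUNC_EXPR likewise maps to
   CVTTPS2DQ; unsigned truncation has no single-insn equivalent and returns
   NULL_TREE.  */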
24823 
24824 /* Returns a decl for a target-specific builtin that implements the
24825    reciprocal of the function FN, or NULL_TREE if not available.  */
24826 
24827 static tree
24828 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
24829 			 bool sqrt ATTRIBUTE_UNUSED)
24830 {
24831   if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
24832 	 && flag_finite_math_only && !flag_trapping_math
24833 	 && flag_unsafe_math_optimizations))
24834     return NULL_TREE;
24835 
24836   if (md_fn)
24837     /* Machine dependent builtins.  */
24838     switch (fn)
24839       {
24840 	/* Vectorized version of sqrt to rsqrt conversion.  */
24841       case IX86_BUILTIN_SQRTPS_NR:
24842 	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
24843 
24844       default:
24845 	return NULL_TREE;
24846       }
24847   else
24848     /* Normal builtins.  */
24849     switch (fn)
24850       {
24851 	/* Sqrt to rsqrt conversion.  */
24852       case BUILT_IN_SQRTF:
24853 	return ix86_builtins[IX86_BUILTIN_RSQRTF];
24854 
24855       default:
24856 	return NULL_TREE;
24857       }
24858 }
24859 
24860 /* Helper for avx_vpermilps256_operand et al.  This is also used by
24861    the expansion functions to turn the parallel back into a mask.
24862    The return value is 0 for no match and the imm8+1 for a match.  */
24863 
24864 int
24865 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
24866 {
24867   unsigned i, nelt = GET_MODE_NUNITS (mode);
24868   unsigned mask = 0;
24869   unsigned char ipar[8];
24870 
24871   if (XVECLEN (par, 0) != (int) nelt)
24872     return 0;
24873 
24874   /* Validate that all of the elements are constants, and not totally
24875      out of range.  Copy the data into an integral array to make the
24876      subsequent checks easier.  */
24877   for (i = 0; i < nelt; ++i)
24878     {
24879       rtx er = XVECEXP (par, 0, i);
24880       unsigned HOST_WIDE_INT ei;
24881 
24882       if (!CONST_INT_P (er))
24883 	return 0;
24884       ei = INTVAL (er);
24885       if (ei >= nelt)
24886 	return 0;
24887       ipar[i] = ei;
24888     }
24889 
24890   switch (mode)
24891     {
24892     case V4DFmode:
24893       /* In the 256-bit DFmode case, we can only move elements within
24894          a 128-bit lane.  */
24895       for (i = 0; i < 2; ++i)
24896 	{
24897 	  if (ipar[i] >= 2)
24898 	    return 0;
24899 	  mask |= ipar[i] << i;
24900 	}
24901       for (i = 2; i < 4; ++i)
24902 	{
24903 	  if (ipar[i] < 2)
24904 	    return 0;
24905 	  mask |= (ipar[i] - 2) << i;
24906 	}
24907       break;
24908 
24909     case V8SFmode:
24910       /* In the 256-bit SFmode case, we have full freedom of movement
24911 	 within the low 128-bit lane, but the high 128-bit lane must
24912 	 mirror the exact same pattern.  */
24913       for (i = 0; i < 4; ++i)
24914 	if (ipar[i] + 4 != ipar[i + 4])
24915 	  return 0;
24916       nelt = 4;
24917       /* FALLTHRU */
24918 
24919     case V2DFmode:
24920     case V4SFmode:
24921       /* In the 128-bit case, we have full freedom in the placement of
24922 	 the elements from the source operand.  */
24923       for (i = 0; i < nelt; ++i)
24924 	mask |= ipar[i] << (i * (nelt / 2));
24925       break;
24926 
24927     default:
24928       gcc_unreachable ();
24929     }
24930 
24931   /* Make sure success has a non-zero value by adding one.  */
24932   return mask + 1;
24933 }
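
/* Example of the encoding above: for V4SFmode each selector occupies two
   bits, so the identity parallel [0 1 2 3] produces
   mask = 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0xe4 and the function
   returns 0xe5; callers subtract one to recover the vpermilps imm8.
   A return value of 0 means the parallel is not representable.  */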
24934 
24935 /* Helper for avx_vperm2f128_v4df_operand et al.  This is also used by
24936    the expansion functions to turn the parallel back into a mask.
24937    The return value is 0 for no match and the imm8+1 for a match.  */
24938 
24939 int
24940 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
24941 {
24942   unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
24943   unsigned mask = 0;
24944   unsigned char ipar[8];
24945 
24946   if (XVECLEN (par, 0) != (int) nelt)
24947     return 0;
24948 
24949   /* Validate that all of the elements are constants, and not totally
24950      out of range.  Copy the data into an integral array to make the
24951      subsequent checks easier.  */
24952   for (i = 0; i < nelt; ++i)
24953     {
24954       rtx er = XVECEXP (par, 0, i);
24955       unsigned HOST_WIDE_INT ei;
24956 
24957       if (!CONST_INT_P (er))
24958 	return 0;
24959       ei = INTVAL (er);
24960       if (ei >= 2 * nelt)
24961 	return 0;
24962       ipar[i] = ei;
24963     }
24964 
24965   /* Validate that each half of the permute selects consecutive elements.  */
24966   for (i = 0; i < nelt2 - 1; ++i)
24967     if (ipar[i] + 1 != ipar[i + 1])
24968       return 0;
24969   for (i = nelt2; i < nelt - 1; ++i)
24970     if (ipar[i] + 1 != ipar[i + 1])
24971       return 0;
24972 
24973   /* Reconstruct the mask.  */
24974   for (i = 0; i < 2; ++i)
24975     {
24976       unsigned e = ipar[i * nelt2];
24977       if (e % nelt2)
24978 	return 0;
24979       e /= nelt2;
24980       mask |= e << (i * 4);
24981     }
24982 
24983   /* Make sure success has a non-zero value by adding one.  */
24984   return mask + 1;
24985 }
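
/* Example of the encoding above: for V4DFmode the parallel [0 1 4 5]
   selects the low 128-bit half of each operand.  The half selectors are
   0 / 2 = 0 and 4 / 2 = 2, so mask becomes 0 | (2 << 4) = 0x20 and the
   function returns 0x21, i.e. the vperm2f128 imm8 0x20 plus one.  */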
24986 
24987 
24988 /* Store OPERAND to the memory after reload is completed.  This means
24989    that we can't easily use assign_stack_local.  */
24990 rtx
24991 ix86_force_to_memory (enum machine_mode mode, rtx operand)
24992 {
24993   rtx result;
24994 
24995   gcc_assert (reload_completed);
24996   if (!TARGET_64BIT_MS_ABI && TARGET_RED_ZONE)
24997     {
24998       result = gen_rtx_MEM (mode,
24999 			    gen_rtx_PLUS (Pmode,
25000 					  stack_pointer_rtx,
25001 					  GEN_INT (-RED_ZONE_SIZE)));
25002       emit_move_insn (result, operand);
25003     }
25004   else if ((TARGET_64BIT_MS_ABI || !TARGET_RED_ZONE) && TARGET_64BIT)
25005     {
25006       switch (mode)
25007 	{
25008 	case HImode:
25009 	case SImode:
25010 	  operand = gen_lowpart (DImode, operand);
25011 	  /* FALLTHRU */
25012 	case DImode:
25013 	  emit_insn (
25014 		      gen_rtx_SET (VOIDmode,
25015 				   gen_rtx_MEM (DImode,
25016 						gen_rtx_PRE_DEC (DImode,
25017 							stack_pointer_rtx)),
25018 				   operand));
25019 	  break;
25020 	default:
25021 	  gcc_unreachable ();
25022 	}
25023       result = gen_rtx_MEM (mode, stack_pointer_rtx);
25024     }
25025   else
25026     {
25027       switch (mode)
25028 	{
25029 	case DImode:
25030 	  {
25031 	    rtx operands[2];
25032 	    split_di (&operand, 1, operands, operands + 1);
25033 	    emit_insn (
25034 			gen_rtx_SET (VOIDmode,
25035 				     gen_rtx_MEM (SImode,
25036 						  gen_rtx_PRE_DEC (Pmode,
25037 							stack_pointer_rtx)),
25038 				     operands[1]));
25039 	    emit_insn (
25040 			gen_rtx_SET (VOIDmode,
25041 				     gen_rtx_MEM (SImode,
25042 						  gen_rtx_PRE_DEC (Pmode,
25043 							stack_pointer_rtx)),
25044 				     operands[0]));
25045 	  }
25046 	  break;
25047 	case HImode:
25048 	  /* Store HImodes as SImodes.  */
25049 	  operand = gen_lowpart (SImode, operand);
25050 	  /* FALLTHRU */
25051 	case SImode:
25052 	  emit_insn (
25053 		      gen_rtx_SET (VOIDmode,
25054 				   gen_rtx_MEM (GET_MODE (operand),
25055 						gen_rtx_PRE_DEC (SImode,
25056 							stack_pointer_rtx)),
25057 				   operand));
25058 	  break;
25059 	default:
25060 	  gcc_unreachable ();
25061 	}
25062       result = gen_rtx_MEM (mode, stack_pointer_rtx);
25063     }
25064   return result;
25065 }
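
/* Example of the sequences above: on a 64-bit target without a usable red
   zone, an SImode operand is widened to DImode and stored through
   (pre_dec sp), i.e. effectively pushed, and the returned MEM addresses
   the new top of stack.  ix86_free_from_memory below later releases the
   slot by adding the 4- or 8-byte size back to the stack pointer.  */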
25066 
25067 /* Free operand from the memory.  */
25068 /* Free the operand from memory.  */
25069 ix86_free_from_memory (enum machine_mode mode)
25070 {
25071   if (!TARGET_RED_ZONE || TARGET_64BIT_MS_ABI)
25072     {
25073       int size;
25074 
25075       if (mode == DImode || TARGET_64BIT)
25076 	size = 8;
25077       else
25078 	size = 4;
25079       /* Use LEA to deallocate stack space.  In peephole2 it will be converted
25080          to a pop or add instruction if registers are available.  */
25081       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
25082 			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
25083 					    GEN_INT (size))));
25084     }
25085 }
25086 
25087 /* Implement TARGET_IRA_COVER_CLASSES.  If -mfpmath=sse, we prefer
25088    SSE_REGS to FLOAT_REGS if their costs for a pseudo are the
25089    same.  */
25090 static const enum reg_class *
25091 i386_ira_cover_classes (void)
25092 {
25093   static const enum reg_class sse_fpmath_classes[] = {
25094     GENERAL_REGS, SSE_REGS, MMX_REGS, FLOAT_REGS, LIM_REG_CLASSES
25095   };
25096   static const enum reg_class no_sse_fpmath_classes[] = {
25097     GENERAL_REGS, FLOAT_REGS, MMX_REGS, SSE_REGS, LIM_REG_CLASSES
25098   };
25099 
25100   return TARGET_SSE_MATH ? sse_fpmath_classes : no_sse_fpmath_classes;
25101 }
25102 
25103 /* Put float CONST_DOUBLE in the constant pool instead of fp regs.
25104    QImode must go into class Q_REGS.
25105    Narrow ALL_REGS to GENERAL_REGS.  This supports allowing movsf and
25106    movdf to do mem-to-mem moves through integer regs.  */
25107 enum reg_class
25108 ix86_preferred_reload_class (rtx x, enum reg_class regclass)
25109 {
25110   enum machine_mode mode = GET_MODE (x);
25111 
25112   /* We're only allowed to return a subclass of CLASS.  Many of the
25113      following checks fail for NO_REGS, so eliminate that early.  */
25114   if (regclass == NO_REGS)
25115     return NO_REGS;
25116 
25117   /* All classes can load zeros.  */
25118   if (x == CONST0_RTX (mode))
25119     return regclass;
25120 
25121   /* Force constants into memory if we are loading a (nonzero) constant into
25122      an MMX or SSE register.  This is because there are no MMX/SSE instructions
25123      to load from a constant.  */
25124   if (CONSTANT_P (x)
25125       && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
25126     return NO_REGS;
25127 
25128   /* Prefer SSE regs only, if we can use them for math.  */
25129   if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
25130     return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
25131 
25132   /* Floating-point constants need more complex checks.  */
25133   if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
25134     {
25135       /* General regs can load everything.  */
25136       if (reg_class_subset_p (regclass, GENERAL_REGS))
25137         return regclass;
25138 
25139       /* Floats can load 0 and 1 plus some others.  Note that we eliminated
25140 	 zero above.  We only want to wind up preferring 80387 registers if
25141 	 we plan on doing computation with them.  */
25142       if (TARGET_80387
25143 	  && standard_80387_constant_p (x) > 0)
25144 	{
25145 	  /* Limit class to non-sse.  */
25146 	  if (regclass == FLOAT_SSE_REGS)
25147 	    return FLOAT_REGS;
25148 	  if (regclass == FP_TOP_SSE_REGS)
25149 	    return FP_TOP_REG;
25150 	  if (regclass == FP_SECOND_SSE_REGS)
25151 	    return FP_SECOND_REG;
25152 	  if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
25153 	    return regclass;
25154 	}
25155 
25156       return NO_REGS;
25157     }
25158 
25159   /* Generally when we see PLUS here, it's the function invariant
25160      (plus soft-fp const_int), which can only be computed into general
25161      regs.  */
25162   if (GET_CODE (x) == PLUS)
25163     return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
25164 
25165   /* QImode constants are easy to load, but non-constant QImode data
25166      must go into Q_REGS.  */
25167   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
25168     {
25169       if (reg_class_subset_p (regclass, Q_REGS))
25170 	return regclass;
25171       if (reg_class_subset_p (Q_REGS, regclass))
25172 	return Q_REGS;
25173       return NO_REGS;
25174     }
25175 
25176   return regclass;
25177 }
25178 
25179 /* Discourage putting floating-point values in SSE registers unless
25180    SSE math is being used, and likewise for the 387 registers.  */
25181 enum reg_class
25182 ix86_preferred_output_reload_class (rtx x, enum reg_class regclass)
25183 {
25184   enum machine_mode mode = GET_MODE (x);
25185 
25186   /* Restrict the output reload class to the register bank that we are doing
25187      math on.  If we would like not to return a subset of CLASS, reject this
25188      alternative: if reload cannot do this, it will still use its choice.  */
25189   mode = GET_MODE (x);
25190   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
25191     return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
25192 
25193   if (X87_FLOAT_MODE_P (mode))
25194     {
25195       if (regclass == FP_TOP_SSE_REGS)
25196 	return FP_TOP_REG;
25197       else if (regclass == FP_SECOND_SSE_REGS)
25198 	return FP_SECOND_REG;
25199       else
25200 	return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
25201     }
25202 
25203   return regclass;
25204 }
25205 
25206 static enum reg_class
25207 ix86_secondary_reload (bool in_p, rtx x, enum reg_class rclass,
25208 		       enum machine_mode mode,
25209 		       secondary_reload_info *sri ATTRIBUTE_UNUSED)
25210 {
25211   /* QImode spills from non-QI registers require an
25212      intermediate register on 32-bit targets.  */
25213   if (!TARGET_64BIT
25214       && !in_p && mode == QImode
25215       && (rclass == GENERAL_REGS
25216 	  || rclass == LEGACY_REGS
25217 	  || rclass == INDEX_REGS))
25218     {
25219       int regno;
25220 
25221       if (REG_P (x))
25222 	regno = REGNO (x);
25223       else
25224 	regno = -1;
25225 
25226       if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
25227 	regno = true_regnum (x);
25228 
25229       /* Return Q_REGS if the operand is in memory.  */
25230       if (regno == -1)
25231 	return Q_REGS;
25232     }
25233 
25234   /* This condition handles corner case where an expression involving
25235      pointers gets vectorized.  We're trying to use the address of a
25236      stack slot as a vector initializer.
25237 
25238      (set (reg:V2DI 74 [ vect_cst_.2 ])
25239           (vec_duplicate:V2DI (reg/f:DI 20 frame)))
25240 
25241      Eventually frame gets turned into sp+offset like this:
25242 
25243      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
25244           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
25245 	                               (const_int 392 [0x188]))))
25246 
25247      That later gets turned into:
25248 
25249      (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
25250           (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
25251 	    (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
25252 
25253      We'll have the following reload recorded:
25254 
25255      Reload 0: reload_in (DI) =
25256            (plus:DI (reg/f:DI 7 sp)
25257             (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
25258      reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
25259      SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
25260      reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
25261      reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
25262      reload_reg_rtx: (reg:V2DI 22 xmm1)
25263 
25264      Which isn't going to work since SSE instructions can't handle scalar
25265      additions.  Returning GENERAL_REGS forces the addition into integer
25266      register and reload can handle subsequent reloads without problems.  */
25267 
25268   if (in_p && GET_CODE (x) == PLUS
25269       && SSE_CLASS_P (rclass)
25270       && SCALAR_INT_MODE_P (mode))
25271     return GENERAL_REGS;
25272 
25273   return NO_REGS;
25274 }
25275 
25276 /* If we are copying between general and FP registers, we need a memory
25277    location. The same is true for SSE and MMX registers.
25278 
25279    To optimize register_move_cost performance, allow inline variant.
25280 
25281    The macro can't work reliably when one of the CLASSES is a class containing
25282    registers from multiple units (SSE, MMX, integer).  We avoid this by never
25283    combining those units in a single alternative in the machine description.
25284    Ensure that this constraint holds to avoid unexpected surprises.
25285 
25286    When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
25287    enforce these sanity checks.  */
25288 
25289 static inline int
25290 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
25291 			      enum machine_mode mode, int strict)
25292 {
25293   if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
25294       || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
25295       || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
25296       || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
25297       || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
25298       || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
25299     {
25300       gcc_assert (!strict);
25301       return true;
25302     }
25303 
25304   if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
25305     return true;
25306 
25307   /* ??? This is a lie.  We do have moves between mmx/general, and for
25308      mmx/sse2.  But by saying we need secondary memory we discourage the
25309      register allocator from using the mmx registers unless needed.  */
25310   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
25311     return true;
25312 
25313   if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
25314     {
25315       /* SSE1 doesn't have any direct moves from other classes.  */
25316       if (!TARGET_SSE2)
25317 	return true;
25318 
25319       /* If the target says that inter-unit moves are more expensive
25320 	 than moving through memory, then don't generate them.  */
25321       if (!TARGET_INTER_UNIT_MOVES)
25322 	return true;
25323 
25324       /* Between SSE and general, we have moves no larger than word size.  */
25325       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
25326 	return true;
25327     }
25328 
25329   return false;
25330 }
25331 
25332 int
25333 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
25334 			      enum machine_mode mode, int strict)
25335 {
25336   return inline_secondary_memory_needed (class1, class2, mode, strict);
25337 }
25338 
25339 /* Return true if the registers in CLASS cannot represent the change from
25340    modes FROM to TO.  */
25341 
25342 bool
25343 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
25344 			       enum reg_class regclass)
25345 {
25346   if (from == to)
25347     return false;
25348 
25349   /* x87 registers can't do subreg at all, as all values are reformatted
25350      to extended precision.  */
25351   if (MAYBE_FLOAT_CLASS_P (regclass))
25352     return true;
25353 
25354   if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
25355     {
25356       /* Vector registers do not support QI or HImode loads.  If we don't
25357 	 disallow a change to these modes, reload will assume it's ok to
25358 	 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
25359 	 the vec_dupv4hi pattern.  */
25360       if (GET_MODE_SIZE (from) < 4)
25361 	return true;
25362 
25363       /* Vector registers do not support subreg with nonzero offsets, which
25364 	 are otherwise valid for integer registers.  Since we can't see
25365 	 whether we have a nonzero offset from here, prohibit all
25366          nonparadoxical subregs changing size.  */
25367       if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
25368 	return true;
25369     }
25370 
25371   return false;
25372 }
25373 
25374 /* Return the cost of moving data of mode M between a
25375    register and memory.  A value of 2 is the default; this cost is
25376    relative to those in `REGISTER_MOVE_COST'.
25377 
25378    This function is used extensively by register_move_cost, which is used to
25379    build tables at startup, so an inline variant is provided.
25380    When IN is 2, return the maximum of the load and store costs.
25381 
25382    If moving between registers and memory is more expensive than
25383    between two registers, you should define this macro to express the
25384    relative cost.
25385 
25386    Also model the increased cost of moving QImode registers in
25387    non-Q_REGS classes.
25388  */
25389 static inline int
25390 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
25391 			 int in)
25392 {
25393   int cost;
25394   if (FLOAT_CLASS_P (regclass))
25395     {
25396       int index;
25397       switch (mode)
25398 	{
25399 	  case SFmode:
25400 	    index = 0;
25401 	    break;
25402 	  case DFmode:
25403 	    index = 1;
25404 	    break;
25405 	  case XFmode:
25406 	    index = 2;
25407 	    break;
25408 	  default:
25409 	    return 100;
25410 	}
25411       if (in == 2)
25412         return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
25413       return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
25414     }
25415   if (SSE_CLASS_P (regclass))
25416     {
25417       int index;
25418       switch (GET_MODE_SIZE (mode))
25419 	{
25420 	  case 4:
25421 	    index = 0;
25422 	    break;
25423 	  case 8:
25424 	    index = 1;
25425 	    break;
25426 	  case 16:
25427 	    index = 2;
25428 	    break;
25429 	  default:
25430 	    return 100;
25431 	}
25432       if (in == 2)
25433         return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
25434       return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
25435     }
25436   if (MMX_CLASS_P (regclass))
25437     {
25438       int index;
25439       switch (GET_MODE_SIZE (mode))
25440 	{
25441 	  case 4:
25442 	    index = 0;
25443 	    break;
25444 	  case 8:
25445 	    index = 1;
25446 	    break;
25447 	  default:
25448 	    return 100;
25449 	}
25450       if (in == 2)
25451         return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
25452       return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
25453     }
25454   switch (GET_MODE_SIZE (mode))
25455     {
25456       case 1:
25457 	if (Q_CLASS_P (regclass) || TARGET_64BIT)
25458 	  {
25459 	    if (!in)
25460 	      return ix86_cost->int_store[0];
25461 	    if (TARGET_PARTIAL_REG_DEPENDENCY
25462 	        && optimize_function_for_speed_p (cfun))
25463 	      cost = ix86_cost->movzbl_load;
25464 	    else
25465 	      cost = ix86_cost->int_load[0];
25466 	    if (in == 2)
25467 	      return MAX (cost, ix86_cost->int_store[0]);
25468 	    return cost;
25469 	  }
25470 	else
25471 	  {
25472 	   if (in == 2)
25473 	     return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
25474 	   if (in)
25475 	     return ix86_cost->movzbl_load;
25476 	   else
25477 	     return ix86_cost->int_store[0] + 4;
25478 	  }
25479 	break;
25480       case 2:
25481 	if (in == 2)
25482 	  return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
25483 	return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
25484       default:
25485 	/* Compute number of 32bit moves needed.  TFmode is moved as XFmode.  */
25486 	if (mode == TFmode)
25487 	  mode = XFmode;
25488 	if (in == 2)
25489 	  cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
25490 	else if (in)
25491 	  cost = ix86_cost->int_load[2];
25492 	else
25493 	  cost = ix86_cost->int_store[2];
25494 	return (cost * (((int) GET_MODE_SIZE (mode)
25495 		        + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
25496     }
25497 }
25498 
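/* Out-of-line wrapper around inline_memory_move_cost above; presumably the
   memory move cost macro expands to this entry point.  */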
25499 int
25500 ix86_memory_move_cost (enum machine_mode mode, enum reg_class regclass, int in)
25501 {
25502   return inline_memory_move_cost (mode, regclass, in);
25503 }
25504 
25505 
25506 /* Return the cost of moving data from a register in class CLASS1 to
25507    one in class CLASS2.
25508 
25509    It is not required that the cost always equal 2 when FROM is the same as TO;
25510    on some machines it is expensive to move between registers if they are not
25511    general registers.  */
25512 
25513 int
25514 ix86_register_move_cost (enum machine_mode mode, enum reg_class class1,
25515 			 enum reg_class class2)
25516 {
25517   /* In case we require secondary memory, compute the cost of the store
25518      followed by the load.  To avoid bad register allocation choices, this
25519      needs to be *at least* as high as the symmetric MEMORY_MOVE_COST.  */
25520 
25521   if (inline_secondary_memory_needed (class1, class2, mode, 0))
25522     {
25523       int cost = 1;
25524 
25525       cost += inline_memory_move_cost (mode, class1, 2);
25526       cost += inline_memory_move_cost (mode, class2, 2);
25527 
25528       /* When copying from a general purpose register we may emit multiple
25529          stores followed by a single load, causing a memory size mismatch
25530          stall.  Count this as an arbitrarily high cost of 20.  */
25531       if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode))
25532 	cost += 20;
25533 
25534       /* In the case of FP/MMX moves, the registers actually overlap, and we
25535 	 have to switch modes in order to treat them differently.  */
25536       if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
25537           || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
25538 	cost += 20;
25539 
25540       return cost;
25541     }
25542 
25543   /* Moves between the SSE/MMX and integer units are expensive.  */
25544   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
25545       || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
25546 
25547     /* ??? By keeping the returned value relatively high, we limit the number
25548        of moves between integer and MMX/SSE registers for all targets.
25549        Additionally, a high value prevents problems with x86_modes_tieable_p(),
25550        where integer modes in MMX/SSE registers are not tieable
25551        because of missing QImode and HImode moves to, from or between
25552        MMX/SSE registers.  */
25553     return MAX (8, ix86_cost->mmxsse_to_integer);
25554 
25555   if (MAYBE_FLOAT_CLASS_P (class1))
25556     return ix86_cost->fp_move;
25557   if (MAYBE_SSE_CLASS_P (class1))
25558     return ix86_cost->sse_move;
25559   if (MAYBE_MMX_CLASS_P (class1))
25560     return ix86_cost->mmx_move;
25561   return 2;
25562 }
25563 
25564 /* Return 1 if hard register REGNO can hold a value of machine-mode MODE.  */
25565 
25566 bool
25567 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
25568 {
25569   /* Flags, and only flags, can hold CCmode values.  */
25570   if (CC_REGNO_P (regno))
25571     return GET_MODE_CLASS (mode) == MODE_CC;
25572   if (GET_MODE_CLASS (mode) == MODE_CC
25573       || GET_MODE_CLASS (mode) == MODE_RANDOM
25574       || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
25575     return 0;
25576   if (FP_REGNO_P (regno))
25577     return VALID_FP_MODE_P (mode);
25578   if (SSE_REGNO_P (regno))
25579     {
25580       /* We implement the move patterns for all vector modes into and
25581 	 out of SSE registers, even when no operation instructions
25582 	 are available.  OImode move is available only when AVX is
25583 	 enabled.  */
25584       return ((TARGET_AVX && mode == OImode)
25585 	      || VALID_AVX256_REG_MODE (mode)
25586 	      || VALID_SSE_REG_MODE (mode)
25587 	      || VALID_SSE2_REG_MODE (mode)
25588 	      || VALID_MMX_REG_MODE (mode)
25589 	      || VALID_MMX_REG_MODE_3DNOW (mode));
25590     }
25591   if (MMX_REGNO_P (regno))
25592     {
25593       /* We implement the move patterns for 3DNOW modes even in MMX mode,
25594 	 so if the register is available at all, then we can move data of
25595 	 the given mode into or out of it.  */
25596       return (VALID_MMX_REG_MODE (mode)
25597 	      || VALID_MMX_REG_MODE_3DNOW (mode));
25598     }
25599 
25600   if (mode == QImode)
25601     {
25602       /* Take care with QImode values - they can be in non-QI regs,
25603 	 but then they cause partial register stalls.  */
25604       if (regno <= BX_REG || TARGET_64BIT)
25605 	return 1;
25606       if (!TARGET_PARTIAL_REG_STALL)
25607 	return 1;
25608       return reload_in_progress || reload_completed;
25609     }
25610   /* We handle both integer and floats in the general purpose registers.  */
25611   else if (VALID_INT_MODE_P (mode))
25612     return 1;
25613   else if (VALID_FP_MODE_P (mode))
25614     return 1;
25615   else if (VALID_DFP_MODE_P (mode))
25616     return 1;
25617   /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
25618      on to use that value in smaller contexts, this can easily force a
25619      pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
25620      supporting DImode, allow it.  */
25621   else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
25622     return 1;
25623 
25624   return 0;
25625 }
25626 
25627 /* A subroutine of ix86_modes_tieable_p.  Return true if MODE is a
25628    tieable integer mode.  */
25629 
25630 static bool
25631 ix86_tieable_integer_mode_p (enum machine_mode mode)
25632 {
25633   switch (mode)
25634     {
25635     case HImode:
25636     case SImode:
25637       return true;
25638 
25639     case QImode:
25640       return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
25641 
25642     case DImode:
25643       return TARGET_64BIT;
25644 
25645     default:
25646       return false;
25647     }
25648 }
25649 
25650 /* Return true if MODE1 is accessible in a register that can hold MODE2
25651    without copying.  That is, all register classes that can hold MODE2
25652    can also hold MODE1.  */
25653 
25654 bool
25655 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
25656 {
25657   if (mode1 == mode2)
25658     return true;
25659 
25660   if (ix86_tieable_integer_mode_p (mode1)
25661       && ix86_tieable_integer_mode_p (mode2))
25662     return true;
25663 
25664   /* MODE2 being XFmode implies fp stack or general regs, which means we
25665      can tie any smaller floating point modes to it.  Note that we do not
25666      tie this with TFmode.  */
25667   if (mode2 == XFmode)
25668     return mode1 == SFmode || mode1 == DFmode;
25669 
25670   /* MODE2 being DFmode implies fp stack, general or sse regs, which means
25671      that we can tie it with SFmode.  */
25672   if (mode2 == DFmode)
25673     return mode1 == SFmode;
25674 
25675   /* If MODE2 is only appropriate for an SSE register, then tie with
25676      any other mode acceptable to SSE registers.  */
25677   if (GET_MODE_SIZE (mode2) == 16
25678       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
25679     return (GET_MODE_SIZE (mode1) == 16
25680 	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
25681 
25682   /* If MODE2 is appropriate for an MMX register, then tie
25683      with any other mode acceptable to MMX registers.  */
25684   if (GET_MODE_SIZE (mode2) == 8
25685       && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
25686     return (GET_MODE_SIZE (mode1) == 8
25687 	    && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
25688 
25689   return false;
25690 }
25691 
25692 /* Compute a (partial) cost for rtx X.  Return true if the complete
25693    cost has been computed, and false if subexpressions should be
25694    scanned.  In either case, *TOTAL contains the cost result.  */
25695 
25696 static bool
25697 ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
25698 {
25699   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
25700   enum machine_mode mode = GET_MODE (x);
25701   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
25702 
25703   switch (code)
25704     {
25705     case CONST_INT:
25706     case CONST:
25707     case LABEL_REF:
25708     case SYMBOL_REF:
25709       if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
25710 	*total = 3;
25711       else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
25712 	*total = 2;
25713       else if (flag_pic && SYMBOLIC_CONST (x)
25714 	       && (!TARGET_64BIT
25715 		   || (GET_CODE (x) != LABEL_REF
25716 		       && (GET_CODE (x) != SYMBOL_REF
25717 		           || !SYMBOL_REF_LOCAL_P (x)))))
25718 	*total = 1;
25719       else
25720 	*total = 0;
25721       return true;
25722 
25723     case CONST_DOUBLE:
25724       if (mode == VOIDmode)
25725 	*total = 0;
25726       else
25727 	switch (standard_80387_constant_p (x))
25728 	  {
25729 	  case 1: /* 0.0 */
25730 	    *total = 1;
25731 	    break;
25732 	  default: /* Other constants */
25733 	    *total = 2;
25734 	    break;
25735 	  case 0:
25736 	  case -1:
25737 	    /* Start with (MEM (SYMBOL_REF)), since that's where
25738 	       it'll probably end up.  Add a penalty for size.  */
25739 	    *total = (COSTS_N_INSNS (1)
25740 		      + (flag_pic != 0 && !TARGET_64BIT)
25741 		      + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
25742 	    break;
25743 	  }
25744       return true;
25745 
25746     case ZERO_EXTEND:
25747       /* The zero extension is often completely free on x86_64, so make
25748 	 it as cheap as possible.  */
25749       if (TARGET_64BIT && mode == DImode
25750 	  && GET_MODE (XEXP (x, 0)) == SImode)
25751 	*total = 1;
25752       else if (TARGET_ZERO_EXTEND_WITH_AND)
25753 	*total = cost->add;
25754       else
25755 	*total = cost->movzx;
25756       return false;
25757 
25758     case SIGN_EXTEND:
25759       *total = cost->movsx;
25760       return false;
25761 
25762     case ASHIFT:
25763       if (CONST_INT_P (XEXP (x, 1))
25764 	  && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
25765 	{
25766 	  HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
25767 	  if (value == 1)
25768 	    {
25769 	      *total = cost->add;
25770 	      return false;
25771 	    }
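	  /* A left shift by 2 or 3 can also be done as an lea with scale
	     4 or 8, so prefer the lea cost when it is no more expensive
	     than a constant shift.  */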
25772 	  if ((value == 2 || value == 3)
25773 	      && cost->lea <= cost->shift_const)
25774 	    {
25775 	      *total = cost->lea;
25776 	      return false;
25777 	    }
25778 	}
25779       /* FALLTHRU */
25780 
25781     case ROTATE:
25782     case ASHIFTRT:
25783     case LSHIFTRT:
25784     case ROTATERT:
25785       if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
25786 	{
25787 	  if (CONST_INT_P (XEXP (x, 1)))
25788 	    {
25789 	      if (INTVAL (XEXP (x, 1)) > 32)
25790 		*total = cost->shift_const + COSTS_N_INSNS (2);
25791 	      else
25792 		*total = cost->shift_const * 2;
25793 	    }
25794 	  else
25795 	    {
25796 	      if (GET_CODE (XEXP (x, 1)) == AND)
25797 		*total = cost->shift_var * 2;
25798 	      else
25799 		*total = cost->shift_var * 6 + COSTS_N_INSNS (2);
25800 	    }
25801 	}
25802       else
25803 	{
25804 	  if (CONST_INT_P (XEXP (x, 1)))
25805 	    *total = cost->shift_const;
25806 	  else
25807 	    *total = cost->shift_var;
25808 	}
25809       return false;
25810 
25811     case MULT:
25812       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
25813 	{
25814 	  /* ??? SSE scalar cost should be used here.  */
25815 	  *total = cost->fmul;
25816 	  return false;
25817 	}
25818       else if (X87_FLOAT_MODE_P (mode))
25819 	{
25820 	  *total = cost->fmul;
25821 	  return false;
25822 	}
25823       else if (FLOAT_MODE_P (mode))
25824 	{
25825 	  /* ??? SSE vector cost should be used here.  */
25826 	  *total = cost->fmul;
25827 	  return false;
25828 	}
25829       else
25830 	{
25831 	  rtx op0 = XEXP (x, 0);
25832 	  rtx op1 = XEXP (x, 1);
25833 	  int nbits;
25834 	  if (CONST_INT_P (XEXP (x, 1)))
25835 	    {
25836 	      unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
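	      /* Count the set bits in the constant multiplier;
		 value &= value - 1 clears the lowest set bit, so the
		 loop runs once per set bit.  */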
25837 	      for (nbits = 0; value != 0; value &= value - 1)
25838 	        nbits++;
25839 	    }
25840 	  else
25841 	    /* This is arbitrary.  */
25842 	    nbits = 7;
25843 
25844 	  /* Compute costs correctly for widening multiplication.  */
25845 	  if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
25846 	      && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
25847 	         == GET_MODE_SIZE (mode))
25848 	    {
25849 	      int is_mulwiden = 0;
25850 	      enum machine_mode inner_mode = GET_MODE (op0);
25851 
25852 	      if (GET_CODE (op0) == GET_CODE (op1))
25853 		is_mulwiden = 1, op1 = XEXP (op1, 0);
25854 	      else if (CONST_INT_P (op1))
25855 		{
25856 		  if (GET_CODE (op0) == SIGN_EXTEND)
25857 		    is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
25858 			          == INTVAL (op1);
25859 		  else
25860 		    is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
25861 	        }
25862 
25863 	      if (is_mulwiden)
25864 	        op0 = XEXP (op0, 0), mode = GET_MODE (op0);
25865 	    }
25866 
25867   	  *total = (cost->mult_init[MODE_INDEX (mode)]
25868 		    + nbits * cost->mult_bit
25869 	            + rtx_cost (op0, outer_code, speed) + rtx_cost (op1, outer_code, speed));
25870 
25871           return true;
25872 	}
25873 
25874     case DIV:
25875     case UDIV:
25876     case MOD:
25877     case UMOD:
25878       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
25879 	/* ??? SSE cost should be used here.  */
25880 	*total = cost->fdiv;
25881       else if (X87_FLOAT_MODE_P (mode))
25882 	*total = cost->fdiv;
25883       else if (FLOAT_MODE_P (mode))
25884 	/* ??? SSE vector cost should be used here.  */
25885 	*total = cost->fdiv;
25886       else
25887 	*total = cost->divide[MODE_INDEX (mode)];
25888       return false;
25889 
25890     case PLUS:
25891       if (GET_MODE_CLASS (mode) == MODE_INT
25892 	       && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
25893 	{
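	  /* The cases below match lea-shaped sums: a scaled index plus a
	     base plus a displacement, a scaled index plus another term,
	     or a sum of three terms, each computable by a single lea.  */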
25894 	  if (GET_CODE (XEXP (x, 0)) == PLUS
25895 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
25896 	      && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
25897 	      && CONSTANT_P (XEXP (x, 1)))
25898 	    {
25899 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
25900 	      if (val == 2 || val == 4 || val == 8)
25901 		{
25902 		  *total = cost->lea;
25903 		  *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed);
25904 		  *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
25905 				      outer_code, speed);
25906 		  *total += rtx_cost (XEXP (x, 1), outer_code, speed);
25907 		  return true;
25908 		}
25909 	    }
25910 	  else if (GET_CODE (XEXP (x, 0)) == MULT
25911 		   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
25912 	    {
25913 	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
25914 	      if (val == 2 || val == 4 || val == 8)
25915 		{
25916 		  *total = cost->lea;
25917 		  *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed);
25918 		  *total += rtx_cost (XEXP (x, 1), outer_code, speed);
25919 		  return true;
25920 		}
25921 	    }
25922 	  else if (GET_CODE (XEXP (x, 0)) == PLUS)
25923 	    {
25924 	      *total = cost->lea;
25925 	      *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed);
25926 	      *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed);
25927 	      *total += rtx_cost (XEXP (x, 1), outer_code, speed);
25928 	      return true;
25929 	    }
25930 	}
25931       /* FALLTHRU */
25932 
25933     case MINUS:
25934       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
25935 	{
25936 	  /* ??? SSE cost should be used here.  */
25937 	  *total = cost->fadd;
25938 	  return false;
25939 	}
25940       else if (X87_FLOAT_MODE_P (mode))
25941 	{
25942 	  *total = cost->fadd;
25943 	  return false;
25944 	}
25945       else if (FLOAT_MODE_P (mode))
25946 	{
25947 	  /* ??? SSE vector cost should be used here.  */
25948 	  *total = cost->fadd;
25949 	  return false;
25950 	}
25951       /* FALLTHRU */
25952 
25953     case AND:
25954     case IOR:
25955     case XOR:
25956       if (!TARGET_64BIT && mode == DImode)
25957 	{
25958 	  *total = (cost->add * 2
25959 		    + (rtx_cost (XEXP (x, 0), outer_code, speed)
25960 		       << (GET_MODE (XEXP (x, 0)) != DImode))
25961 		    + (rtx_cost (XEXP (x, 1), outer_code, speed)
25962 	               << (GET_MODE (XEXP (x, 1)) != DImode)));
25963 	  return true;
25964 	}
25965       /* FALLTHRU */
25966 
25967     case NEG:
25968       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
25969 	{
25970 	  /* ??? SSE cost should be used here.  */
25971 	  *total = cost->fchs;
25972 	  return false;
25973 	}
25974       else if (X87_FLOAT_MODE_P (mode))
25975 	{
25976 	  *total = cost->fchs;
25977 	  return false;
25978 	}
25979       else if (FLOAT_MODE_P (mode))
25980 	{
25981 	  /* ??? SSE vector cost should be used here.  */
25982 	  *total = cost->fchs;
25983 	  return false;
25984 	}
25985       /* FALLTHRU */
25986 
25987     case NOT:
25988       if (!TARGET_64BIT && mode == DImode)
25989 	*total = cost->add * 2;
25990       else
25991 	*total = cost->add;
25992       return false;
25993 
25994     case COMPARE:
25995       if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
25996 	  && XEXP (XEXP (x, 0), 1) == const1_rtx
25997 	  && CONST_INT_P (XEXP (XEXP (x, 0), 2))
25998 	  && XEXP (x, 1) == const0_rtx)
25999 	{
26000 	  /* This kind of construct is implemented using test[bwl].
26001 	     Treat it as if we had an AND.  */
26002 	  *total = (cost->add
26003 		    + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed)
26004 		    + rtx_cost (const1_rtx, outer_code, speed));
26005 	  return true;
26006 	}
26007       return false;
26008 
26009     case FLOAT_EXTEND:
26010       if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
26011 	*total = 0;
26012       return false;
26013 
26014     case ABS:
26015       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
26016 	/* ??? SSE cost should be used here.  */
26017 	*total = cost->fabs;
26018       else if (X87_FLOAT_MODE_P (mode))
26019 	*total = cost->fabs;
26020       else if (FLOAT_MODE_P (mode))
26021 	/* ??? SSE vector cost should be used here.  */
26022 	*total = cost->fabs;
26023       return false;
26024 
26025     case SQRT:
26026       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
26027 	/* ??? SSE cost should be used here.  */
26028 	*total = cost->fsqrt;
26029       else if (X87_FLOAT_MODE_P (mode))
26030 	*total = cost->fsqrt;
26031       else if (FLOAT_MODE_P (mode))
26032 	/* ??? SSE vector cost should be used here.  */
26033 	*total = cost->fsqrt;
26034       return false;
26035 
26036     case UNSPEC:
26037       if (XINT (x, 1) == UNSPEC_TP)
26038 	*total = 0;
26039       return false;
26040 
26041     case VEC_SELECT:
26042     case VEC_CONCAT:
26043     case VEC_MERGE:
26044     case VEC_DUPLICATE:
26045       /* ??? Assume all of these vector manipulation patterns are
26046 	 recognizable, in which case they all have pretty much the
26047 	 same cost.  */
26048      *total = COSTS_N_INSNS (1);
26049      return true;
26050 
26051     default:
26052       return false;
26053     }
26054 }
26055 
26056 #if TARGET_MACHO
26057 
26058 static int current_machopic_label_num;
26059 
26060 /* Given a symbol name and its associated stub, write out the
26061    definition of the stub.  */
26062 
26063 void
26064 machopic_output_stub (FILE *file, const char *symb, const char *stub)
26065 {
26066   unsigned int length;
26067   char *binder_name, *symbol_name, lazy_ptr_name[32];
26068   int label = ++current_machopic_label_num;
26069 
26070   /* For 64-bit we shouldn't get here.  */
26071   gcc_assert (!TARGET_64BIT);
26072 
26073   /* Lose our funky encoding stuff so it doesn't contaminate the stub.  */
26074   symb = (*targetm.strip_name_encoding) (symb);
26075 
26076   length = strlen (stub);
26077   binder_name = XALLOCAVEC (char, length + 32);
26078   GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
26079 
26080   length = strlen (symb);
26081   symbol_name = XALLOCAVEC (char, length + 32);
26082   GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
26083 
26084   sprintf (lazy_ptr_name, "L%d$lz", label);
26085 
26086   if (MACHOPIC_PURE)
26087     switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
26088   else
26089     switch_to_section (darwin_sections[machopic_symbol_stub_section]);
26090 
26091   fprintf (file, "%s:\n", stub);
26092   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
26093 
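  /* For the PIC ("pure") case the stub materializes its own address in
     %eax with a call/pop pair and loads the lazy pointer PC-relatively;
     otherwise it jumps through the lazy pointer directly.  */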
26094   if (MACHOPIC_PURE)
26095     {
26096       fprintf (file, "\tcall\tLPC$%d\nLPC$%d:\tpopl\t%%eax\n", label, label);
26097       fprintf (file, "\tmovl\t%s-LPC$%d(%%eax),%%edx\n", lazy_ptr_name, label);
26098       fprintf (file, "\tjmp\t*%%edx\n");
26099     }
26100   else
26101     fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
26102 
26103   fprintf (file, "%s:\n", binder_name);
26104 
26105   if (MACHOPIC_PURE)
26106     {
26107       fprintf (file, "\tlea\t%s-LPC$%d(%%eax),%%eax\n", lazy_ptr_name, label);
26108       fputs ("\tpushl\t%eax\n", file);
26109     }
26110   else
26111     fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
26112 
26113   fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
26114 
26115   switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
26116   fprintf (file, "%s:\n", lazy_ptr_name);
26117   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
26118   fprintf (file, ASM_LONG "%s\n", binder_name);
26119 }
26120 #endif /* TARGET_MACHO */
26121 
26122 /* Order the registers for the register allocator.  */
26123 
26124 void
26125 x86_order_regs_for_local_alloc (void)
26126 {
26127    int pos = 0;
26128    int i;
26129 
26130    /* First allocate the local general purpose registers.  */
26131    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
26132      if (GENERAL_REGNO_P (i) && call_used_regs[i])
26133 	reg_alloc_order [pos++] = i;
26134 
26135    /* Global general purpose registers.  */
26136    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
26137      if (GENERAL_REGNO_P (i) && !call_used_regs[i])
26138 	reg_alloc_order [pos++] = i;
26139 
26140    /* x87 registers come first in case we are doing FP math
26141       using them.  */
26142    if (!TARGET_SSE_MATH)
26143      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
26144        reg_alloc_order [pos++] = i;
26145 
26146    /* SSE registers.  */
26147    for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
26148      reg_alloc_order [pos++] = i;
26149    for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
26150      reg_alloc_order [pos++] = i;
26151 
26152    /* x87 registers.  */
26153    if (TARGET_SSE_MATH)
26154      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
26155        reg_alloc_order [pos++] = i;
26156 
26157    for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
26158      reg_alloc_order [pos++] = i;
26159 
26160    /* Initialize the rest of the array, as some registers are not
26161       allocated at all.  */
26162    while (pos < FIRST_PSEUDO_REGISTER)
26163      reg_alloc_order [pos++] = 0;
26164 }
26165 
26166 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
26167    struct attribute_spec.handler.  */
26168 static tree
26169 ix86_handle_abi_attribute (tree *node, tree name,
26170 			      tree args ATTRIBUTE_UNUSED,
26171 			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
26172 {
26173   if (TREE_CODE (*node) != FUNCTION_TYPE
26174       && TREE_CODE (*node) != METHOD_TYPE
26175       && TREE_CODE (*node) != FIELD_DECL
26176       && TREE_CODE (*node) != TYPE_DECL)
26177     {
26178       warning (OPT_Wattributes, "%qE attribute only applies to functions",
26179 	       name);
26180       *no_add_attrs = true;
26181       return NULL_TREE;
26182     }
26183   if (!TARGET_64BIT)
26184     {
26185       warning (OPT_Wattributes, "%qE attribute only available for 64-bit",
26186 	       name);
26187       *no_add_attrs = true;
26188       return NULL_TREE;
26189     }
26190 
26191   /* The ms_abi and sysv_abi attributes are mutually exclusive.  */
26192   if (is_attribute_p ("ms_abi", name))
26193     {
26194       if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
26195         {
26196 	  error ("ms_abi and sysv_abi attributes are not compatible");
26197 	}
26198 
26199       return NULL_TREE;
26200     }
26201   else if (is_attribute_p ("sysv_abi", name))
26202     {
26203       if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
26204         {
26205 	  error ("ms_abi and sysv_abi attributes are not compatible");
26206 	}
26207 
26208       return NULL_TREE;
26209     }
26210 
26211   return NULL_TREE;
26212 }
26213 
26214 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
26215    struct attribute_spec.handler.  */
26216 static tree
26217 ix86_handle_struct_attribute (tree *node, tree name,
26218 			      tree args ATTRIBUTE_UNUSED,
26219 			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
26220 {
26221   tree *type = NULL;
26222   if (DECL_P (*node))
26223     {
26224       if (TREE_CODE (*node) == TYPE_DECL)
26225 	type = &TREE_TYPE (*node);
26226     }
26227   else
26228     type = node;
26229 
26230   if (!(type && (TREE_CODE (*type) == RECORD_TYPE
26231 		 || TREE_CODE (*type) == UNION_TYPE)))
26232     {
26233       warning (OPT_Wattributes, "%qE attribute ignored",
26234 	       name);
26235       *no_add_attrs = true;
26236     }
26237 
26238   else if ((is_attribute_p ("ms_struct", name)
26239 	    && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
26240 	   || ((is_attribute_p ("gcc_struct", name)
26241 		&& lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
26242     {
26243       warning (OPT_Wattributes, "%qE incompatible attribute ignored",
26244                name);
26245       *no_add_attrs = true;
26246     }
26247 
26248   return NULL_TREE;
26249 }
26250 
26251 static tree
26252 ix86_handle_fndecl_attribute (tree *node, tree name,
26253                               tree args ATTRIBUTE_UNUSED,
26254                               int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
26255 {
26256   if (TREE_CODE (*node) != FUNCTION_DECL)
26257     {
26258       warning (OPT_Wattributes, "%qE attribute only applies to functions",
26259                name);
26260       *no_add_attrs = true;
26261       return NULL_TREE;
26262     }
26263 
26264   if (TARGET_64BIT)
26265     {
26266       warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
26267                name);
26268       return NULL_TREE;
26269     }
26270 
26271 #ifndef HAVE_AS_IX86_SWAP
26272   sorry ("ms_hook_prologue attribute needs assembler swap suffix support");
26273 #endif
26274 
26275   return NULL_TREE;
26276 }
26277 
26278 static bool
26279 ix86_ms_bitfield_layout_p (const_tree record_type)
26280 {
26281   return ((TARGET_MS_BITFIELD_LAYOUT
26282 	   && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
26283 	  || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
26284 }
26285 
26286 /* Returns an expression indicating where the this parameter is
26287    located on entry to the FUNCTION.  */
26288 
26289 static rtx
26290 x86_this_parameter (tree function)
26291 {
26292   tree type = TREE_TYPE (function);
26293   bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
26294   int nregs;
26295 
26296   if (TARGET_64BIT)
26297     {
26298       const int *parm_regs;
26299 
26300       if (ix86_function_type_abi (type) == MS_ABI)
26301         parm_regs = x86_64_ms_abi_int_parameter_registers;
26302       else
26303         parm_regs = x86_64_int_parameter_registers;
26304       return gen_rtx_REG (DImode, parm_regs[aggr]);
26305     }
26306 
26307   nregs = ix86_function_regparm (type, function);
26308 
26309   if (nregs > 0 && !stdarg_p (type))
26310     {
26311       int regno;
26312 
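      /* With fastcall, THIS is normally passed in %ecx; when an aggregate
	 is returned in memory, the hidden return-pointer argument takes
	 %ecx and THIS moves to %edx.  */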
26313       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
26314 	regno = aggr ? DX_REG : CX_REG;
26315       else
26316         {
26317 	  regno = AX_REG;
26318 	  if (aggr)
26319 	    {
26320 	      regno = DX_REG;
26321 	      if (nregs == 1)
26322 		return gen_rtx_MEM (SImode,
26323 				    plus_constant (stack_pointer_rtx, 4));
26324 	    }
26325 	}
26326       return gen_rtx_REG (SImode, regno);
26327     }
26328 
26329   return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
26330 }
26331 
26332 /* Determine whether x86_output_mi_thunk can succeed.  */
26333 
26334 static bool
26335 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
26336 			 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
26337 			 HOST_WIDE_INT vcall_offset, const_tree function)
26338 {
26339   /* 64-bit can handle anything.  */
26340   if (TARGET_64BIT)
26341     return true;
26342 
26343   /* For 32-bit, everything's fine if we have one free register.  */
26344   if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
26345     return true;
26346 
26347   /* Need a free register for vcall_offset.  */
26348   if (vcall_offset)
26349     return false;
26350 
26351   /* Need a free register for GOT references.  */
26352   if (flag_pic && !(*targetm.binds_local_p) (function))
26353     return false;
26354 
26355   /* Otherwise ok.  */
26356   return true;
26357 }
26358 
26359 /* Output the assembler code for a thunk function.  THUNK_DECL is the
26360    declaration for the thunk function itself, FUNCTION is the decl for
26361    the target function.  DELTA is an immediate constant offset to be
26362    added to THIS.  If VCALL_OFFSET is nonzero, the word at
26363    *(*this + vcall_offset) should be added to THIS.  */
26364 
26365 static void
26366 x86_output_mi_thunk (FILE *file,
26367 		     tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
26368 		     HOST_WIDE_INT vcall_offset, tree function)
26369 {
26370   rtx xops[3];
26371   rtx this_param = x86_this_parameter (function);
26372   rtx this_reg, tmp;
26373 
26374   /* Make sure unwind info is emitted for the thunk if needed.  */
26375   final_start_function (emit_barrier (), file, 1);
26376 
26377   /* If VCALL_OFFSET, we'll need THIS in a register.  Might as well
26378      pull it in now and let DELTA benefit.  */
26379   if (REG_P (this_param))
26380     this_reg = this_param;
26381   else if (vcall_offset)
26382     {
26383       /* Put the this parameter into %eax.  */
26384       xops[0] = this_param;
26385       xops[1] = this_reg = gen_rtx_REG (Pmode, AX_REG);
26386       output_asm_insn ("mov%z1\t{%0, %1|%1, %0}", xops);
26387     }
26388   else
26389     this_reg = NULL_RTX;
26390 
26391   /* Adjust the this parameter by a fixed constant.  */
26392   if (delta)
26393     {
26394       /* Make things pretty by using `subl $4,%eax' rather than `addl $-4,%eax'.
26395          Exception: -128 encodes smaller than 128, so swap the sign and the op.  */
26396       bool sub = delta < 0 || delta == 128;
26397       xops[0] = GEN_INT (sub ? -delta : delta);
26398       xops[1] = this_reg ? this_reg : this_param;
26399       if (TARGET_64BIT)
26400 	{
26401 	  if (!x86_64_general_operand (xops[0], DImode))
26402 	    {
26403 	      tmp = gen_rtx_REG (DImode, R10_REG);
26404 	      xops[1] = tmp;
26405 	      output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops);
26406 	      xops[0] = tmp;
26407 	      xops[1] = this_param;
26408 	    }
26409 	  if (sub)
26410 	    output_asm_insn ("sub{q}\t{%0, %1|%1, %0}", xops);
26411 	  else
26412 	    output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
26413 	}
26414       else if (sub)
26415 	output_asm_insn ("sub{l}\t{%0, %1|%1, %0}", xops);
26416       else
26417 	output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
26418     }
26419 
26420   /* Adjust the this parameter by a value stored in the vtable.  */
26421   if (vcall_offset)
26422     {
26423       if (TARGET_64BIT)
26424 	tmp = gen_rtx_REG (DImode, R10_REG);
26425       else
26426 	{
26427 	  int tmp_regno = CX_REG;
26428 	  if (lookup_attribute ("fastcall",
26429 				TYPE_ATTRIBUTES (TREE_TYPE (function))))
26430 	    tmp_regno = AX_REG;
26431 	  tmp = gen_rtx_REG (SImode, tmp_regno);
26432 	}
26433 
26434       xops[0] = gen_rtx_MEM (Pmode, this_reg);
26435       xops[1] = tmp;
26436       output_asm_insn ("mov%z1\t{%0, %1|%1, %0}", xops);
26437 
26438       /* Adjust the this parameter.  */
26439       xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset));
26440       if (TARGET_64BIT && !memory_operand (xops[0], Pmode))
26441 	{
26442 	  rtx tmp2 = gen_rtx_REG (DImode, R11_REG);
26443 	  xops[0] = GEN_INT (vcall_offset);
26444 	  xops[1] = tmp2;
26445 	  output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops);
26446 	  xops[0] = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, tmp, tmp2));
26447 	}
26448       xops[1] = this_reg;
26449       output_asm_insn ("add%z1\t{%0, %1|%1, %0}", xops);
26450     }
26451 
26452   /* If necessary, drop THIS back to its stack slot.  */
26453   if (this_reg && this_reg != this_param)
26454     {
26455       xops[0] = this_reg;
26456       xops[1] = this_param;
26457       output_asm_insn ("mov%z1\t{%0, %1|%1, %0}", xops);
26458     }
26459 
26460   xops[0] = XEXP (DECL_RTL (function), 0);
26461   if (TARGET_64BIT)
26462     {
26463       if (!flag_pic || (*targetm.binds_local_p) (function))
26464 	output_asm_insn ("jmp\t%P0", xops);
26465       /* All thunks should be in the same object as their target,
26466 	 and thus binds_local_p should be true.  */
26467       else if (TARGET_64BIT && cfun->machine->call_abi == MS_ABI)
26468 	gcc_unreachable ();
26469       else
26470 	{
26471 	  tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, xops[0]), UNSPEC_GOTPCREL);
26472 	  tmp = gen_rtx_CONST (Pmode, tmp);
26473 	  tmp = gen_rtx_MEM (QImode, tmp);
26474 	  xops[0] = tmp;
26475 	  output_asm_insn ("jmp\t%A0", xops);
26476 	}
26477     }
26478   else
26479     {
26480       if (!flag_pic || (*targetm.binds_local_p) (function))
26481 	output_asm_insn ("jmp\t%P0", xops);
26482       else
26483 #if TARGET_MACHO
26484 	if (TARGET_MACHO)
26485 	  {
26486 	    rtx sym_ref = XEXP (DECL_RTL (function), 0);
26487 	    tmp = (gen_rtx_SYMBOL_REF
26488 		   (Pmode,
26489 		    machopic_indirection_name (sym_ref, /*stub_p=*/true)));
26490 	    tmp = gen_rtx_MEM (QImode, tmp);
26491 	    xops[0] = tmp;
26492 	    output_asm_insn ("jmp\t%0", xops);
26493 	  }
26494 	else
26495 #endif /* TARGET_MACHO */
26496 	{
26497 	  tmp = gen_rtx_REG (SImode, CX_REG);
26498 	  output_set_got (tmp, NULL_RTX);
26499 
26500 	  xops[1] = tmp;
26501 	  output_asm_insn ("mov{l}\t{%0@GOT(%1), %1|%1, %0@GOT[%1]}", xops);
26502 	  output_asm_insn ("jmp\t{*}%1", xops);
26503 	}
26504     }
26505   final_end_function ();
26506 }
26507 
26508 static void
26509 x86_file_start (void)
26510 {
26511   default_file_start ();
26512 #if TARGET_MACHO
26513   darwin_file_start ();
26514 #endif
26515   if (X86_FILE_START_VERSION_DIRECTIVE)
26516     fputs ("\t.version\t\"01.01\"\n", asm_out_file);
26517   if (X86_FILE_START_FLTUSED)
26518     fputs ("\t.global\t__fltused\n", asm_out_file);
26519   if (ix86_asm_dialect == ASM_INTEL)
26520     fputs ("\t.intel_syntax noprefix\n", asm_out_file);
26521 }
26522 
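/* Compute the alignment to use for FIELD given its natural alignment
   COMPUTED.  Presumably the intent is that, on ia32 without -malign-double,
   double and 64-bit integer fields are capped at 32-bit alignment to match
   the traditional ia32 ABI.  */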
26523 int
26524 x86_field_alignment (tree field, int computed)
26525 {
26526   enum machine_mode mode;
26527   tree type = TREE_TYPE (field);
26528 
26529   if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
26530     return computed;
26531   mode = TYPE_MODE (strip_array_types (type));
26532   if (mode == DFmode || mode == DCmode
26533       || GET_MODE_CLASS (mode) == MODE_INT
26534       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
26535     return MIN (32, computed);
26536   return computed;
26537 }
26538 
26539 /* Output assembler code to FILE to increment profiler label # LABELNO
26540    for profiling a function entry.  */
26541 void
26542 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
26543 {
26544   if (TARGET_64BIT)
26545     {
26546 #ifndef NO_PROFILE_COUNTERS
26547       fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
26548 #endif
26549 
26550       if (DEFAULT_ABI == SYSV_ABI && flag_pic)
26551 	fputs ("\tcall\t*" MCOUNT_NAME "@GOTPCREL(%rip)\n", file);
26552       else
26553 	fputs ("\tcall\t" MCOUNT_NAME "\n", file);
26554     }
26555   else if (flag_pic)
26556     {
26557 #ifndef NO_PROFILE_COUNTERS
26558       fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
26559 	       LPREFIX, labelno);
26560 #endif
26561       fputs ("\tcall\t*" MCOUNT_NAME "@GOT(%ebx)\n", file);
26562     }
26563   else
26564     {
26565 #ifndef NO_PROFILE_COUNTERS
26566       fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
26567 	       LPREFIX, labelno);
26568 #endif
26569       fputs ("\tcall\t" MCOUNT_NAME "\n", file);
26570     }
26571 }
26572 
26573 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
26574 /* We don't have exact information about insn sizes, but we can safely
26575    assume that we are informed about all 1 byte insns and memory
26576    address sizes.  This is enough to eliminate unnecessary padding in
26577    99% of cases.  */
26578 
26579 static int
26580 min_insn_size (rtx insn)
26581 {
26582   int l = 0, len;
26583 
26584   if (!INSN_P (insn) || !active_insn_p (insn))
26585     return 0;
26586 
26587   /* Discard alignments we've emitted and jump table data.  */
26588   if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
26589       && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
26590     return 0;
26591   if (JUMP_TABLE_DATA_P (insn))
26592     return 0;
26593 
26594   /* Important case - calls are always 5 bytes.
26595      It is common to have many calls in a row.  */
26596   if (CALL_P (insn)
26597       && symbolic_reference_mentioned_p (PATTERN (insn))
26598       && !SIBLING_CALL_P (insn))
26599     return 5;
26600   len = get_attr_length (insn);
26601   if (len <= 1)
26602     return 1;
26603 
26604   /* For normal instructions we rely on get_attr_length being exact,
26605      with a few exceptions.  */
26606   if (!JUMP_P (insn))
26607     {
26608       enum attr_type type = get_attr_type (insn);
26609 
26610       switch (type)
26611 	{
26612 	case TYPE_MULTI:
26613 	  if (GET_CODE (PATTERN (insn)) == ASM_INPUT
26614 	      || asm_noperands (PATTERN (insn)) >= 0)
26615 	    return 0;
26616 	  break;
26617 	case TYPE_OTHER:
26618 	case TYPE_FCMP:
26619 	  break;
26620 	default:
26621 	  /* Otherwise trust get_attr_length.  */
26622 	  return len;
26623 	}
26624 
26625       l = get_attr_length_address (insn);
26626       if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
26627 	l = 4;
26628     }
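  /* Assume one opcode byte on top of the address bytes; when no address
     length was computed, fall back to a two byte estimate.  */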
26629   if (l)
26630     return 1+l;
26631   else
26632     return 2;
26633 }
26634 
26635 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
26636    16 byte window.  */
26637 
26638 static void
26639 ix86_avoid_jump_mispredicts (void)
26640 {
26641   rtx insn, start = get_insns ();
26642   int nbytes = 0, njumps = 0;
26643   int isjump = 0;
26644 
26645   /* Look for all minimal intervals of instructions containing 4 jumps.
26646      The intervals are bounded by START and INSN.  NBYTES is the total
26647      size of the instructions in the interval, including INSN but not
26648      START.  When NBYTES is smaller than 16 bytes, it is possible that
26649      the ends of START and INSN land in the same 16 byte window.
26650 
26651      The smallest offset in the window at which INSN can start occurs when
26652      START ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
26653      We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN).
26654      */
26655   for (insn = start; insn; insn = NEXT_INSN (insn))
26656     {
26657       int min_size;
26658 
26659       if (LABEL_P (insn))
26660 	{
26661 	  int align = label_to_alignment (insn);
26662 	  int max_skip = label_to_max_skip (insn);
26663 
26664 	  if (max_skip > 15)
26665 	    max_skip = 15;
26666 	  /* If align > 3, only up to 16 - max_skip - 1 bytes can be
26667 	     already in the current 16 byte page, because otherwise
26668 	     ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
26669 	     bytes to reach 16 byte boundary.  */
26670 	  if (align <= 0
26671 	      || (align <= 3 && max_skip != (1 << align) - 1))
26672 	    max_skip = 0;
26673 	  if (dump_file)
26674 	    fprintf (dump_file, "Label %i with max_skip %i\n",
26675 		     INSN_UID (insn), max_skip);
26676 	  if (max_skip)
26677 	    {
26678 	      while (nbytes + max_skip >= 16)
26679 		{
26680 		  start = NEXT_INSN (start);
26681 		  if ((JUMP_P (start)
26682 		       && GET_CODE (PATTERN (start)) != ADDR_VEC
26683 		       && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
26684 		      || CALL_P (start))
26685 		    njumps--, isjump = 1;
26686 		  else
26687 		    isjump = 0;
26688 		  nbytes -= min_insn_size (start);
26689 		}
26690 	    }
26691 	  continue;
26692 	}
26693 
26694       min_size = min_insn_size (insn);
26695       nbytes += min_size;
26696       if (dump_file)
26697 	fprintf (dump_file, "Insn %i estimated to %i bytes\n",
26698 		 INSN_UID (insn), min_size);
26699       if ((JUMP_P (insn)
26700 	   && GET_CODE (PATTERN (insn)) != ADDR_VEC
26701 	   && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
26702 	  || CALL_P (insn))
26703 	njumps++;
26704       else
26705 	continue;
26706 
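      /* More than three jumps in the window: advance START until at most
	 three remain, keeping NBYTES and ISJUMP up to date.  */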
26707       while (njumps > 3)
26708 	{
26709 	  start = NEXT_INSN (start);
26710 	  if ((JUMP_P (start)
26711 	       && GET_CODE (PATTERN (start)) != ADDR_VEC
26712 	       && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
26713 	      || CALL_P (start))
26714 	    njumps--, isjump = 1;
26715 	  else
26716 	    isjump = 0;
26717 	  nbytes -= min_insn_size (start);
26718 	}
26719       gcc_assert (njumps >= 0);
26720       if (dump_file)
26721         fprintf (dump_file, "Interval %i to %i has %i bytes\n",
26722 		 INSN_UID (start), INSN_UID (insn), nbytes);
26723 
26724       if (njumps == 3 && isjump && nbytes < 16)
26725 	{
26726 	  int padsize = 15 - nbytes + min_insn_size (insn);
26727 
26728 	  if (dump_file)
26729 	    fprintf (dump_file, "Padding insn %i by %i bytes!\n",
26730 		     INSN_UID (insn), padsize);
26731           emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
26732 	}
26733     }
26734 }
26735 #endif
26736 
26737 /* The AMD Athlon works faster
26738    when RET is not the destination of a conditional jump and is not directly
26739    preceded by another jump instruction.  We avoid the penalty by inserting a
26740    NOP just before the RET instruction in such cases.  */
26741 static void
26742 ix86_pad_returns (void)
26743 {
26744   edge e;
26745   edge_iterator ei;
26746 
26747   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
26748     {
26749       basic_block bb = e->src;
26750       rtx ret = BB_END (bb);
26751       rtx prev;
26752       bool replace = false;
26753 
26754       if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN
26755 	  || optimize_bb_for_size_p (bb))
26756 	continue;
26757       for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
26758 	if (active_insn_p (prev) || LABEL_P (prev))
26759 	  break;
26760       if (prev && LABEL_P (prev))
26761 	{
26762 	  edge e;
26763 	  edge_iterator ei;
26764 
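	  /* The return follows a label; pad it if any executed predecessor
	     edge reaches that label by a jump rather than by falling
	     through.  */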
26765 	  FOR_EACH_EDGE (e, ei, bb->preds)
26766 	    if (EDGE_FREQUENCY (e) && e->src->index >= 0
26767 		&& !(e->flags & EDGE_FALLTHRU))
26768 	      replace = true;
26769 	}
26770       if (!replace)
26771 	{
26772 	  prev = prev_active_insn (ret);
26773 	  if (prev
26774 	      && ((JUMP_P (prev) && any_condjump_p (prev))
26775 		  || CALL_P (prev)))
26776 	    replace = true;
26777 	  /* Empty functions get a branch mispredict even when the jump
26778 	     destination is not visible to us.  */
26779 	  if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED)
26780 	    replace = true;
26781 	}
26782       if (replace)
26783 	{
26784 	  emit_jump_insn_before (gen_return_internal_long (), ret);
26785 	  delete_insn (ret);
26786 	}
26787     }
26788 }
26789 
26790 /* Implement machine specific optimizations.  We implement padding of returns
26791    for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window.  */
26792 static void
26793 ix86_reorg (void)
26794 {
26795   if (optimize && optimize_function_for_speed_p (cfun))
26796     {
26797       if (TARGET_PAD_RETURNS)
26798 	ix86_pad_returns ();
26799 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
26800       if (TARGET_FOUR_JUMP_LIMIT)
26801 	ix86_avoid_jump_mispredicts ();
26802 #endif
26803     }
26804 }
26805 
26806 /* Return nonzero when a QImode register that must be represented via a REX
26807    prefix is used.  */
26808 bool
26809 x86_extended_QIreg_mentioned_p (rtx insn)
26810 {
26811   int i;
26812   extract_insn_cached (insn);
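  /* Integer registers above %ebx need a REX prefix when accessed as byte
     registers (%spl, %bpl, %sil, %dil and %r8b-%r15b), hence the
     regno > BX_REG check below.  */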
26813   for (i = 0; i < recog_data.n_operands; i++)
26814     if (REG_P (recog_data.operand[i])
26815 	&& REGNO (recog_data.operand[i]) > BX_REG)
26816        return true;
26817   return false;
26818 }
26819 
26820 /* Return nonzero when P points to a register encoded via a REX prefix.
26821    Called via for_each_rtx.  */
26822 static int
26823 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
26824 {
26825    unsigned int regno;
26826    if (!REG_P (*p))
26827      return 0;
26828    regno = REGNO (*p);
26829    return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
26830 }
26831 
26832 /* Return true when INSN mentions a register that must be encoded using a
26833    REX prefix.  */
26834 bool
26835 x86_extended_reg_mentioned_p (rtx insn)
26836 {
26837   return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
26838 		       extended_reg_mentioned_1, NULL);
26839 }
26840 
26841 /* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
26842    optabs would emit if we didn't have TFmode patterns.  */
26843 
26844 void
26845 x86_emit_floatuns (rtx operands[2])
26846 {
26847   rtx neglab, donelab, i0, i1, f0, in, out;
26848   enum machine_mode mode, inmode;
26849 
26850   inmode = GET_MODE (operands[1]);
26851   gcc_assert (inmode == SImode || inmode == DImode);
26852 
26853   out = operands[0];
26854   in = force_reg (inmode, operands[1]);
26855   mode = GET_MODE (out);
26856   neglab = gen_label_rtx ();
26857   donelab = gen_label_rtx ();
26858   f0 = gen_reg_rtx (mode);
26859 
26860   emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
26861 
26862   expand_float (out, in, 0);
26863 
26864   emit_jump_insn (gen_jump (donelab));
26865   emit_barrier ();
26866 
26867   emit_label (neglab);
26868 
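  /* The input has its top bit set, so a signed conversion would be wrong.
     Convert (IN >> 1) | (IN & 1) instead and double the result; folding the
     low bit into the shifted value keeps the rounding of the final result
     correct.  */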
26869   i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
26870 			    1, OPTAB_DIRECT);
26871   i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
26872 			    1, OPTAB_DIRECT);
26873   i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
26874 
26875   expand_float (f0, i0, 0);
26876 
26877   emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
26878 
26879   emit_label (donelab);
26880 }
26881 
26882 /* AVX does not support 32-byte integer vector operations,
26883    thus the longest vector we are faced with is V16QImode.  */
26884 #define MAX_VECT_LEN	16
26885 
26886 struct expand_vec_perm_d
26887 {
26888   rtx target, op0, op1;
26889   unsigned char perm[MAX_VECT_LEN];
26890   enum machine_mode vmode;
26891   unsigned char nelt;
26892   bool testing_p;
26893 };
26894 
26895 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
26896 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
26897 
26898 /* Get a vector mode of the same size as the original but with elements
26899    twice as wide.  This is only guaranteed to apply to integral vectors.  */
26900 
26901 static inline enum machine_mode
26902 get_mode_wider_vector (enum machine_mode o)
26903 {
26904   /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
26905   enum machine_mode n = GET_MODE_WIDER_MODE (o);
26906   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
26907   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
26908   return n;
26909 }
26910 
26911 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
26912    with all elements equal to VAL.  Return true if successful.  */
26913 
26914 static bool
26915 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
26916 				   rtx target, rtx val)
26917 {
26918   bool ok;
26919 
26920   switch (mode)
26921     {
26922     case V2SImode:
26923     case V2SFmode:
26924       if (!mmx_ok)
26925 	return false;
26926       /* FALLTHRU */
26927 
26928     case V4DFmode:
26929     case V4DImode:
26930     case V8SFmode:
26931     case V8SImode:
26932     case V2DFmode:
26933     case V2DImode:
26934     case V4SFmode:
26935     case V4SImode:
26936       {
26937 	rtx insn, dup;
26938 
26939 	/* First attempt to recognize VAL as-is.  */
26940 	dup = gen_rtx_VEC_DUPLICATE (mode, val);
26941 	insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
26942 	if (recog_memoized (insn) < 0)
26943 	  {
26944 	    rtx seq;
26945 	    /* If that fails, force VAL into a register.  */
26946 
26947 	    start_sequence ();
26948 	    XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
26949 	    seq = get_insns ();
26950 	    end_sequence ();
26951 	    if (seq)
26952 	      emit_insn_before (seq, insn);
26953 
26954 	    ok = recog_memoized (insn) >= 0;
26955 	    gcc_assert (ok);
26956 	  }
26957       }
26958       return true;
26959 
26960     case V4HImode:
26961       if (!mmx_ok)
26962 	return false;
26963       if (TARGET_SSE || TARGET_3DNOW_A)
26964 	{
26965 	  rtx x;
26966 
26967 	  val = gen_lowpart (SImode, val);
26968 	  x = gen_rtx_TRUNCATE (HImode, val);
26969 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
26970 	  emit_insn (gen_rtx_SET (VOIDmode, target, x));
26971 	  return true;
26972 	}
26973       goto widen;
26974 
26975     case V8QImode:
26976       if (!mmx_ok)
26977 	return false;
26978       goto widen;
26979 
26980     case V8HImode:
26981       if (TARGET_SSE2)
26982 	{
26983 	  struct expand_vec_perm_d dperm;
26984 	  rtx tmp1, tmp2;
26985 
26986 	permute:
26987 	  memset (&dperm, 0, sizeof (dperm));
26988 	  dperm.target = target;
26989 	  dperm.vmode = mode;
26990 	  dperm.nelt = GET_MODE_NUNITS (mode);
26991 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
26992 
26993 	  /* Extend to SImode using a paradoxical SUBREG.  */
26994 	  tmp1 = gen_reg_rtx (SImode);
26995 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
26996 
26997 	  /* Insert the SImode value as low element of a V4SImode vector. */
26998 	  tmp2 = gen_lowpart (V4SImode, dperm.op0);
26999 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
27000 
27001 	  ok = (expand_vec_perm_1 (&dperm)
27002 		|| expand_vec_perm_broadcast_1 (&dperm));
27003 	  gcc_assert (ok);
27004 	  return ok;
27005 	}
27006       goto widen;
27007 
27008     case V16QImode:
27009       if (TARGET_SSE2)
27010 	goto permute;
27011       goto widen;
27012 
27013     widen:
27014       /* Replicate the value once into the next wider mode and recurse.  */
27015       {
27016 	enum machine_mode smode, wsmode, wvmode;
27017 	rtx x;
27018 
27019 	smode = GET_MODE_INNER (mode);
27020 	wvmode = get_mode_wider_vector (mode);
27021 	wsmode = GET_MODE_INNER (wvmode);
27022 
27023 	val = convert_modes (wsmode, smode, val, true);
27024 	x = expand_simple_binop (wsmode, ASHIFT, val,
27025 				 GEN_INT (GET_MODE_BITSIZE (smode)),
27026 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
27027 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
27028 
27029 	x = gen_lowpart (wvmode, target);
27030 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
27031 	gcc_assert (ok);
27032 	return ok;
27033       }
27034 
27035     case V16HImode:
27036     case V32QImode:
27037       {
27038 	enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
27039 	rtx x = gen_reg_rtx (hvmode);
27040 
27041 	ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
27042 	gcc_assert (ok);
27043 
27044 	x = gen_rtx_VEC_CONCAT (mode, x, x);
27045 	emit_insn (gen_rtx_SET (VOIDmode, target, x));
27046       }
27047       return true;
27048 
27049     default:
27050       return false;
27051     }
27052 }
27053 
27054 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
27055    whose ONE_VAR element is VAR, and whose other elements are zero.  Return true
27056    if successful.  */
27057 
27058 static bool
27059 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
27060 				     rtx target, rtx var, int one_var)
27061 {
27062   enum machine_mode vsimode;
27063   rtx new_target;
27064   rtx x, tmp;
27065   bool use_vector_set = false;
27066 
27067   switch (mode)
27068     {
27069     case V2DImode:
27070       /* For SSE4.1, we normally use vector set.  But if the second
27071 	 element is zero and inter-unit moves are OK, we use movq
27072 	 instead.  */
27073       use_vector_set = (TARGET_64BIT
27074 			&& TARGET_SSE4_1
27075 			&& !(TARGET_INTER_UNIT_MOVES
27076 			     && one_var == 0));
27077       break;
27078     case V16QImode:
27079     case V4SImode:
27080     case V4SFmode:
27081       use_vector_set = TARGET_SSE4_1;
27082       break;
27083     case V8HImode:
27084       use_vector_set = TARGET_SSE2;
27085       break;
27086     case V4HImode:
27087       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
27088       break;
27089     case V32QImode:
27090     case V16HImode:
27091     case V8SImode:
27092     case V8SFmode:
27093     case V4DFmode:
27094       use_vector_set = TARGET_AVX;
27095       break;
27096     case V4DImode:
27097       /* Use ix86_expand_vector_set in 64bit mode only.  */
27098       use_vector_set = TARGET_AVX && TARGET_64BIT;
27099       break;
27100     default:
27101       break;
27102     }
27103 
27104   if (use_vector_set)
27105     {
27106       emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
27107       var = force_reg (GET_MODE_INNER (mode), var);
27108       ix86_expand_vector_set (mmx_ok, target, var, one_var);
27109       return true;
27110     }
27111 
27112   switch (mode)
27113     {
27114     case V2SFmode:
27115     case V2SImode:
27116       if (!mmx_ok)
27117 	return false;
27118       /* FALLTHRU */
27119 
27120     case V2DFmode:
27121     case V2DImode:
27122       if (one_var != 0)
27123 	return false;
27124       var = force_reg (GET_MODE_INNER (mode), var);
27125       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
27126       emit_insn (gen_rtx_SET (VOIDmode, target, x));
27127       return true;
27128 
27129     case V4SFmode:
27130     case V4SImode:
27131       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
27132 	new_target = gen_reg_rtx (mode);
27133       else
27134 	new_target = target;
27135       var = force_reg (GET_MODE_INNER (mode), var);
27136       x = gen_rtx_VEC_DUPLICATE (mode, var);
27137       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
27138       emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
27139       if (one_var != 0)
27140 	{
27141 	  /* We need to shuffle the value to the correct position, so
27142 	     create a new pseudo to store the intermediate result.  */
27143 
27144 	  /* With SSE2, we can use the integer shuffle insns.  */
27145 	  if (mode != V4SFmode && TARGET_SSE2)
27146 	    {
27147 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
27148 					    const1_rtx,
27149 					    GEN_INT (one_var == 1 ? 0 : 1),
27150 					    GEN_INT (one_var == 2 ? 0 : 1),
27151 					    GEN_INT (one_var == 3 ? 0 : 1)));
27152 	      if (target != new_target)
27153 		emit_move_insn (target, new_target);
27154 	      return true;
27155 	    }
27156 
27157 	  /* Otherwise convert the intermediate result to V4SFmode and
27158 	     use the SSE1 shuffle instructions.  */
27159 	  if (mode != V4SFmode)
27160 	    {
27161 	      tmp = gen_reg_rtx (V4SFmode);
27162 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
27163 	    }
27164 	  else
27165 	    tmp = new_target;
27166 
27167 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
27168 				       const1_rtx,
27169 				       GEN_INT (one_var == 1 ? 0 : 1),
27170 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
27171 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
27172 
27173 	  if (mode != V4SFmode)
27174 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
27175 	  else if (tmp != target)
27176 	    emit_move_insn (target, tmp);
27177 	}
27178       else if (target != new_target)
27179 	emit_move_insn (target, new_target);
27180       return true;
27181 
27182     case V8HImode:
27183     case V16QImode:
27184       vsimode = V4SImode;
27185       goto widen;
27186     case V4HImode:
27187     case V8QImode:
27188       if (!mmx_ok)
27189 	return false;
27190       vsimode = V2SImode;
27191       goto widen;
27192     widen:
27193       if (one_var != 0)
27194 	return false;
27195 
27196       /* Zero extend the variable element to SImode and recurse.  */
27197       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
27198 
27199       x = gen_reg_rtx (vsimode);
27200       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
27201 						var, one_var))
27202 	gcc_unreachable ();
27203 
27204       emit_move_insn (target, gen_lowpart (mode, x));
27205       return true;
27206 
27207     default:
27208       return false;
27209     }
27210 }
27211 
27212 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
27213    consisting of the values in VALS.  It is known that all elements
27214    except ONE_VAR are constants.  Return true if successful.  */
27215 
27216 static bool
27217 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
27218 				 rtx target, rtx vals, int one_var)
27219 {
27220   rtx var = XVECEXP (vals, 0, one_var);
27221   enum machine_mode wmode;
27222   rtx const_vec, x;
27223 
27224   const_vec = copy_rtx (vals);
27225   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
27226   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
27227 
27228   switch (mode)
27229     {
27230     case V2DFmode:
27231     case V2DImode:
27232     case V2SFmode:
27233     case V2SImode:
27234       /* For the two element vectors, it's just as easy to use
27235 	 the general case.  */
27236       return false;
27237 
27238     case V4DImode:
27239       /* Use ix86_expand_vector_set in 64bit mode only.  */
27240       if (!TARGET_64BIT)
27241 	return false;
27242     case V4DFmode:
27243     case V8SFmode:
27244     case V8SImode:
27245     case V16HImode:
27246     case V32QImode:
27247     case V4SFmode:
27248     case V4SImode:
27249     case V8HImode:
27250     case V4HImode:
27251       break;
27252 
27253     case V16QImode:
27254       if (TARGET_SSE4_1)
27255 	break;
27256       wmode = V8HImode;
27257       goto widen;
27258     case V8QImode:
27259       wmode = V4HImode;
27260       goto widen;
27261     widen:
27262       /* There's no way to set one QImode entry easily.  Combine
27263 	 the variable value with its adjacent constant value, and
27264 	 promote to an HImode set.  */
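      /* For example, with one_var == 5 the partner is the constant at
	 index 4: the variable byte goes into the high half, giving
	 (var << 8) | (c4 & 0xff), and that HImode value is inserted as
	 element 2 (one_var >> 1) of the HImode view of the vector.  */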
27265       x = XVECEXP (vals, 0, one_var ^ 1);
27266       if (one_var & 1)
27267 	{
27268 	  var = convert_modes (HImode, QImode, var, true);
27269 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
27270 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
27271 	  x = GEN_INT (INTVAL (x) & 0xff);
27272 	}
27273       else
27274 	{
27275 	  var = convert_modes (HImode, QImode, var, true);
27276 	  x = gen_int_mode (INTVAL (x) << 8, HImode);
27277 	}
27278       if (x != const0_rtx)
27279 	var = expand_simple_binop (HImode, IOR, var, x, var,
27280 				   1, OPTAB_LIB_WIDEN);
27281 
27282       x = gen_reg_rtx (wmode);
27283       emit_move_insn (x, gen_lowpart (wmode, const_vec));
27284       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
27285 
27286       emit_move_insn (target, gen_lowpart (mode, x));
27287       return true;
27288 
27289     default:
27290       return false;
27291     }
27292 
27293   emit_move_insn (target, const_vec);
27294   ix86_expand_vector_set (mmx_ok, target, var, one_var);
27295   return true;
27296 }
27297 
27298 /* A subroutine of ix86_expand_vector_init_general.  Use vector
27299    concatenate to handle the most general case: all values variable,
27300    and none identical.  */
27301 
27302 static void
27303 ix86_expand_vector_init_concat (enum machine_mode mode,
27304 				rtx target, rtx *ops, int n)
27305 {
27306   enum machine_mode cmode, hmode = VOIDmode;
27307   rtx first[8], second[4];
27308   rtvec v;
27309   int i, j;
27310 
27311   switch (n)
27312     {
27313     case 2:
27314       switch (mode)
27315 	{
27316 	case V8SImode:
27317 	  cmode = V4SImode;
27318 	  break;
27319 	case V8SFmode:
27320 	  cmode = V4SFmode;
27321 	  break;
27322 	case V4DImode:
27323 	  cmode = V2DImode;
27324 	  break;
27325 	case V4DFmode:
27326 	  cmode = V2DFmode;
27327 	  break;
27328 	case V4SImode:
27329 	  cmode = V2SImode;
27330 	  break;
27331 	case V4SFmode:
27332 	  cmode = V2SFmode;
27333 	  break;
27334 	case V2DImode:
27335 	  cmode = DImode;
27336 	  break;
27337 	case V2SImode:
27338 	  cmode = SImode;
27339 	  break;
27340 	case V2DFmode:
27341 	  cmode = DFmode;
27342 	  break;
27343 	case V2SFmode:
27344 	  cmode = SFmode;
27345 	  break;
27346 	default:
27347 	  gcc_unreachable ();
27348 	}
27349 
27350       if (!register_operand (ops[1], cmode))
27351 	ops[1] = force_reg (cmode, ops[1]);
27352       if (!register_operand (ops[0], cmode))
27353 	ops[0] = force_reg (cmode, ops[0]);
27354       emit_insn (gen_rtx_SET (VOIDmode, target,
27355 			      gen_rtx_VEC_CONCAT (mode, ops[0],
27356 						  ops[1])));
27357       break;
27358 
27359     case 4:
27360       switch (mode)
27361 	{
27362 	case V4DImode:
27363 	  cmode = V2DImode;
27364 	  break;
27365 	case V4DFmode:
27366 	  cmode = V2DFmode;
27367 	  break;
27368 	case V4SImode:
27369 	  cmode = V2SImode;
27370 	  break;
27371 	case V4SFmode:
27372 	  cmode = V2SFmode;
27373 	  break;
27374 	default:
27375 	  gcc_unreachable ();
27376 	}
27377       goto half;
27378 
27379     case 8:
27380       switch (mode)
27381 	{
27382 	case V8SImode:
27383 	  cmode = V2SImode;
27384 	  hmode = V4SImode;
27385 	  break;
27386 	case V8SFmode:
27387 	  cmode = V2SFmode;
27388 	  hmode = V4SFmode;
27389 	  break;
27390 	default:
27391 	  gcc_unreachable ();
27392 	}
27393       goto half;
27394 
27395 half:
27396       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
27397       i = n - 1;
27398       j = (n >> 1) - 1;
27399       for (; i > 0; i -= 2, j--)
27400 	{
27401 	  first[j] = gen_reg_rtx (cmode);
27402 	  v = gen_rtvec (2, ops[i - 1], ops[i]);
27403 	  ix86_expand_vector_init (false, first[j],
27404 				   gen_rtx_PARALLEL (cmode, v));
27405 	}
27406 
27407       n >>= 1;
27408       if (n > 2)
27409 	{
27410 	  gcc_assert (hmode != VOIDmode);
27411 	  for (i = j = 0; i < n; i += 2, j++)
27412 	    {
27413 	      second[j] = gen_reg_rtx (hmode);
27414 	      ix86_expand_vector_init_concat (hmode, second [j],
27415 					      &first [i], 2);
27416 	    }
27417 	  n >>= 1;
27418 	  ix86_expand_vector_init_concat (mode, target, second, n);
27419 	}
27420       else
27421 	ix86_expand_vector_init_concat (mode, target, first, n);
27422       break;
27423 
27424     default:
27425       gcc_unreachable ();
27426     }
27427 }
27428 
27429 /* A subroutine of ix86_expand_vector_init_general.  Use vector
27430    interleave to handle the most general case: all values variable,
27431    and none identical.  */
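/* Sketch for V8HImode: each pair of scalar elements is first packed into
   the low SImode lane of its own V4SImode register, those registers are
   merged pairwise with low SImode interleaves (punpckldq) into V2DImode
   halves, and a final low DImode interleave (punpcklqdq) yields the full
   vector; V16QImode adds one more interleave level.  */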
27432 
27433 static void
27434 ix86_expand_vector_init_interleave (enum machine_mode mode,
27435 				    rtx target, rtx *ops, int n)
27436 {
27437   enum machine_mode first_imode, second_imode, third_imode, inner_mode;
27438   int i, j;
27439   rtx op0, op1;
27440   rtx (*gen_load_even) (rtx, rtx, rtx);
27441   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
27442   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
27443 
27444   switch (mode)
27445     {
27446     case V8HImode:
27447       gen_load_even = gen_vec_setv8hi;
27448       gen_interleave_first_low = gen_vec_interleave_lowv4si;
27449       gen_interleave_second_low = gen_vec_interleave_lowv2di;
27450       inner_mode = HImode;
27451       first_imode = V4SImode;
27452       second_imode = V2DImode;
27453       third_imode = VOIDmode;
27454       break;
27455     case V16QImode:
27456       gen_load_even = gen_vec_setv16qi;
27457       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
27458       gen_interleave_second_low = gen_vec_interleave_lowv4si;
27459       inner_mode = QImode;
27460       first_imode = V8HImode;
27461       second_imode = V4SImode;
27462       third_imode = V2DImode;
27463       break;
27464     default:
27465       gcc_unreachable ();
27466     }
27467 
27468   for (i = 0; i < n; i++)
27469     {
27470       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
27471       op0 = gen_reg_rtx (SImode);
27472       emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
27473 
27474       /* Insert the SImode value as low element of a V4SImode vector.  */
27475       op1 = gen_reg_rtx (V4SImode);
27476       op0 = gen_rtx_VEC_MERGE (V4SImode,
27477 			       gen_rtx_VEC_DUPLICATE (V4SImode,
27478 						      op0),
27479 			       CONST0_RTX (V4SImode),
27480 			       const1_rtx);
27481       emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
27482 
27483       /* Cast the V4SImode vector back to a vector in orignal mode.  */
27484       /* Cast the V4SImode vector back to a vector in the original mode.  */
27485       emit_move_insn (op0, gen_lowpart (mode, op1));
27486 
27487       /* Load even elements into the second position.  */
27488       emit_insn ((*gen_load_even) (op0,
27489 				   force_reg (inner_mode,
27490 					      ops [i + i + 1]),
27491 				   const1_rtx));
27492 
27493       /* Cast vector to FIRST_IMODE vector.  */
27494       ops[i] = gen_reg_rtx (first_imode);
27495       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
27496     }
27497 
27498   /* Interleave low FIRST_IMODE vectors.  */
27499   for (i = j = 0; i < n; i += 2, j++)
27500     {
27501       op0 = gen_reg_rtx (first_imode);
27502       emit_insn ((*gen_interleave_first_low) (op0, ops[i], ops[i + 1]));
27503 
27504       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
27505       ops[j] = gen_reg_rtx (second_imode);
27506       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
27507     }
27508 
27509   /* Interleave low SECOND_IMODE vectors.  */
27510   switch (second_imode)
27511     {
27512     case V4SImode:
27513       for (i = j = 0; i < n / 2; i += 2, j++)
27514 	{
27515 	  op0 = gen_reg_rtx (second_imode);
27516 	  emit_insn ((*gen_interleave_second_low) (op0, ops[i],
27517 						   ops[i + 1]));
27518 
27519 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
27520 	     vector.  */
27521 	  ops[j] = gen_reg_rtx (third_imode);
27522 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
27523 	}
27524       second_imode = V2DImode;
27525       gen_interleave_second_low = gen_vec_interleave_lowv2di;
27526       /* FALLTHRU */
27527 
27528     case V2DImode:
27529       op0 = gen_reg_rtx (second_imode);
27530       emit_insn ((*gen_interleave_second_low) (op0, ops[0],
27531 					       ops[1]));
27532 
27533       /* Cast the SECOND_IMODE vector back to a vector in the original
27534 	 mode.  */
27535       emit_insn (gen_rtx_SET (VOIDmode, target,
27536 			      gen_lowpart (mode, op0)));
27537       break;
27538 
27539     default:
27540       gcc_unreachable ();
27541     }
27542 }
27543 
27544 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
27545    all values variable, and none identical.  */
27546 
27547 static void
27548 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
27549 				 rtx target, rtx vals)
27550 {
27551   rtx ops[32], op0, op1;
27552   enum machine_mode half_mode = VOIDmode;
27553   int n, i;
27554 
27555   switch (mode)
27556     {
27557     case V2SFmode:
27558     case V2SImode:
27559       if (!mmx_ok && !TARGET_SSE)
27560 	break;
27561       /* FALLTHRU */
27562 
27563     case V8SFmode:
27564     case V8SImode:
27565     case V4DFmode:
27566     case V4DImode:
27567     case V4SFmode:
27568     case V4SImode:
27569     case V2DFmode:
27570     case V2DImode:
27571       n = GET_MODE_NUNITS (mode);
27572       for (i = 0; i < n; i++)
27573 	ops[i] = XVECEXP (vals, 0, i);
27574       ix86_expand_vector_init_concat (mode, target, ops, n);
27575       return;
27576 
27577     case V32QImode:
27578       half_mode = V16QImode;
27579       goto half;
27580 
27581     case V16HImode:
27582       half_mode = V8HImode;
27583       goto half;
27584 
27585 half:
27586       n = GET_MODE_NUNITS (mode);
27587       for (i = 0; i < n; i++)
27588 	ops[i] = XVECEXP (vals, 0, i);
27589       op0 = gen_reg_rtx (half_mode);
27590       op1 = gen_reg_rtx (half_mode);
27591       ix86_expand_vector_init_interleave (half_mode, op0, ops,
27592 					  n >> 2);
27593       ix86_expand_vector_init_interleave (half_mode, op1,
27594 					  &ops [n >> 1], n >> 2);
27595       emit_insn (gen_rtx_SET (VOIDmode, target,
27596 			      gen_rtx_VEC_CONCAT (mode, op0, op1)));
27597       return;
27598 
27599     case V16QImode:
27600       if (!TARGET_SSE4_1)
27601 	break;
27602       /* FALLTHRU */
27603 
27604     case V8HImode:
27605       if (!TARGET_SSE2)
27606 	break;
27607 
27608       /* Don't use ix86_expand_vector_init_interleave if we can't
27609 	 move from GPR to SSE register directly.  */
27610       if (!TARGET_INTER_UNIT_MOVES)
27611 	break;
27612 
27613       n = GET_MODE_NUNITS (mode);
27614       for (i = 0; i < n; i++)
27615 	ops[i] = XVECEXP (vals, 0, i);
27616       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
27617       return;
27618 
27619     case V4HImode:
27620     case V8QImode:
27621       break;
27622 
27623     default:
27624       gcc_unreachable ();
27625     }
27626 
27627     {
27628       int i, j, n_elts, n_words, n_elt_per_word;
27629       enum machine_mode inner_mode;
27630       rtx words[4], shift;
27631 
27632       inner_mode = GET_MODE_INNER (mode);
27633       n_elts = GET_MODE_NUNITS (mode);
27634       n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
27635       n_elt_per_word = n_elts / n_words;
27636       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
27637 
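      /* Each word is assembled from its highest-indexed element downwards;
	 e.g. with two HImode elements per SImode word the result is
	 (elt1 << 16) | elt0, matching the little-endian element layout.  */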
27638       for (i = 0; i < n_words; ++i)
27639 	{
27640 	  rtx word = NULL_RTX;
27641 
27642 	  for (j = 0; j < n_elt_per_word; ++j)
27643 	    {
27644 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
27645 	      elt = convert_modes (word_mode, inner_mode, elt, true);
27646 
27647 	      if (j == 0)
27648 		word = elt;
27649 	      else
27650 		{
27651 		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
27652 					      word, 1, OPTAB_LIB_WIDEN);
27653 		  word = expand_simple_binop (word_mode, IOR, word, elt,
27654 					      word, 1, OPTAB_LIB_WIDEN);
27655 		}
27656 	    }
27657 
27658 	  words[i] = word;
27659 	}
27660 
27661       if (n_words == 1)
27662 	emit_move_insn (target, gen_lowpart (mode, words[0]));
27663       else if (n_words == 2)
27664 	{
27665 	  rtx tmp = gen_reg_rtx (mode);
27666 	  emit_clobber (tmp);
27667 	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
27668 	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
27669 	  emit_move_insn (target, tmp);
27670 	}
27671       else if (n_words == 4)
27672 	{
27673 	  rtx tmp = gen_reg_rtx (V4SImode);
27674 	  gcc_assert (word_mode == SImode);
27675 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
27676 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
27677 	  emit_move_insn (target, gen_lowpart (mode, tmp));
27678 	}
27679       else
27680 	gcc_unreachable ();
27681     }
27682 }
27683 
27684 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
27685    instructions unless MMX_OK is true.  */
27686 
27687 void
27688 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
27689 {
27690   enum machine_mode mode = GET_MODE (target);
27691   enum machine_mode inner_mode = GET_MODE_INNER (mode);
27692   int n_elts = GET_MODE_NUNITS (mode);
27693   int n_var = 0, one_var = -1;
27694   bool all_same = true, all_const_zero = true;
27695   int i;
27696   rtx x;
27697 
27698   for (i = 0; i < n_elts; ++i)
27699     {
27700       x = XVECEXP (vals, 0, i);
27701       if (!(CONST_INT_P (x)
27702 	    || GET_CODE (x) == CONST_DOUBLE
27703 	    || GET_CODE (x) == CONST_FIXED))
27704 	n_var++, one_var = i;
27705       else if (x != CONST0_RTX (inner_mode))
27706 	all_const_zero = false;
27707       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
27708 	all_same = false;
27709     }
27710 
27711   /* Constants are best loaded from the constant pool.  */
27712   if (n_var == 0)
27713     {
27714       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
27715       return;
27716     }
27717 
27718   /* If all values are identical, broadcast the value.  */
27719   if (all_same
27720       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
27721 					    XVECEXP (vals, 0, 0)))
27722     return;
27723 
27724   /* Values where only one field is non-constant are best loaded from
27725      the pool and overwritten via move later.  */
27726   if (n_var == 1)
27727     {
27728       if (all_const_zero
27729 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
27730 						  XVECEXP (vals, 0, one_var),
27731 						  one_var))
27732 	return;
27733 
27734       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
27735 	return;
27736     }
27737 
27738   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
27739 }
27740 
27741 void
27742 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
27743 {
27744   enum machine_mode mode = GET_MODE (target);
27745   enum machine_mode inner_mode = GET_MODE_INNER (mode);
27746   enum machine_mode half_mode;
27747   bool use_vec_merge = false;
27748   rtx tmp;
27749   static rtx (*gen_extract[6][2]) (rtx, rtx)
27750     = {
27751 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
27752 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
27753 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
27754 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
27755 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
27756 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
27757       };
27758   static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
27759     = {
27760 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
27761 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
27762 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
27763 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
27764 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
27765 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
27766       };
27767   int i, j, n;
27768 
27769   switch (mode)
27770     {
27771     case V2SFmode:
27772     case V2SImode:
27773       if (mmx_ok)
27774 	{
27775 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
27776 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
27777 	  if (elt == 0)
27778 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
27779 	  else
27780 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
27781 	  emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
27782 	  return;
27783 	}
27784       break;
27785 
27786     case V2DImode:
27787       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
27788       if (use_vec_merge)
27789 	break;
27790 
27791       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
27792       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
27793       if (elt == 0)
27794 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
27795       else
27796 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
27797       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
27798       return;
27799 
27800     case V2DFmode:
27801       {
27802 	rtx op0, op1;
27803 
27804 	/* For the two element vectors, we implement a VEC_CONCAT with
27805 	   the extraction of the other element.  */
27806 
27807 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
27808 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
27809 
27810 	if (elt == 0)
27811 	  op0 = val, op1 = tmp;
27812 	else
27813 	  op0 = tmp, op1 = val;
27814 
27815 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
27816 	emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
27817       }
27818       return;
27819 
27820     case V4SFmode:
27821       use_vec_merge = TARGET_SSE4_1;
27822       if (use_vec_merge)
27823 	break;
27824 
27825       switch (elt)
27826 	{
27827 	case 0:
27828 	  use_vec_merge = true;
27829 	  break;
27830 
27831 	case 1:
27832 	  /* tmp = target = A B C D */
27833 	  tmp = copy_to_reg (target);
27834 	  /* target = A A B B */
27835 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
27836 	  /* target = X A B B */
27837 	  ix86_expand_vector_set (false, target, val, 0);
27838 	  /* target = A X C D  */
27839 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
27840 					  const1_rtx, const0_rtx,
27841 					  GEN_INT (2+4), GEN_INT (3+4)));
27842 	  return;
27843 
27844 	case 2:
27845 	  /* tmp = target = A B C D */
27846 	  tmp = copy_to_reg (target);
27847 	  /* tmp = X B C D */
27848 	  ix86_expand_vector_set (false, tmp, val, 0);
27849 	  /* target = A B X D */
27850 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
27851 					  const0_rtx, const1_rtx,
27852 					  GEN_INT (0+4), GEN_INT (3+4)));
27853 	  return;
27854 
27855 	case 3:
27856 	  /* tmp = target = A B C D */
27857 	  tmp = copy_to_reg (target);
27858 	  /* tmp = X B C D */
27859 	  ix86_expand_vector_set (false, tmp, val, 0);
27860 	  /* target = A B C X */
27861 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
27862 					  const0_rtx, const1_rtx,
27863 					  GEN_INT (2+4), GEN_INT (0+4)));
27864 	  return;
27865 
27866 	default:
27867 	  gcc_unreachable ();
27868 	}
27869       break;
27870 
27871     case V4SImode:
27872       use_vec_merge = TARGET_SSE4_1;
27873       if (use_vec_merge)
27874 	break;
27875 
27876       /* Element 0 handled by vec_merge below.  */
27877       if (elt == 0)
27878 	{
27879 	  use_vec_merge = true;
27880 	  break;
27881 	}
27882 
27883       if (TARGET_SSE2)
27884 	{
27885 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
27886 	     store into element 0, then shuffle them back.  */
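	  /* For instance, for ELT == 2 the selector becomes {2, 1, 0, 3};
	     swapping lanes 0 and ELT is its own inverse, so the same pshufd
	     both brings element ELT to lane 0 and restores the order.  */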
27887 
27888 	  rtx order[4];
27889 
27890 	  order[0] = GEN_INT (elt);
27891 	  order[1] = const1_rtx;
27892 	  order[2] = const2_rtx;
27893 	  order[3] = GEN_INT (3);
27894 	  order[elt] = const0_rtx;
27895 
27896 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
27897 					order[1], order[2], order[3]));
27898 
27899 	  ix86_expand_vector_set (false, target, val, 0);
27900 
27901 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
27902 					order[1], order[2], order[3]));
27903 	}
27904       else
27905 	{
27906 	  /* For SSE1, we have to reuse the V4SF code.  */
27907 	  ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
27908 				  gen_lowpart (SFmode, val), elt);
27909 	}
27910       return;
27911 
27912     case V8HImode:
27913       use_vec_merge = TARGET_SSE2;
27914       break;
27915     case V4HImode:
27916       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
27917       break;
27918 
27919     case V16QImode:
27920       use_vec_merge = TARGET_SSE4_1;
27921       break;
27922 
27923     case V8QImode:
27924       break;
27925 
27926     case V32QImode:
27927       half_mode = V16QImode;
27928       j = 0;
27929       n = 16;
27930       goto half;
27931 
27932     case V16HImode:
27933       half_mode = V8HImode;
27934       j = 1;
27935       n = 8;
27936       goto half;
27937 
27938     case V8SImode:
27939       half_mode = V4SImode;
27940       j = 2;
27941       n = 4;
27942       goto half;
27943 
27944     case V4DImode:
27945       half_mode = V2DImode;
27946       j = 3;
27947       n = 2;
27948       goto half;
27949 
27950     case V8SFmode:
27951       half_mode = V4SFmode;
27952       j = 4;
27953       n = 4;
27954       goto half;
27955 
27956     case V4DFmode:
27957       half_mode = V2DFmode;
27958       j = 5;
27959       n = 2;
27960       goto half;
27961 
27962 half:
27963       /* Compute offset.  */
27964       i = elt / n;
27965       elt %= n;
27966 
27967       gcc_assert (i <= 1);
27968 
27969       /* Extract the half.  */
27970       tmp = gen_reg_rtx (half_mode);
27971       emit_insn ((*gen_extract[j][i]) (tmp, target));
27972 
27973       /* Put val in tmp at elt.  */
27974       ix86_expand_vector_set (false, tmp, val, elt);
27975 
27976       /* Put it back.  */
27977       emit_insn ((*gen_insert[j][i]) (target, target, tmp));
27978       return;
27979 
27980     default:
27981       break;
27982     }
27983 
27984   if (use_vec_merge)
27985     {
27986       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
27987       tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
27988       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
27989     }
27990   else
27991     {
27992       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
27993 
27994       emit_move_insn (mem, target);
27995 
27996       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
27997       emit_move_insn (tmp, val);
27998 
27999       emit_move_insn (target, mem);
28000     }
28001 }
28002 
28003 void
28004 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
28005 {
28006   enum machine_mode mode = GET_MODE (vec);
28007   enum machine_mode inner_mode = GET_MODE_INNER (mode);
28008   bool use_vec_extr = false;
28009   rtx tmp;
28010 
28011   switch (mode)
28012     {
28013     case V2SImode:
28014     case V2SFmode:
28015       if (!mmx_ok)
28016 	break;
28017       /* FALLTHRU */
28018 
28019     case V2DFmode:
28020     case V2DImode:
28021       use_vec_extr = true;
28022       break;
28023 
28024     case V4SFmode:
28025       use_vec_extr = TARGET_SSE4_1;
28026       if (use_vec_extr)
28027 	break;
28028 
28029       switch (elt)
28030 	{
28031 	case 0:
28032 	  tmp = vec;
28033 	  break;
28034 
28035 	case 1:
28036 	case 3:
28037 	  tmp = gen_reg_rtx (mode);
28038 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
28039 				       GEN_INT (elt), GEN_INT (elt),
28040 				       GEN_INT (elt+4), GEN_INT (elt+4)));
28041 	  break;
28042 
28043 	case 2:
28044 	  tmp = gen_reg_rtx (mode);
28045 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
28046 	  break;
28047 
28048 	default:
28049 	  gcc_unreachable ();
28050 	}
28051       vec = tmp;
28052       use_vec_extr = true;
28053       elt = 0;
28054       break;
28055 
28056     case V4SImode:
28057       use_vec_extr = TARGET_SSE4_1;
28058       if (use_vec_extr)
28059 	break;
28060 
28061       if (TARGET_SSE2)
28062 	{
28063 	  switch (elt)
28064 	    {
28065 	    case 0:
28066 	      tmp = vec;
28067 	      break;
28068 
28069 	    case 1:
28070 	    case 3:
28071 	      tmp = gen_reg_rtx (mode);
28072 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
28073 					    GEN_INT (elt), GEN_INT (elt),
28074 					    GEN_INT (elt), GEN_INT (elt)));
28075 	      break;
28076 
28077 	    case 2:
28078 	      tmp = gen_reg_rtx (mode);
28079 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
28080 	      break;
28081 
28082 	    default:
28083 	      gcc_unreachable ();
28084 	    }
28085 	  vec = tmp;
28086 	  use_vec_extr = true;
28087 	  elt = 0;
28088 	}
28089       else
28090 	{
28091 	  /* For SSE1, we have to reuse the V4SF code.  */
28092 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
28093 				      gen_lowpart (V4SFmode, vec), elt);
28094 	  return;
28095 	}
28096       break;
28097 
28098     case V8HImode:
28099       use_vec_extr = TARGET_SSE2;
28100       break;
28101     case V4HImode:
28102       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
28103       break;
28104 
28105     case V16QImode:
28106       use_vec_extr = TARGET_SSE4_1;
28107       break;
28108 
28109     case V8QImode:
28110       /* ??? Could extract the appropriate HImode element and shift.  */
28111     default:
28112       break;
28113     }
28114 
28115   if (use_vec_extr)
28116     {
28117       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
28118       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
28119 
28120       /* Let the rtl optimizers know about the zero extension performed.  */
28121       if (inner_mode == QImode || inner_mode == HImode)
28122 	{
28123 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
28124 	  target = gen_lowpart (SImode, target);
28125 	}
28126 
28127       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
28128     }
28129   else
28130     {
28131       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
28132 
28133       emit_move_insn (mem, vec);
28134 
28135       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
28136       emit_move_insn (target, tmp);
28137     }
28138 }
28139 
28140 /* Expand a vector reduction on V4SFmode for SSE1.  FN is the binary
28141    pattern to reduce; DEST is the destination; IN is the input vector.  */
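/* Roughly: movhlps folds the high pair of IN onto the low pair, FN combines
   the pairs, and a final shufps plus FN combines the two partial results,
   leaving the reduction of all four elements in element 0 of DEST.  */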
28142 
28143 void
28144 ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
28145 {
28146   rtx tmp1, tmp2, tmp3;
28147 
28148   tmp1 = gen_reg_rtx (V4SFmode);
28149   tmp2 = gen_reg_rtx (V4SFmode);
28150   tmp3 = gen_reg_rtx (V4SFmode);
28151 
28152   emit_insn (gen_sse_movhlps (tmp1, in, in));
28153   emit_insn (fn (tmp2, tmp1, in));
28154 
28155   emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
28156 				  const1_rtx, const1_rtx,
28157 				  GEN_INT (1+4), GEN_INT (1+4)));
28158   emit_insn (fn (dest, tmp2, tmp3));
28159 }
28160 
28161 /* Target hook for scalar_mode_supported_p.  */
28162 static bool
28163 ix86_scalar_mode_supported_p (enum machine_mode mode)
28164 {
28165   if (DECIMAL_FLOAT_MODE_P (mode))
28166     return default_decimal_float_supported_p ();
28167   else if (mode == TFmode)
28168     return true;
28169   else
28170     return default_scalar_mode_supported_p (mode);
28171 }
28172 
28173 /* Implements target hook vector_mode_supported_p.  */
28174 static bool
28175 ix86_vector_mode_supported_p (enum machine_mode mode)
28176 {
28177   if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
28178     return true;
28179   if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
28180     return true;
28181   if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
28182     return true;
28183   if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
28184     return true;
28185   if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
28186     return true;
28187   return false;
28188 }
28189 
28190 /* Target hook for c_mode_for_suffix.  */
28191 static enum machine_mode
28192 ix86_c_mode_for_suffix (char suffix)
28193 {
28194   if (suffix == 'q')
28195     return TFmode;
28196   if (suffix == 'w')
28197     return XFmode;
28198 
28199   return VOIDmode;
28200 }
28201 
28202 /* Worker function for TARGET_MD_ASM_CLOBBERS.
28203 
28204    We do this in the new i386 backend to maintain source compatibility
28205    with the old cc0-based compiler.  */
28206 
28207 static tree
28208 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
28209 		      tree inputs ATTRIBUTE_UNUSED,
28210 		      tree clobbers)
28211 {
28212   clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
28213 			clobbers);
28214   clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
28215 			clobbers);
28216   return clobbers;
28217 }
28218 
28219 /* Implements the targetm.encode_section_info target hook.  This
28220    is not used by NetWare.  */
28221 
28222 static void ATTRIBUTE_UNUSED
28223 ix86_encode_section_info (tree decl, rtx rtl, int first)
28224 {
28225   default_encode_section_info (decl, rtl, first);
28226 
28227   if (TREE_CODE (decl) == VAR_DECL
28228       && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
28229       && ix86_in_large_data_p (decl))
28230     SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
28231 }
28232 
28233 /* Worker function for REVERSE_CONDITION.  */
28234 
28235 enum rtx_code
28236 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
28237 {
28238   return (mode != CCFPmode && mode != CCFPUmode
28239 	  ? reverse_condition (code)
28240 	  : reverse_condition_maybe_unordered (code));
28241 }
28242 
28243 /* Output code to perform an x87 FP register move, from OPERANDS[1]
28244    to OPERANDS[0].  */
28245 
28246 const char *
28247 output_387_reg_move (rtx insn, rtx *operands)
28248 {
28249   if (REG_P (operands[0]))
28250     {
28251       if (REG_P (operands[1])
28252 	  && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
28253 	{
28254 	  if (REGNO (operands[0]) == FIRST_STACK_REG)
28255 	    return output_387_ffreep (operands, 0);
28256 	  return "fstp\t%y0";
28257 	}
28258       if (STACK_TOP_P (operands[0]))
28259 	return "fld%Z1\t%y1";
28260       return "fst\t%y0";
28261     }
28262   else if (MEM_P (operands[0]))
28263     {
28264       gcc_assert (REG_P (operands[1]));
28265       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
28266 	return "fstp%Z0\t%y0";
28267       else
28268 	{
28269 	  /* There is no non-popping store to memory for XFmode.
28270 	     So if we need one, follow the store with a load.  */
28271 	  if (GET_MODE (operands[0]) == XFmode)
28272 	    return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
28273 	  else
28274 	    return "fst%Z0\t%y0";
28275 	}
28276     }
28277   else
28278     gcc_unreachable();
28279 }
28280 
28281 /* Output code to perform a conditional jump to LABEL, if C2 flag in
28282    FP status register is set.  */
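/* After fnstsw the C2 bit sits in bit 2 of %ah, hence the 0x04 test on the
   non-sahf path; with sahf the same bit lands in PF, which is what the
   UNORDERED condition below tests.  */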
28283 
28284 void
28285 ix86_emit_fp_unordered_jump (rtx label)
28286 {
28287   rtx reg = gen_reg_rtx (HImode);
28288   rtx temp;
28289 
28290   emit_insn (gen_x86_fnstsw_1 (reg));
28291 
28292   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
28293     {
28294       emit_insn (gen_x86_sahf_1 (reg));
28295 
28296       temp = gen_rtx_REG (CCmode, FLAGS_REG);
28297       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
28298     }
28299   else
28300     {
28301       emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
28302 
28303       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28304       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
28305     }
28306 
28307   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
28308 			      gen_rtx_LABEL_REF (VOIDmode, label),
28309 			      pc_rtx);
28310   temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
28311 
28312   emit_jump_insn (temp);
28313   predict_jump (REG_BR_PROB_BASE * 10 / 100);
28314 }
28315 
28316 /* Output code to perform a log1p XFmode calculation.  */
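/* fyl2xp1 is only defined for |op1| below 1 - sqrt(2)/2, hence the
   0.29289... threshold below; larger magnitudes take the fallback path
   computing fyl2x on 1.0 + op1.  Both paths use ln(2) (fldln2) as the y
   operand, turning the base-2 logarithm into a natural logarithm.  */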
28317 
28318 void ix86_emit_i387_log1p (rtx op0, rtx op1)
28319 {
28320   rtx label1 = gen_label_rtx ();
28321   rtx label2 = gen_label_rtx ();
28322 
28323   rtx tmp = gen_reg_rtx (XFmode);
28324   rtx tmp2 = gen_reg_rtx (XFmode);
28325   rtx test;
28326 
28327   emit_insn (gen_absxf2 (tmp, op1));
28328   test = gen_rtx_GE (VOIDmode, tmp,
28329     CONST_DOUBLE_FROM_REAL_VALUE (
28330        REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
28331        XFmode));
28332   emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
28333 
28334   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
28335   emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
28336   emit_jump (label2);
28337 
28338   emit_label (label1);
28339   emit_move_insn (tmp, CONST1_RTX (XFmode));
28340   emit_insn (gen_addxf3 (tmp, op1, tmp));
28341   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
28342   emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
28343 
28344   emit_label (label2);
28345 }
28346 
28347 /* Output code to perform a Newton-Raphson approximation of a single precision
28348    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
28349 
28350 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
28351 {
28352   rtx x0, x1, e0, e1, two;
28353 
28354   x0 = gen_reg_rtx (mode);
28355   e0 = gen_reg_rtx (mode);
28356   e1 = gen_reg_rtx (mode);
28357   x1 = gen_reg_rtx (mode);
28358 
28359   two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode);
28360 
28361   if (VECTOR_MODE_P (mode))
28362     two = ix86_build_const_vector (SFmode, true, two);
28363 
28364   two = force_reg (mode, two);
28365 
28366   /* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */
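  /* With x0 = rcp(b) = (1/b)*(1 + e) for a small relative error e, we have
     b*x0 = 1 + e and a*x0*(2 - b*x0) = (a/b)*(1 - e*e), i.e. one
     Newton-Raphson step that roughly squares the error of the estimate.  */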
28367 
28368   /* x0 = rcp(b) estimate */
28369   emit_insn (gen_rtx_SET (VOIDmode, x0,
28370 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
28371 					  UNSPEC_RCP)));
28372   /* e0 = x0 * a */
28373   emit_insn (gen_rtx_SET (VOIDmode, e0,
28374 			  gen_rtx_MULT (mode, x0, a)));
28375   /* e1 = x0 * b */
28376   emit_insn (gen_rtx_SET (VOIDmode, e1,
28377 			  gen_rtx_MULT (mode, x0, b)));
28378   /* x1 = 2. - e1 */
28379   emit_insn (gen_rtx_SET (VOIDmode, x1,
28380 			  gen_rtx_MINUS (mode, two, e1)));
28381   /* res = e0 * x1 */
28382   emit_insn (gen_rtx_SET (VOIDmode, res,
28383 			  gen_rtx_MULT (mode, e0, x1)));
28384 }
28385 
28386 /* Output code to perform a Newton-Raphson approximation of a
28387    single precision floating point [reciprocal] square root.  */
28388 
28389 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
28390 			 bool recip)
28391 {
28392   rtx x0, e0, e1, e2, e3, mthree, mhalf;
28393   REAL_VALUE_TYPE r;
28394 
28395   x0 = gen_reg_rtx (mode);
28396   e0 = gen_reg_rtx (mode);
28397   e1 = gen_reg_rtx (mode);
28398   e2 = gen_reg_rtx (mode);
28399   e3 = gen_reg_rtx (mode);
28400 
28401   real_from_integer (&r, VOIDmode, -3, -1, 0);
28402   mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
28403 
28404   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
28405   mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
28406 
28407   if (VECTOR_MODE_P (mode))
28408     {
28409       mthree = ix86_build_const_vector (SFmode, true, mthree);
28410       mhalf = ix86_build_const_vector (SFmode, true, mhalf);
28411     }
28412 
28413   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
28414      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
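  /* This is one Newton-Raphson step for 1/sqrt(a): with estimate x0,
     x1 = x0 * (3 - a*x0*x0) / 2 = -0.5 * x0 * (a*x0*x0 - 3).  The sqrt
     variant multiplies through by a (using e0 = a*x0 instead of x0),
     since a * rsqrt(a) = sqrt(a).  */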
28415 
28416   /* x0 = rsqrt(a) estimate */
28417   emit_insn (gen_rtx_SET (VOIDmode, x0,
28418 			  gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
28419 					  UNSPEC_RSQRT)));
28420 
28421   /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN for sqrt(0.0).  */
28422   if (!recip)
28423     {
28424       rtx zero, mask;
28425 
28426       zero = gen_reg_rtx (mode);
28427       mask = gen_reg_rtx (mode);
28428 
28429       zero = force_reg (mode, CONST0_RTX(mode));
28430       emit_insn (gen_rtx_SET (VOIDmode, mask,
28431 			      gen_rtx_NE (mode, zero, a)));
28432 
28433       emit_insn (gen_rtx_SET (VOIDmode, x0,
28434 			      gen_rtx_AND (mode, x0, mask)));
28435     }
28436 
28437   /* e0 = x0 * a */
28438   emit_insn (gen_rtx_SET (VOIDmode, e0,
28439 			  gen_rtx_MULT (mode, x0, a)));
28440   /* e1 = e0 * x0 */
28441   emit_insn (gen_rtx_SET (VOIDmode, e1,
28442 			  gen_rtx_MULT (mode, e0, x0)));
28443 
28444   /* e2 = e1 - 3. */
28445   mthree = force_reg (mode, mthree);
28446   emit_insn (gen_rtx_SET (VOIDmode, e2,
28447 			  gen_rtx_PLUS (mode, e1, mthree)));
28448 
28449   mhalf = force_reg (mode, mhalf);
28450   if (recip)
28451     /* e3 = -.5 * x0 */
28452     emit_insn (gen_rtx_SET (VOIDmode, e3,
28453 			    gen_rtx_MULT (mode, x0, mhalf)));
28454   else
28455     /* e3 = -.5 * e0 */
28456     emit_insn (gen_rtx_SET (VOIDmode, e3,
28457 			    gen_rtx_MULT (mode, e0, mhalf)));
28458   /* ret = e2 * e3 */
28459   emit_insn (gen_rtx_SET (VOIDmode, res,
28460 			  gen_rtx_MULT (mode, e2, e3)));
28461 }
28462 
28463 /* Solaris implementation of TARGET_ASM_NAMED_SECTION.  */
28464 
28465 static void ATTRIBUTE_UNUSED
28466 i386_solaris_elf_named_section (const char *name, unsigned int flags,
28467 				tree decl)
28468 {
28469   /* With Binutils 2.15, the "@unwind" marker must be specified on
28470      every occurrence of the ".eh_frame" section, not just the first
28471      one.  */
28472   if (TARGET_64BIT
28473       && strcmp (name, ".eh_frame") == 0)
28474     {
28475       fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
28476 	       flags & SECTION_WRITE ? "aw" : "a");
28477       return;
28478     }
28479   default_elf_asm_named_section (name, flags, decl);
28480 }
28481 
28482 /* Return the mangling of TYPE if it is an extended fundamental type.  */
28483 
28484 static const char *
28485 ix86_mangle_type (const_tree type)
28486 {
28487   type = TYPE_MAIN_VARIANT (type);
28488 
28489   if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
28490       && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
28491     return NULL;
28492 
28493   switch (TYPE_MODE (type))
28494     {
28495     case TFmode:
28496       /* __float128 is "g".  */
28497       return "g";
28498     case XFmode:
28499       /* "long double" or __float80 is "e".  */
28500       return "e";
28501     default:
28502       return NULL;
28503     }
28504 }
28505 
28506 /* For 32-bit code we can save PIC register setup by using
28507    __stack_chk_fail_local hidden function instead of calling
28508    __stack_chk_fail directly.  64-bit code doesn't need to setup any PIC
28509    register, so it is better to call __stack_chk_fail directly.  */
28510 
28511 static tree
28512 ix86_stack_protect_fail (void)
28513 {
28514   return TARGET_64BIT
28515 	 ? default_external_stack_protect_fail ()
28516 	 : default_hidden_stack_protect_fail ();
28517 }
28518 
28519 /* Select a format to encode pointers in exception handling data.  CODE
28520    is 0 for data, 1 for code labels, 2 for function pointers.  GLOBAL is
28521    true if the symbol may be affected by dynamic relocations.
28522 
28523    ??? All x86 object file formats are capable of representing this.
28524    After all, the relocation needed is the same as for the call insn.
28525    Whether or not a particular assembler allows us to enter such, I
28526    guess we'll have to see.  */
28527 int
28528 asm_preferred_eh_data_format (int code, int global)
28529 {
28530   if (flag_pic)
28531     {
28532       int type = DW_EH_PE_sdata8;
28533       if (!TARGET_64BIT
28534 	  || ix86_cmodel == CM_SMALL_PIC
28535 	  || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
28536 	type = DW_EH_PE_sdata4;
28537       return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
28538     }
28539   if (ix86_cmodel == CM_SMALL
28540       || (ix86_cmodel == CM_MEDIUM && code))
28541     return DW_EH_PE_udata4;
28542   return DW_EH_PE_absptr;
28543 }
28544 
28545 /* Expand copysign from SIGN to the positive value ABS_VALUE
28546    storing in RESULT.  If MASK is non-null, it is the mask used to mask out
28547    the sign bit.  */
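/* In both cases the net effect is RESULT = ABS_VALUE | (SIGN & sign-bit
   mask); when MASK is supplied it is the complement of the sign-bit mask
   (as produced by ix86_expand_sse_fabs) and is inverted before the AND.  */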
28548 static void
28549 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
28550 {
28551   enum machine_mode mode = GET_MODE (sign);
28552   rtx sgn = gen_reg_rtx (mode);
28553   if (mask == NULL_RTX)
28554     {
28555       mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), false);
28556       if (!VECTOR_MODE_P (mode))
28557 	{
28558 	  /* We need to generate a scalar mode mask in this case.  */
28559 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
28560 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
28561 	  mask = gen_reg_rtx (mode);
28562 	  emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
28563 	}
28564     }
28565   else
28566     mask = gen_rtx_NOT (mode, mask);
28567   emit_insn (gen_rtx_SET (VOIDmode, sgn,
28568 			  gen_rtx_AND (mode, mask, sign)));
28569   emit_insn (gen_rtx_SET (VOIDmode, result,
28570 			  gen_rtx_IOR (mode, abs_value, sgn)));
28571 }
28572 
28573 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
28574    mask for masking out the sign-bit is stored in *SMASK, if that is
28575    non-null.  */
28576 static rtx
28577 ix86_expand_sse_fabs (rtx op0, rtx *smask)
28578 {
28579   enum machine_mode mode = GET_MODE (op0);
28580   rtx xa, mask;
28581 
28582   xa = gen_reg_rtx (mode);
28583   mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), true);
28584   if (!VECTOR_MODE_P (mode))
28585     {
28586       /* We need to generate a scalar mode mask in this case.  */
28587       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
28588       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
28589       mask = gen_reg_rtx (mode);
28590       emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
28591     }
28592   emit_insn (gen_rtx_SET (VOIDmode, xa,
28593 			  gen_rtx_AND (mode, op0, mask)));
28594 
28595   if (smask)
28596     *smask = mask;
28597 
28598   return xa;
28599 }
28600 
28601 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
28602    swapping the operands if SWAP_OPERANDS is true.  The expanded
28603    code is a forward jump to a newly created label in case the
28604    comparison is true.  The generated label rtx is returned.  */
28605 static rtx
28606 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
28607                                   bool swap_operands)
28608 {
28609   rtx label, tmp;
28610 
28611   if (swap_operands)
28612     {
28613       tmp = op0;
28614       op0 = op1;
28615       op1 = tmp;
28616     }
28617 
28618   label = gen_label_rtx ();
28619   tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
28620   emit_insn (gen_rtx_SET (VOIDmode, tmp,
28621 			  gen_rtx_COMPARE (CCFPUmode, op0, op1)));
28622   tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
28623   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28624 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
28625   tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
28626   JUMP_LABEL (tmp) = label;
28627 
28628   return label;
28629 }
28630 
28631 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
28632    using comparison code CODE.  Operands are swapped for the comparison if
28633    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
28634 static rtx
28635 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
28636 			      bool swap_operands)
28637 {
28638   enum machine_mode mode = GET_MODE (op0);
28639   rtx mask = gen_reg_rtx (mode);
28640 
28641   if (swap_operands)
28642     {
28643       rtx tmp = op0;
28644       op0 = op1;
28645       op1 = tmp;
28646     }
28647 
28648   if (mode == DFmode)
28649     emit_insn (gen_sse2_maskcmpdf3 (mask, op0, op1,
28650 				    gen_rtx_fmt_ee (code, mode, op0, op1)));
28651   else
28652     emit_insn (gen_sse_maskcmpsf3 (mask, op0, op1,
28653 				   gen_rtx_fmt_ee (code, mode, op0, op1)));
28654 
28655   return mask;
28656 }
28657 
28658 /* Generate and return a rtx of mode MODE for 2**n where n is the number
28659    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
28660 static rtx
28661 ix86_gen_TWO52 (enum machine_mode mode)
28662 {
28663   REAL_VALUE_TYPE TWO52r;
28664   rtx TWO52;
28665 
28666   real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
28667   TWO52 = const_double_from_real_value (TWO52r, mode);
28668   TWO52 = force_reg (mode, TWO52);
28669 
28670   return TWO52;
28671 }
28672 
28673 /* Expand SSE sequence for computing lround from OP1 storing
28674    into OP0.  */
28675 void
28676 ix86_expand_lround (rtx op0, rtx op1)
28677 {
28678   /* C code for the stuff we're doing below:
28679        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
28680        return (long)tmp;
28681    */
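  /* nextafter (0.5, 0.0) is used instead of 0.5 so that the addition itself
     cannot round a value just below .5 up to the next integer; exact
     halfway cases still round away from zero as lround requires.  */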
28682   enum machine_mode mode = GET_MODE (op1);
28683   const struct real_format *fmt;
28684   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
28685   rtx adj;
28686 
28687   /* load nextafter (0.5, 0.0) */
28688   fmt = REAL_MODE_FORMAT (mode);
28689   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
28690   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
28691 
28692   /* adj = copysign (0.5, op1) */
28693   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
28694   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
28695 
28696   /* adj = op1 + adj */
28697   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
28698 
28699   /* op0 = (imode)adj */
28700   expand_fix (op0, adj, 0);
28701 }
28702 
28703 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
28704    into OP0.  */
28705 void
28706 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
28707 {
28708   /* C code for the stuff we're doing below (for do_floor):
28709 	xi = (long)op1;
28710         xi -= (double)xi > op1 ? 1 : 0;
28711         return xi;
28712    */
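  /* The conversion truncates towards zero, which already equals floor for
     non-negative inputs; the compare-and-decrement fixes negative inputs.
     For ceil the comparison is reversed (via swap_operands) and 1 is added
     instead of subtracted.  */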
28713   enum machine_mode fmode = GET_MODE (op1);
28714   enum machine_mode imode = GET_MODE (op0);
28715   rtx ireg, freg, label, tmp;
28716 
28717   /* reg = (long)op1 */
28718   ireg = gen_reg_rtx (imode);
28719   expand_fix (ireg, op1, 0);
28720 
28721   /* freg = (double)reg */
28722   freg = gen_reg_rtx (fmode);
28723   expand_float (freg, ireg, 0);
28724 
28725   /* ireg = (freg > op1) ? ireg - 1 : ireg */
28726   label = ix86_expand_sse_compare_and_jump (UNLE,
28727 					    freg, op1, !do_floor);
28728   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
28729 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
28730   emit_move_insn (ireg, tmp);
28731 
28732   emit_label (label);
28733   LABEL_NUSES (label) = 1;
28734 
28735   emit_move_insn (op0, ireg);
28736 }
28737 
28738 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
28739    result in OPERAND0.  */
28740 void
28741 ix86_expand_rint (rtx operand0, rtx operand1)
28742 {
28743   /* C code for the stuff we're doing below:
28744 	xa = fabs (operand1);
28745         if (!isless (xa, 2**52))
28746 	  return operand1;
28747         xa = xa + 2**52 - 2**52;
28748         return copysign (xa, operand1);
28749    */
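  /* Adding and subtracting 2**52 (2**23 for SFmode) works because for
     0 <= xa < 2**52 the sum xa + 2**52 has no bits below the units place
     in DFmode, so the addition itself rounds xa to an integer in the
     current rounding mode and the subtraction recovers it exactly.  */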
28750   enum machine_mode mode = GET_MODE (operand0);
28751   rtx res, xa, label, TWO52, mask;
28752 
28753   res = gen_reg_rtx (mode);
28754   emit_move_insn (res, operand1);
28755 
28756   /* xa = abs (operand1) */
28757   xa = ix86_expand_sse_fabs (res, &mask);
28758 
28759   /* if (!isless (xa, TWO52)) goto label; */
28760   TWO52 = ix86_gen_TWO52 (mode);
28761   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
28762 
28763   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
28764   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
28765 
28766   ix86_sse_copysign_to_positive (res, xa, res, mask);
28767 
28768   emit_label (label);
28769   LABEL_NUSES (label) = 1;
28770 
28771   emit_move_insn (operand0, res);
28772 }
28773 
28774 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
28775    into OPERAND0.  */
28776 void
28777 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
28778 {
28779   /* C code for the stuff we expand below.
28780         double xa = fabs (x), x2;
28781         if (!isless (xa, TWO52))
28782           return x;
28783         xa = xa + TWO52 - TWO52;
28784         x2 = copysign (xa, x);
28785      Compensate.  Floor:
28786         if (x2 > x)
28787           x2 -= 1;
28788      Compensate.  Ceil:
28789         if (x2 < x)
28790           x2 -= -1;
28791         return x2;
28792    */
28793   enum machine_mode mode = GET_MODE (operand0);
28794   rtx xa, TWO52, tmp, label, one, res, mask;
28795 
28796   TWO52 = ix86_gen_TWO52 (mode);
28797 
28798   /* Temporary for holding the result, initialized to the input
28799      operand to ease control flow.  */
28800   res = gen_reg_rtx (mode);
28801   emit_move_insn (res, operand1);
28802 
28803   /* xa = abs (operand1) */
28804   xa = ix86_expand_sse_fabs (res, &mask);
28805 
28806   /* if (!isless (xa, TWO52)) goto label; */
28807   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
28808 
28809   /* xa = xa + TWO52 - TWO52; */
28810   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
28811   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
28812 
28813   /* xa = copysign (xa, operand1) */
28814   ix86_sse_copysign_to_positive (xa, xa, res, mask);
28815 
28816   /* generate 1.0 or -1.0 */
28817   one = force_reg (mode,
28818 	           const_double_from_real_value (do_floor
28819 						 ? dconst1 : dconstm1, mode));
28820 
28821   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
28822   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
28823   emit_insn (gen_rtx_SET (VOIDmode, tmp,
28824                           gen_rtx_AND (mode, one, tmp)));
28825   /* We always need to subtract here to preserve signed zero.  */
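  /* For example (illustrative): for operand1 = -0.0 the compare mask
     computed above is all zeros, so tmp is +0.0, and -0.0 - (+0.0)
     = -0.0 keeps the sign, whereas an addition would yield +0.0.
     For ceil, "one" is -1.0, so subtracting it still adds 1.0 when
     the mask is set.  */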
28826   tmp = expand_simple_binop (mode, MINUS,
28827 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
28828   emit_move_insn (res, tmp);
28829 
28830   emit_label (label);
28831   LABEL_NUSES (label) = 1;
28832 
28833   emit_move_insn (operand0, res);
28834 }
28835 
28836 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
28837    into OPERAND0.  */
28838 void
28839 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
28840 {
28841   /* C code for the stuff we expand below.
28842 	double xa = fabs (x), x2;
28843         if (!isless (xa, TWO52))
28844           return x;
28845 	x2 = (double)(long)x;
28846      Compensate.  Floor:
28847 	if (x2 > x)
28848 	  x2 -= 1;
28849      Compensate.  Ceil:
28850 	if (x2 < x)
28851 	  x2 += 1;
28852 	if (HONOR_SIGNED_ZEROS (mode))
28853 	  return copysign (x2, x);
28854 	return x2;
28855    */
28856   enum machine_mode mode = GET_MODE (operand0);
28857   rtx xa, xi, TWO52, tmp, label, one, res, mask;
28858 
28859   TWO52 = ix86_gen_TWO52 (mode);
28860 
28861   /* Temporary for holding the result, initialized to the input
28862      operand to ease control flow.  */
28863   res = gen_reg_rtx (mode);
28864   emit_move_insn (res, operand1);
28865 
28866   /* xa = abs (operand1) */
28867   xa = ix86_expand_sse_fabs (res, &mask);
28868 
28869   /* if (!isless (xa, TWO52)) goto label; */
28870   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
28871 
28872   /* xa = (double)(long)x */
28873   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
28874   expand_fix (xi, res, 0);
28875   expand_float (xa, xi, 0);
28876 
28877   /* generate 1.0 */
28878   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
28879 
28880   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
28881   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
28882   emit_insn (gen_rtx_SET (VOIDmode, tmp,
28883                           gen_rtx_AND (mode, one, tmp)));
28884   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
28885 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
28886   emit_move_insn (res, tmp);
28887 
28888   if (HONOR_SIGNED_ZEROS (mode))
28889     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
28890 
28891   emit_label (label);
28892   LABEL_NUSES (label) = 1;
28893 
28894   emit_move_insn (operand0, res);
28895 }
28896 
28897 /* Expand SSE sequence for computing round from OPERAND1 storing
28898    into OPERAND0.  Sequence that works without relying on DImode truncation
28899    via cvttsd2siq, which is only available on 64-bit targets.  */
28900 void
28901 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
28902 {
28903   /* C code for the stuff we expand below.
28904         double xa = fabs (x), xa2, x2;
28905         if (!isless (xa, TWO52))
28906           return x;
28907      Using the absolute value and copying back sign makes
28908      -0.0 -> -0.0 correct.
28909         xa2 = xa + TWO52 - TWO52;
28910      Compensate.
28911 	dxa = xa2 - xa;
28912         if (dxa <= -0.5)
28913           xa2 += 1;
28914         else if (dxa > 0.5)
28915           xa2 -= 1;
28916         x2 = copysign (xa2, x);
28917         return x2;
28918    */
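  /* Worked example (illustrative, round-to-nearest): for x = 2.5 the
     TWO52 trick gives xa2 = 2.0 (ties to even), so dxa = -0.5; the
     dxa <= -0.5 test below then adds 1.0, giving 3.0 = round (2.5).
     The two compensations turn the rint-style result into the
     round-half-away-from-zero result that round () requires.  */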
28919   enum machine_mode mode = GET_MODE (operand0);
28920   rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
28921 
28922   TWO52 = ix86_gen_TWO52 (mode);
28923 
28924   /* Temporary for holding the result, initialized to the input
28925      operand to ease control flow.  */
28926   res = gen_reg_rtx (mode);
28927   emit_move_insn (res, operand1);
28928 
28929   /* xa = abs (operand1) */
28930   xa = ix86_expand_sse_fabs (res, &mask);
28931 
28932   /* if (!isless (xa, TWO52)) goto label; */
28933   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
28934 
28935   /* xa2 = xa + TWO52 - TWO52; */
28936   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
28937   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
28938 
28939   /* dxa = xa2 - xa; */
28940   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
28941 
28942   /* generate 0.5, 1.0 and -0.5 */
28943   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
28944   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
28945   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
28946 			       0, OPTAB_DIRECT);
28947 
28948   /* Compensate.  */
28949   tmp = gen_reg_rtx (mode);
28950   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
28951   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
28952   emit_insn (gen_rtx_SET (VOIDmode, tmp,
28953                           gen_rtx_AND (mode, one, tmp)));
28954   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
28955   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
28956   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
28957   emit_insn (gen_rtx_SET (VOIDmode, tmp,
28958                           gen_rtx_AND (mode, one, tmp)));
28959   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
28960 
28961   /* res = copysign (xa2, operand1) */
28962   ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
28963 
28964   emit_label (label);
28965   LABEL_NUSES (label) = 1;
28966 
28967   emit_move_insn (operand0, res);
28968 }
28969 
28970 /* Expand SSE sequence for computing trunc from OPERAND1 storing
28971    into OPERAND0.  */
28972 void
28973 ix86_expand_trunc (rtx operand0, rtx operand1)
28974 {
28975   /* C code for SSE variant we expand below.
28976         double xa = fabs (x), x2;
28977         if (!isless (xa, TWO52))
28978           return x;
28979         x2 = (double)(long)x;
28980 	if (HONOR_SIGNED_ZEROS (mode))
28981 	  return copysign (x2, x);
28982 	return x2;
28983    */
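  /* No compensation step is needed here: expand_fix emits a truncating
     conversion (cvttsd2si / cvttss2si), which already rounds toward
     zero, so only the signed-zero copysign below remains.  */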
28984   enum machine_mode mode = GET_MODE (operand0);
28985   rtx xa, xi, TWO52, label, res, mask;
28986 
28987   TWO52 = ix86_gen_TWO52 (mode);
28988 
28989   /* Temporary for holding the result, initialized to the input
28990      operand to ease control flow.  */
28991   res = gen_reg_rtx (mode);
28992   emit_move_insn (res, operand1);
28993 
28994   /* xa = abs (operand1) */
28995   xa = ix86_expand_sse_fabs (res, &mask);
28996 
28997   /* if (!isless (xa, TWO52)) goto label; */
28998   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
28999 
29000   /* x = (double)(long)x */
29001   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
29002   expand_fix (xi, res, 0);
29003   expand_float (res, xi, 0);
29004 
29005   if (HONOR_SIGNED_ZEROS (mode))
29006     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
29007 
29008   emit_label (label);
29009   LABEL_NUSES (label) = 1;
29010 
29011   emit_move_insn (operand0, res);
29012 }
29013 
29014 /* Expand SSE sequence for computing trunc from OPERAND1 storing
29015    into OPERAND0.  */
29016 void
29017 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
29018 {
29019   enum machine_mode mode = GET_MODE (operand0);
29020   rtx xa, mask, TWO52, label, one, res, smask, tmp;
29021 
29022   /* C code for SSE variant we expand below.
29023         double xa = fabs (x), x2;
29024         if (!isless (xa, TWO52))
29025           return x;
29026         xa2 = xa + TWO52 - TWO52;
29027      Compensate:
29028         if (xa2 > xa)
29029           xa2 -= 1.0;
29030         x2 = copysign (xa2, x);
29031         return x2;
29032    */
29033 
29034   TWO52 = ix86_gen_TWO52 (mode);
29035 
29036   /* Temporary for holding the result, initialized to the input
29037      operand to ease control flow.  */
29038   res = gen_reg_rtx (mode);
29039   emit_move_insn (res, operand1);
29040 
29041   /* xa = abs (operand1) */
29042   xa = ix86_expand_sse_fabs (res, &smask);
29043 
29044   /* if (!isless (xa, TWO52)) goto label; */
29045   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
29046 
29047   /* res = xa + TWO52 - TWO52; */
29048   tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
29049   tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
29050   emit_move_insn (res, tmp);
29051 
29052   /* generate 1.0 */
29053   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
29054 
29055   /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
29056   mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
29057   emit_insn (gen_rtx_SET (VOIDmode, mask,
29058                           gen_rtx_AND (mode, mask, one)));
29059   tmp = expand_simple_binop (mode, MINUS,
29060 			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
29061   emit_move_insn (res, tmp);
29062 
29063   /* res = copysign (res, operand1) */
29064   ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
29065 
29066   emit_label (label);
29067   LABEL_NUSES (label) = 1;
29068 
29069   emit_move_insn (operand0, res);
29070 }
29071 
29072 /* Expand SSE sequence for computing round from OPERAND1 storing
29073    into OPERAND0.  */
29074 void
29075 ix86_expand_round (rtx operand0, rtx operand1)
29076 {
29077   /* C code for the stuff we're doing below:
29078         double xa = fabs (x);
29079         if (!isless (xa, TWO52))
29080           return x;
29081         xa = (double)(long)(xa + nextafter (0.5, 0.0));
29082         return copysign (xa, x);
29083    */
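  /* Why nextafter (0.5, 0.0) rather than 0.5 (illustrative): for
     x = 0.49999999999999994 (the largest double below 0.5), x + 0.5
     rounds up to 1.0 and would truncate to 1, although round (x) is 0.
     Adding the predecessor of 0.5 instead gives 1 - 2**-53 exactly,
     which truncates to 0 as required.  */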
29084   enum machine_mode mode = GET_MODE (operand0);
29085   rtx res, TWO52, xa, label, xi, half, mask;
29086   const struct real_format *fmt;
29087   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
29088 
29089   /* Temporary for holding the result, initialized to the input
29090      operand to ease control flow.  */
29091   res = gen_reg_rtx (mode);
29092   emit_move_insn (res, operand1);
29093 
29094   TWO52 = ix86_gen_TWO52 (mode);
29095   xa = ix86_expand_sse_fabs (res, &mask);
29096   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
29097 
29098   /* load nextafter (0.5, 0.0) */
29099   fmt = REAL_MODE_FORMAT (mode);
29100   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
29101   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
29102 
29103   /* xa = xa + 0.5 */
29104   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
29105   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
29106 
29107   /* xa = (double)(int64_t)xa */
29108   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
29109   expand_fix (xi, xa, 0);
29110   expand_float (xa, xi, 0);
29111 
29112   /* res = copysign (xa, operand1) */
29113   ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
29114 
29115   emit_label (label);
29116   LABEL_NUSES (label) = 1;
29117 
29118   emit_move_insn (operand0, res);
29119 }
29120 
29121 
29122 /* Table of valid machine attributes.  */
29123 static const struct attribute_spec ix86_attribute_table[] =
29124 {
29125   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */
29126   /* Stdcall attribute says callee is responsible for popping arguments
29127      if they are not variable.  */
29128   { "stdcall",   0, 0, false, true,  true,  ix86_handle_cconv_attribute },
29129   /* Fastcall attribute says callee is responsible for popping arguments
29130      if they are not variable.  */
29131   { "fastcall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute },
29132   /* Cdecl attribute says the callee is a normal C declaration */
29133   { "cdecl",     0, 0, false, true,  true,  ix86_handle_cconv_attribute },
29134   /* Regparm attribute specifies how many integer arguments are to be
29135      passed in registers.  */
29136   { "regparm",   1, 1, false, true,  true,  ix86_handle_cconv_attribute },
29137   /* Sseregparm attribute says we are using x86_64 calling conventions
29138      for FP arguments.  */
29139   { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute },
29140   /* force_align_arg_pointer says this function realigns the stack at entry.  */
29141   { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
29142     false, true,  true, ix86_handle_cconv_attribute },
29143 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
29144   { "dllimport", 0, 0, false, false, false, handle_dll_attribute },
29145   { "dllexport", 0, 0, false, false, false, handle_dll_attribute },
29146   { "shared",    0, 0, true,  false, false, ix86_handle_shared_attribute },
29147 #endif
29148   { "ms_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute },
29149   { "gcc_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute },
29150 #ifdef SUBTARGET_ATTRIBUTE_TABLE
29151   SUBTARGET_ATTRIBUTE_TABLE,
29152 #endif
29153   /* ms_abi and sysv_abi calling convention function attributes.  */
29154   { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute },
29155   { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute },
29156   { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute },
29157   /* End element.  */
29158   { NULL,        0, 0, false, false, false, NULL }
29159 };
29160 
29161 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
29162 static int
29163 ix86_builtin_vectorization_cost (bool runtime_test)
29164 {
29165   /* If the branch of the runtime test is taken - i.e. - the vectorized
29166      version is skipped - this incurs a misprediction cost (because the
29167      vectorized version is expected to be the fall-through).  So we subtract
29168      the latency of a mispredicted branch from the costs that are incurred
29169      when the vectorized version is executed.
29170 
29171      TODO: The values in individual target tables have to be tuned or new
29172      fields may be needed. E.g., on K8, the default branch path is the
29173      not-taken path. If the taken path is predicted correctly, the minimum
29174      penalty of going down the taken-path is 1 cycle. If the taken-path is
29175      not predicted correctly, then the minimum penalty is 10 cycles.  */
29176 
29177   if (runtime_test)
29178     {
29179       return (-(ix86_cost->cond_taken_branch_cost));
29180     }
29181   else
29182     return 0;
29183 }
29184 
29185 /* Implement targetm.vectorize.builtin_vec_perm.  */
29186 
29187 static tree
29188 ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
29189 {
29190   tree itype = TREE_TYPE (vec_type);
29191   bool u = TYPE_UNSIGNED (itype);
29192   enum machine_mode vmode = TYPE_MODE (vec_type);
29193   enum ix86_builtins fcode = fcode; /* Silence bogus warning.  */
29194   bool ok = TARGET_SSE2;
29195 
29196   switch (vmode)
29197     {
29198     case V4DFmode:
29199       ok = TARGET_AVX;
29200       fcode = IX86_BUILTIN_VEC_PERM_V4DF;
29201       goto get_di;
29202     case V2DFmode:
29203       fcode = IX86_BUILTIN_VEC_PERM_V2DF;
29204     get_di:
29205       itype = ix86_get_builtin_type (IX86_BT_DI);
29206       break;
29207 
29208     case V8SFmode:
29209       ok = TARGET_AVX;
29210       fcode = IX86_BUILTIN_VEC_PERM_V8SF;
29211       goto get_si;
29212     case V4SFmode:
29213       ok = TARGET_SSE;
29214       fcode = IX86_BUILTIN_VEC_PERM_V4SF;
29215     get_si:
29216       itype = ix86_get_builtin_type (IX86_BT_SI);
29217       break;
29218 
29219     case V2DImode:
29220       fcode = u ? IX86_BUILTIN_VEC_PERM_V2DI_U : IX86_BUILTIN_VEC_PERM_V2DI;
29221       break;
29222     case V4SImode:
29223       fcode = u ? IX86_BUILTIN_VEC_PERM_V4SI_U : IX86_BUILTIN_VEC_PERM_V4SI;
29224       break;
29225     case V8HImode:
29226       fcode = u ? IX86_BUILTIN_VEC_PERM_V8HI_U : IX86_BUILTIN_VEC_PERM_V8HI;
29227       break;
29228     case V16QImode:
29229       fcode = u ? IX86_BUILTIN_VEC_PERM_V16QI_U : IX86_BUILTIN_VEC_PERM_V16QI;
29230       break;
29231     default:
29232       ok = false;
29233       break;
29234     }
29235 
29236   if (!ok)
29237     return NULL_TREE;
29238 
29239   *mask_type = itype;
29240   return ix86_builtins[(int) fcode];
29241 }
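
/* For reference (illustrative), the mask convention used by the vec_perm
   builtins returned above: each mask element selects one element of the
   concatenation {op0, op1}, so for a V4SFmode permute a mask of
   {1, 5, 2, 6} yields {op0[1], op1[1], op0[2], op1[2]}.  The mask
   element type chosen above (SI or DI) matches the vector element
   width.  */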
29242 
29243 /* Return a vector mode with twice as many elements as VMODE.  */
29244 /* ??? Consider moving this to a table generated by genmodes.c.  */
29245 
29246 static enum machine_mode
29247 doublesize_vector_mode (enum machine_mode vmode)
29248 {
29249   switch (vmode)
29250     {
29251     case V2SFmode:	return V4SFmode;
29252     case V1DImode:	return V2DImode;
29253     case V2SImode:	return V4SImode;
29254     case V4HImode:	return V8HImode;
29255     case V8QImode:	return V16QImode;
29256 
29257     case V2DFmode:	return V4DFmode;
29258     case V4SFmode:	return V8SFmode;
29259     case V2DImode:	return V4DImode;
29260     case V4SImode:	return V8SImode;
29261     case V8HImode:	return V16HImode;
29262     case V16QImode:	return V32QImode;
29263 
29264     case V4DFmode:	return V8DFmode;
29265     case V8SFmode:	return V16SFmode;
29266     case V4DImode:	return V8DImode;
29267     case V8SImode:	return V16SImode;
29268     case V16HImode:	return V32HImode;
29269     case V32QImode:	return V64QImode;
29270 
29271     default:
29272       gcc_unreachable ();
29273     }
29274 }
29275 
29276 /* Construct (set target (vec_select op0 (parallel perm))) and
29277    return true if that's a valid instruction in the active ISA.  */
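/* For example (illustrative): with a V4SImode target and
   perm = {2, 3, 0, 1}, expand_vselect emits
     (set target (vec_select:V4SI op0 (parallel [2 3 0 1])))
   which recog can match against the pshufd patterns in sse.md; if no
   pattern matches, the insn is removed again and false is returned.  */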
29278 
29279 static bool
29280 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
29281 {
29282   rtx rperm[MAX_VECT_LEN], x;
29283   unsigned i;
29284 
29285   for (i = 0; i < nelt; ++i)
29286     rperm[i] = GEN_INT (perm[i]);
29287 
29288   x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
29289   x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
29290   x = gen_rtx_SET (VOIDmode, target, x);
29291 
29292   x = emit_insn (x);
29293   if (recog_memoized (x) < 0)
29294     {
29295       remove_insn (x);
29296       return false;
29297     }
29298   return true;
29299 }
29300 
29301 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
29302 
29303 static bool
29304 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
29305 			const unsigned char *perm, unsigned nelt)
29306 {
29307   enum machine_mode v2mode;
29308   rtx x;
29309 
29310   v2mode = doublesize_vector_mode (GET_MODE (op0));
29311   x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
29312   return expand_vselect (target, x, perm, nelt);
29313 }
29314 
29315 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
29316    in terms of blendp[sd] / pblendw / pblendvb.  */
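/* For example (illustrative): a V4SImode blend with perm = {0, 5, 2, 7}
   takes elements 1 and 3 from op1.  The V4SImode case below doubles each
   selector into two bits, producing the V8HImode pblendw immediate 0xCC,
   and the operands are accessed as V8HImode subregs.  */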
29317 
29318 static bool
29319 expand_vec_perm_blend (struct expand_vec_perm_d *d)
29320 {
29321   enum machine_mode vmode = d->vmode;
29322   unsigned i, mask, nelt = d->nelt;
29323   rtx target, op0, op1, x;
29324 
29325   if (!TARGET_SSE4_1 || d->op0 == d->op1)
29326     return false;
29327   if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
29328     return false;
29329 
29330   /* This is a blend, not a permute.  Elements must stay in their
29331      respective lanes.  */
29332   for (i = 0; i < nelt; ++i)
29333     {
29334       unsigned e = d->perm[i];
29335       if (!(e == i || e == i + nelt))
29336 	return false;
29337     }
29338 
29339   if (d->testing_p)
29340     return true;
29341 
29342   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
29343      decision should be extracted elsewhere, so that we only try that
29344      sequence once all budget==3 options have been tried.  */
29345 
29346   /* For bytes, see if bytes move in pairs so we can use pblendw with
29347      an immediate argument, rather than pblendvb with a vector argument.  */
29348   if (vmode == V16QImode)
29349     {
29350       bool pblendw_ok = true;
29351       for (i = 0; i < 16 && pblendw_ok; i += 2)
29352 	pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
29353 
29354       if (!pblendw_ok)
29355 	{
29356 	  rtx rperm[16], vperm;
29357 
29358 	  for (i = 0; i < nelt; ++i)
29359 	    rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
29360 
29361 	  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
29362 	  vperm = force_reg (V16QImode, vperm);
29363 
29364 	  emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
29365 	  return true;
29366 	}
29367     }
29368 
29369   target = d->target;
29370   op0 = d->op0;
29371   op1 = d->op1;
29372   mask = 0;
29373 
29374   switch (vmode)
29375     {
29376     case V4DFmode:
29377     case V8SFmode:
29378     case V2DFmode:
29379     case V4SFmode:
29380     case V8HImode:
29381       for (i = 0; i < nelt; ++i)
29382 	mask |= (d->perm[i] >= nelt) << i;
29383       break;
29384 
29385     case V2DImode:
29386       for (i = 0; i < 2; ++i)
29387 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
29388       goto do_subreg;
29389 
29390     case V4SImode:
29391       for (i = 0; i < 4; ++i)
29392 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
29393       goto do_subreg;
29394 
29395     case V16QImode:
29396       for (i = 0; i < 8; ++i)
29397 	mask |= (d->perm[i * 2] >= 16) << i;
29398 
29399     do_subreg:
29400       vmode = V8HImode;
29401       target = gen_lowpart (vmode, target);
29402       op0 = gen_lowpart (vmode, op0);
29403       op1 = gen_lowpart (vmode, op1);
29404       break;
29405 
29406     default:
29407       gcc_unreachable ();
29408     }
29409 
29410   /* This matches five different patterns with the different modes.  */
29411   x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
29412   x = gen_rtx_SET (VOIDmode, target, x);
29413   emit_insn (x);
29414 
29415   return true;
29416 }
29417 
29418 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
29419    in terms of the variable form of vpermilps.
29420 
29421    Note that we will have already failed the immediate input vpermilps,
29422    which requires that the high and low part shuffle be identical; the
29423    variable form doesn't require that.  */
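/* For example (illustrative): perm = {1, 0, 3, 2, 5, 4, 7, 6} swaps
   adjacent elements within each 128-bit lane; the loop below reduces it
   to the control vector {1, 0, 3, 2, 1, 0, 3, 2}, which vpermilps
   interprets per lane.  */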
29424 
29425 static bool
29426 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
29427 {
29428   rtx rperm[8], vperm;
29429   unsigned i;
29430 
29431   if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
29432     return false;
29433 
29434   /* We can only permute within the 128-bit lane.  */
29435   for (i = 0; i < 8; ++i)
29436     {
29437       unsigned e = d->perm[i];
29438       if (i < 4 ? e >= 4 : e < 4)
29439 	return false;
29440     }
29441 
29442   if (d->testing_p)
29443     return true;
29444 
29445   for (i = 0; i < 8; ++i)
29446     {
29447       unsigned e = d->perm[i];
29448 
29449       /* Within each 128-bit lane, the elements of op0 are numbered
29450 	 from 0 and the elements of op1 are numbered from 4.  */
29451       if (e >= 8 + 4)
29452 	e -= 8;
29453       else if (e >= 4)
29454 	e -= 4;
29455 
29456       rperm[i] = GEN_INT (e);
29457     }
29458 
29459   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
29460   vperm = force_reg (V8SImode, vperm);
29461   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
29462 
29463   return true;
29464 }
29465 
29466 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
29467    in terms of pshufb or vpperm.  */
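/* For example (illustrative): reversing a V4SImode vector with
   perm = {3, 2, 1, 0} expands each element index into its four byte
   indices, giving the pshufb control
   {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}.  */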
29468 
29469 static bool
29470 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
29471 {
29472   unsigned i, nelt, eltsz;
29473   rtx rperm[16], vperm, target, op0, op1;
29474 
29475   if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
29476     return false;
29477   if (GET_MODE_SIZE (d->vmode) != 16)
29478     return false;
29479 
29480   if (d->testing_p)
29481     return true;
29482 
29483   nelt = d->nelt;
29484   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
29485 
29486   for (i = 0; i < nelt; ++i)
29487     {
29488       unsigned j, e = d->perm[i];
29489       for (j = 0; j < eltsz; ++j)
29490 	rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
29491     }
29492 
29493   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
29494   vperm = force_reg (V16QImode, vperm);
29495 
29496   target = gen_lowpart (V16QImode, d->target);
29497   op0 = gen_lowpart (V16QImode, d->op0);
29498   if (d->op0 == d->op1)
29499     emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
29500   else
29501     {
29502       op1 = gen_lowpart (V16QImode, d->op1);
29503       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
29504     }
29505 
29506   return true;
29507 }
29508 
29509 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
29510    in a single instruction.  */
29511 
29512 static bool
29513 expand_vec_perm_1 (struct expand_vec_perm_d *d)
29514 {
29515   unsigned i, nelt = d->nelt;
29516   unsigned char perm2[MAX_VECT_LEN];
29517 
29518   /* Check plain VEC_SELECT first, because AVX has instructions that could
29519      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
29520      input where SEL+CONCAT may not.  */
29521   if (d->op0 == d->op1)
29522     {
29523       int mask = nelt - 1;
29524 
29525       for (i = 0; i < nelt; i++)
29526 	perm2[i] = d->perm[i] & mask;
29527 
29528       if (expand_vselect (d->target, d->op0, perm2, nelt))
29529 	return true;
29530 
29531       /* There are plenty of patterns in sse.md that are written for
29532 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
29533 	 that should be changed, to avoid the nastiness here.  */
29534 
29535       /* Recognize interleave style patterns, which means incrementing
29536 	 every other permutation operand.  */
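      /* For example (illustrative): for V4SImode and perm = {0, 0, 1, 1}
	 this produces perm2 = {0, 4, 1, 5}, which the interleave-low
	 (punpckldq) pattern can match via SEL+CONCAT.  */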
29537       for (i = 0; i < nelt; i += 2)
29538 	{
29539 	  perm2[i] = d->perm[i] & mask;
29540 	  perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
29541 	}
29542       if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
29543 	return true;
29544 
29545       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
29546       if (nelt >= 4)
29547 	{
29548 	  for (i = 0; i < nelt; i += 4)
29549 	    {
29550 	      perm2[i + 0] = d->perm[i + 0] & mask;
29551 	      perm2[i + 1] = d->perm[i + 1] & mask;
29552 	      perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
29553 	      perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
29554 	    }
29555 
29556 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
29557 	    return true;
29558 	}
29559     }
29560 
29561   /* Finally, try the fully general two operand permute.  */
29562   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
29563     return true;
29564 
29565   /* Recognize interleave style patterns with reversed operands.  */
29566   if (d->op0 != d->op1)
29567     {
29568       for (i = 0; i < nelt; ++i)
29569 	{
29570 	  unsigned e = d->perm[i];
29571 	  if (e >= nelt)
29572 	    e -= nelt;
29573 	  else
29574 	    e += nelt;
29575 	  perm2[i] = e;
29576 	}
29577 
29578       if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
29579 	return true;
29580     }
29581 
29582   /* Try the SSE4.1 blend variable merge instructions.  */
29583   if (expand_vec_perm_blend (d))
29584     return true;
29585 
29586   /* Try one of the AVX vpermil variable permutations.  */
29587   if (expand_vec_perm_vpermil (d))
29588     return true;
29589 
29590   /* Try the SSSE3 pshufb or XOP vpperm variable permutation.  */
29591   if (expand_vec_perm_pshufb (d))
29592     return true;
29593 
29594   return false;
29595 }
29596 
29597 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
29598    in terms of a pair of pshuflw + pshufhw instructions.  */
29599 
29600 static bool
29601 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
29602 {
29603   unsigned char perm2[MAX_VECT_LEN];
29604   unsigned i;
29605   bool ok;
29606 
29607   if (d->vmode != V8HImode || d->op0 != d->op1)
29608     return false;
29609 
29610   /* The two permutations only operate in 64-bit lanes.  */
29611   for (i = 0; i < 4; ++i)
29612     if (d->perm[i] >= 4)
29613       return false;
29614   for (i = 4; i < 8; ++i)
29615     if (d->perm[i] < 4)
29616       return false;
29617 
29618   if (d->testing_p)
29619     return true;
29620 
29621   /* Emit the pshuflw.  */
29622   memcpy (perm2, d->perm, 4);
29623   for (i = 4; i < 8; ++i)
29624     perm2[i] = i;
29625   ok = expand_vselect (d->target, d->op0, perm2, 8);
29626   gcc_assert (ok);
29627 
29628   /* Emit the pshufhw.  */
29629   memcpy (perm2 + 4, d->perm + 4, 4);
29630   for (i = 0; i < 4; ++i)
29631     perm2[i] = i;
29632   ok = expand_vselect (d->target, d->target, perm2, 8);
29633   gcc_assert (ok);
29634 
29635   return true;
29636 }
29637 
29638 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
29639    the permutation using the SSSE3 palignr instruction.  This succeeds
29640    when all of the elements in PERM fit within one vector and we merely
29641    need to shift them down so that a single vector permutation has a
29642    chance to succeed.  */
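/* For example (illustrative): for V16QImode with perm[i] = i + 5, the
   palignr of {op1, op0} by 5 bytes already yields combined elements
   5 .. 20 in order, so the remaining permutation is the identity and no
   pshufb is needed.  */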
29643 
29644 static bool
29645 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
29646 {
29647   unsigned i, nelt = d->nelt;
29648   unsigned min, max;
29649   bool in_order, ok;
29650   rtx shift;
29651 
29652   /* Even with AVX, palignr only operates on 128-bit vectors.  */
29653   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
29654     return false;
29655 
29656   min = nelt, max = 0;
29657   for (i = 0; i < nelt; ++i)
29658     {
29659       unsigned e = d->perm[i];
29660       if (e < min)
29661 	min = e;
29662       if (e > max)
29663 	max = e;
29664     }
29665   if (min == 0 || max - min >= nelt)
29666     return false;
29667 
29668   /* Given that we have SSSE3, we know we'll be able to implement the
29669      single operand permutation after the palignr with pshufb.  */
29670   if (d->testing_p)
29671     return true;
29672 
29673   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
29674   emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
29675 				  gen_lowpart (TImode, d->op1),
29676 				  gen_lowpart (TImode, d->op0), shift));
29677 
29678   d->op0 = d->op1 = d->target;
29679 
29680   in_order = true;
29681   for (i = 0; i < nelt; ++i)
29682     {
29683       unsigned e = d->perm[i] - min;
29684       if (e != i)
29685 	in_order = false;
29686       d->perm[i] = e;
29687     }
29688 
29689   /* Test for the degenerate case where the alignment by itself
29690      produces the desired permutation.  */
29691   if (in_order)
29692     return true;
29693 
29694   ok = expand_vec_perm_1 (d);
29695   gcc_assert (ok);
29696 
29697   return ok;
29698 }
29699 
29700 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
29701    a two vector permutation into a single vector permutation by using
29702    an interleave operation to merge the vectors.  */
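/* For example (illustrative): for V4SFmode and perm = {5, 1, 4, 0},
   all elements come from the low halves, so an interleave-low produces
   T = {a0, b0, a1, b1}; the remapped final permutation on T is then
   {3, 2, 1, 0}, a single-input shuffle handled by expand_vec_perm_1.  */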
29703 
29704 static bool
29705 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
29706 {
29707   struct expand_vec_perm_d dremap, dfinal;
29708   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
29709   unsigned contents, h1, h2, h3, h4;
29710   unsigned char remap[2 * MAX_VECT_LEN];
29711   rtx seq;
29712   bool ok;
29713 
29714   if (d->op0 == d->op1)
29715     return false;
29716 
29717   /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
29718      lanes.  We can use similar techniques with the vperm2f128 instruction,
29719      but it requires slightly different logic.  */
29720   if (GET_MODE_SIZE (d->vmode) != 16)
29721     return false;
29722 
29723   /* Examine from whence the elements come.  */
29724   contents = 0;
29725   for (i = 0; i < nelt; ++i)
29726     contents |= 1u << d->perm[i];
29727 
29728   /* Split the two input vectors into 4 halves.  */
29729   h1 = (1u << nelt2) - 1;
29730   h2 = h1 << nelt2;
29731   h3 = h2 << nelt2;
29732   h4 = h3 << nelt2;
29733 
29734   memset (remap, 0xff, sizeof (remap));
29735   dremap = *d;
29736 
29737   /* If the elements come only from the low halves, use interleave low;
29738      similarly for interleave high.  If the elements come from mis-matched
29739      halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
29740   if ((contents & (h1 | h3)) == contents)
29741     {
29742       for (i = 0; i < nelt2; ++i)
29743 	{
29744 	  remap[i] = i * 2;
29745 	  remap[i + nelt] = i * 2 + 1;
29746 	  dremap.perm[i * 2] = i;
29747 	  dremap.perm[i * 2 + 1] = i + nelt;
29748 	}
29749     }
29750   else if ((contents & (h2 | h4)) == contents)
29751     {
29752       for (i = 0; i < nelt2; ++i)
29753 	{
29754 	  remap[i + nelt2] = i * 2;
29755 	  remap[i + nelt + nelt2] = i * 2 + 1;
29756 	  dremap.perm[i * 2] = i + nelt2;
29757 	  dremap.perm[i * 2 + 1] = i + nelt + nelt2;
29758 	}
29759     }
29760   else if ((contents & (h1 | h4)) == contents)
29761     {
29762       for (i = 0; i < nelt2; ++i)
29763 	{
29764 	  remap[i] = i;
29765 	  remap[i + nelt + nelt2] = i + nelt2;
29766 	  dremap.perm[i] = i;
29767 	  dremap.perm[i + nelt2] = i + nelt + nelt2;
29768 	}
29769       if (nelt != 4)
29770 	{
29771 	  dremap.vmode = V2DImode;
29772 	  dremap.nelt = 2;
29773 	  dremap.perm[0] = 0;
29774 	  dremap.perm[1] = 3;
29775 	}
29776     }
29777   else if ((contents & (h2 | h3)) == contents)
29778     {
29779       for (i = 0; i < nelt2; ++i)
29780 	{
29781 	  remap[i + nelt2] = i;
29782 	  remap[i + nelt] = i + nelt2;
29783 	  dremap.perm[i] = i + nelt2;
29784 	  dremap.perm[i + nelt2] = i + nelt;
29785 	}
29786       if (nelt != 4)
29787 	{
29788 	  dremap.vmode = V2DImode;
29789 	  dremap.nelt = 2;
29790 	  dremap.perm[0] = 1;
29791 	  dremap.perm[1] = 2;
29792 	}
29793     }
29794   else
29795     return false;
29796 
29797   /* Use the remapping array set up above to move the elements from their
29798      swizzled locations into their final destinations.  */
29799   dfinal = *d;
29800   for (i = 0; i < nelt; ++i)
29801     {
29802       unsigned e = remap[d->perm[i]];
29803       gcc_assert (e < nelt);
29804       dfinal.perm[i] = e;
29805     }
29806   dfinal.op0 = gen_reg_rtx (dfinal.vmode);
29807   dfinal.op1 = dfinal.op0;
29808   dremap.target = dfinal.op0;
29809 
29810   /* Test if the final remap can be done with a single insn.  For V4SFmode or
29811      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
29812   start_sequence ();
29813   ok = expand_vec_perm_1 (&dfinal);
29814   seq = get_insns ();
29815   end_sequence ();
29816 
29817   if (!ok)
29818     return false;
29819 
29820   if (dremap.vmode != dfinal.vmode)
29821     {
29822       dremap.target = gen_lowpart (dremap.vmode, dremap.target);
29823       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
29824       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
29825     }
29826 
29827   ok = expand_vec_perm_1 (&dremap);
29828   gcc_assert (ok);
29829 
29830   emit_insn (seq);
29831   return true;
29832 }
29833 
29834 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
29835    permutation with two pshufb insns and an ior.  We should have already
29836    failed all two instruction sequences.  */
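/* For example (illustrative): extracting the even V8HImode elements
   ({0, 2, 4, 6, 8, 10, 12, 14}) builds one control that gathers words
   0, 2, 4, 6 of op0 into the low half and zeroes the rest (bit 7 set),
   a second control that mirrors this for op1 into the high half, and
   the final ior merges the two results.  */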
29837 
29838 static bool
29839 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
29840 {
29841   rtx rperm[2][16], vperm, l, h, op, m128;
29842   unsigned int i, nelt, eltsz;
29843 
29844   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
29845     return false;
29846   gcc_assert (d->op0 != d->op1);
29847 
29848   nelt = d->nelt;
29849   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
29850 
29851   /* Generate two permutation masks.  If the required element is within
29852      the given vector it is shuffled into the proper lane.  If the required
29853      element is in the other vector, force a zero into the lane by setting
29854      bit 7 in the permutation mask.  */
29855   m128 = GEN_INT (-128);
29856   for (i = 0; i < nelt; ++i)
29857     {
29858       unsigned j, e = d->perm[i];
29859       unsigned which = (e >= nelt);
29860       if (e >= nelt)
29861 	e -= nelt;
29862 
29863       for (j = 0; j < eltsz; ++j)
29864 	{
29865 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
29866 	  rperm[1-which][i*eltsz + j] = m128;
29867 	}
29868     }
29869 
29870   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
29871   vperm = force_reg (V16QImode, vperm);
29872 
29873   l = gen_reg_rtx (V16QImode);
29874   op = gen_lowpart (V16QImode, d->op0);
29875   emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
29876 
29877   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
29878   vperm = force_reg (V16QImode, vperm);
29879 
29880   h = gen_reg_rtx (V16QImode);
29881   op = gen_lowpart (V16QImode, d->op1);
29882   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
29883 
29884   op = gen_lowpart (V16QImode, d->target);
29885   emit_insn (gen_iorv16qi3 (op, l, h));
29886 
29887   return true;
29888 }
29889 
29890 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
29891    and extract-odd permutations.  */
29892 
29893 static bool
29894 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
29895 {
29896   rtx t1, t2, t3, t4;
29897 
29898   switch (d->vmode)
29899     {
29900     case V4DFmode:
29901       t1 = gen_reg_rtx (V4DFmode);
29902       t2 = gen_reg_rtx (V4DFmode);
29903 
29904       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
29905       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
29906       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
29907 
29908       /* Now an unpck[lh]pd will produce the result required.  */
29909       if (odd)
29910 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
29911       else
29912 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
29913       emit_insn (t3);
29914       break;
29915 
29916     case V8SFmode:
29917       {
29918 	static const unsigned char perm1[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };
29919 	static const unsigned char perme[8] = { 0, 1,  8,  9, 4, 5, 12, 13 };
29920 	static const unsigned char permo[8] = { 2, 3, 10, 11, 6, 7, 14, 15 };
29921 
29922 	t1 = gen_reg_rtx (V8SFmode);
29923 	t2 = gen_reg_rtx (V8SFmode);
29924 	t3 = gen_reg_rtx (V8SFmode);
29925 	t4 = gen_reg_rtx (V8SFmode);
29926 
29927 	/* Shuffle within the 128-bit lanes to produce:
29928 	   { 0 2 1 3 4 6 5 7 } and { 8 a 9 b c e d f }.  */
29929 	expand_vselect (t1, d->op0, perm1, 8);
29930 	expand_vselect (t2, d->op1, perm1, 8);
29931 
29932 	/* Shuffle the lanes around to produce:
29933 	   { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
29934 	emit_insn (gen_avx_vperm2f128v8sf3 (t3, t1, t2, GEN_INT (0x20)));
29935 	emit_insn (gen_avx_vperm2f128v8sf3 (t4, t1, t2, GEN_INT (0x31)));
29936 
29937 	/* Now a vpermil2p will produce the result required.  */
29938 	/* ??? The vpermil2p requires a vector constant.  Another option
29939 	   is a unpck[lh]ps to merge the two vectors to produce
29940 	   { 0 4 2 6 8 c a e } or { 1 5 3 7 9 d b f }.  Then use another
29941 	   vpermilps to get the elements into the final order.  */
29942 	d->op0 = t3;
29943 	d->op1 = t4;
29944 	memcpy (d->perm, odd ? permo: perme, 8);
29945 	expand_vec_perm_vpermil (d);
29946       }
29947       break;
29948 
29949     case V2DFmode:
29950     case V4SFmode:
29951     case V2DImode:
29952     case V4SImode:
29953       /* These are always directly implementable by expand_vec_perm_1.  */
29954       gcc_unreachable ();
29955 
29956     case V8HImode:
29957       if (TARGET_SSSE3)
29958 	return expand_vec_perm_pshufb2 (d);
29959       else
29960 	{
29961 	  /* We need 2*log2(N)-1 operations to achieve odd/even
29962 	     with interleave. */
29963 	  t1 = gen_reg_rtx (V8HImode);
29964 	  t2 = gen_reg_rtx (V8HImode);
29965 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
29966 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
29967 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
29968 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
29969 	  if (odd)
29970 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
29971 	  else
29972 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
29973 	  emit_insn (t3);
29974 	}
29975       break;
29976 
29977     case V16QImode:
29978       if (TARGET_SSSE3)
29979 	return expand_vec_perm_pshufb2 (d);
29980       else
29981 	{
29982 	  t1 = gen_reg_rtx (V16QImode);
29983 	  t2 = gen_reg_rtx (V16QImode);
29984 	  t3 = gen_reg_rtx (V16QImode);
29985 	  emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
29986 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
29987 	  emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
29988 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
29989 	  emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
29990 	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
29991 	  if (odd)
29992 	    t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
29993 	  else
29994 	    t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
29995 	  emit_insn (t3);
29996 	}
29997       break;
29998 
29999     default:
30000       gcc_unreachable ();
30001     }
30002 
30003   return true;
30004 }
30005 
30006 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
30007    extract-even and extract-odd permutations.  */
30008 
30009 static bool
30010 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
30011 {
30012   unsigned i, odd, nelt = d->nelt;
30013 
30014   odd = d->perm[0];
30015   if (odd != 0 && odd != 1)
30016     return false;
30017 
30018   for (i = 1; i < nelt; ++i)
30019     if (d->perm[i] != 2 * i + odd)
30020       return false;
30021 
30022   return expand_vec_perm_even_odd_1 (d, odd);
30023 }
30024 
30025 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
30026    permutations.  We assume that expand_vec_perm_1 has already failed.  */
30027 
30028 static bool
30029 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
30030 {
30031   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
30032   enum machine_mode vmode = d->vmode;
30033   unsigned char perm2[4];
30034   rtx op0 = d->op0;
30035   bool ok;
30036 
30037   switch (vmode)
30038     {
30039     case V4DFmode:
30040     case V8SFmode:
30041       /* These are special-cased in sse.md so that we can optionally
30042 	 use the vbroadcast instruction.  They expand to two insns
30043 	 if the input happens to be in a register.  */
30044       gcc_unreachable ();
30045 
30046     case V2DFmode:
30047     case V2DImode:
30048     case V4SFmode:
30049     case V4SImode:
30050       /* These are always implementable using standard shuffle patterns.  */
30051       gcc_unreachable ();
30052 
30053     case V8HImode:
30054     case V16QImode:
30055       /* These can be implemented via interleave.  We save one insn by
30056 	 stopping once we have promoted to V4SImode and then use pshufd.  */
30057       do
30058 	{
30059 	  optab otab = vec_interleave_low_optab;
30060 
30061 	  if (elt >= nelt2)
30062 	    {
30063 	      otab = vec_interleave_high_optab;
30064 	      elt -= nelt2;
30065 	    }
30066 	  nelt2 /= 2;
30067 
30068 	  op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
30069 	  vmode = get_mode_wider_vector (vmode);
30070 	  op0 = gen_lowpart (vmode, op0);
30071 	}
30072       while (vmode != V4SImode);
30073 
30074       memset (perm2, elt, 4);
30075       ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
30076       gcc_assert (ok);
30077       return true;
30078 
30079     default:
30080       gcc_unreachable ();
30081     }
30082 }
30083 
30084 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
30085    broadcast permutations.  */
30086 
30087 static bool
30088 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
30089 {
30090   unsigned i, elt, nelt = d->nelt;
30091 
30092   if (d->op0 != d->op1)
30093     return false;
30094 
30095   elt = d->perm[0];
30096   for (i = 1; i < nelt; ++i)
30097     if (d->perm[i] != elt)
30098       return false;
30099 
30100   return expand_vec_perm_broadcast_1 (d);
30101 }
30102 
30103 /* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
30104    With all of the interface bits taken care of, perform the expansion
30105    in D and return true on success.  */
30106 
30107 static bool
30108 ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
30109 {
30110   /* Try a single instruction expansion.  */
30111   if (expand_vec_perm_1 (d))
30112     return true;
30113 
30114   /* Try sequences of two instructions.  */
30115 
30116   if (expand_vec_perm_pshuflw_pshufhw (d))
30117     return true;
30118 
30119   if (expand_vec_perm_palignr (d))
30120     return true;
30121 
30122   if (expand_vec_perm_interleave2 (d))
30123     return true;
30124 
30125   if (expand_vec_perm_broadcast (d))
30126     return true;
30127 
30128   /* Try sequences of three instructions.  */
30129 
30130   if (expand_vec_perm_pshufb2 (d))
30131     return true;
30132 
30133   /* ??? Look for narrow permutations whose element orderings would
30134      allow the promotion to a wider mode.  */
30135 
30136   /* ??? Look for sequences of interleave or a wider permute that place
30137      the data into the correct lanes for a half-vector shuffle like
30138      pshuf[lh]w or vpermilps.  */
30139 
30140   /* ??? Look for sequences of interleave that produce the desired results.
30141      The combinatorics of punpck[lh] get pretty ugly... */
30142 
30143   if (expand_vec_perm_even_odd (d))
30144     return true;
30145 
30146   return false;
30147 }
30148 
30149 /* Extract the values from the vector CST into the permutation array in D.
30150    Return 0 on error, 1 if all values from the permutation come from the
30151    first vector, 2 if all values from the second vector, and 3 otherwise.  */
30152 
30153 static int
30154 extract_vec_perm_cst (struct expand_vec_perm_d *d, tree cst)
30155 {
30156   tree list = TREE_VECTOR_CST_ELTS (cst);
30157   unsigned i, nelt = d->nelt;
30158   int ret = 0;
30159 
30160   for (i = 0; i < nelt; ++i, list = TREE_CHAIN (list))
30161     {
30162       unsigned HOST_WIDE_INT e;
30163 
30164       if (!host_integerp (TREE_VALUE (list), 1))
30165 	return 0;
30166       e = tree_low_cst (TREE_VALUE (list), 1);
30167       if (e >= 2 * nelt)
30168 	return 0;
30169 
30170       ret |= (e < nelt ? 1 : 2);
30171       d->perm[i] = e;
30172     }
30173   gcc_assert (list == NULL);
30174 
30175   /* If all elements come from the second vector, fold them to the first.  */
30176   if (ret == 2)
30177     for (i = 0; i < nelt; ++i)
30178       d->perm[i] -= nelt;
30179 
30180   return ret;
30181 }
30182 
30183 static rtx
30184 ix86_expand_vec_perm_builtin (tree exp)
30185 {
30186   struct expand_vec_perm_d d;
30187   tree arg0, arg1, arg2;
30188 
30189   arg0 = CALL_EXPR_ARG (exp, 0);
30190   arg1 = CALL_EXPR_ARG (exp, 1);
30191   arg2 = CALL_EXPR_ARG (exp, 2);
30192 
30193   d.vmode = TYPE_MODE (TREE_TYPE (arg0));
30194   d.nelt = GET_MODE_NUNITS (d.vmode);
30195   d.testing_p = false;
30196   gcc_assert (VECTOR_MODE_P (d.vmode));
30197 
30198   if (TREE_CODE (arg2) != VECTOR_CST)
30199     {
30200       error_at (EXPR_LOCATION (exp),
30201 		"vector permutation requires vector constant");
30202       goto exit_error;
30203     }
30204 
30205   switch (extract_vec_perm_cst (&d, arg2))
30206     {
30207     default:
30208       gcc_unreachable();
30209 
30210     case 0:
30211       error_at (EXPR_LOCATION (exp), "invalid vector permutation constant");
30212       goto exit_error;
30213 
30214     case 3:
30215       if (!operand_equal_p (arg0, arg1, 0))
30216 	{
30217 	  d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
30218 	  d.op0 = force_reg (d.vmode, d.op0);
30219 	  d.op1 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
30220 	  d.op1 = force_reg (d.vmode, d.op1);
30221 	  break;
30222 	}
30223 
30224       /* The elements of PERM do not suggest that only the first operand
30225 	 is used, but both operands are identical.  Allow easier matching
30226 	 of the permutation by folding the permutation into the single
30227 	 input vector.  */
30228       {
30229 	unsigned i, nelt = d.nelt;
30230 	for (i = 0; i < nelt; ++i)
30231 	  if (d.perm[i] >= nelt)
30232 	    d.perm[i] -= nelt;
30233       }
30234       /* FALLTHRU */
30235 
30236     case 1:
30237       d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
30238       d.op0 = force_reg (d.vmode, d.op0);
30239       d.op1 = d.op0;
30240       break;
30241 
30242     case 2:
30243       d.op0 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
30244       d.op0 = force_reg (d.vmode, d.op0);
30245       d.op1 = d.op0;
30246       break;
30247     }
30248 
30249   d.target = gen_reg_rtx (d.vmode);
30250   if (ix86_expand_vec_perm_builtin_1 (&d))
30251     return d.target;
30252 
30253   /* For compiler-generated permutations, we should never get here, because
30254      the compiler should also be checking the ok hook.  But since this is a
30255      builtin the user has access to, don't abort.  */
30256   switch (d.nelt)
30257     {
30258     case 2:
30259       sorry ("vector permutation (%d %d)", d.perm[0], d.perm[1]);
30260       break;
30261     case 4:
30262       sorry ("vector permutation (%d %d %d %d)",
30263 	     d.perm[0], d.perm[1], d.perm[2], d.perm[3]);
30264       break;
30265     case 8:
30266       sorry ("vector permutation (%d %d %d %d %d %d %d %d)",
30267 	     d.perm[0], d.perm[1], d.perm[2], d.perm[3],
30268 	     d.perm[4], d.perm[5], d.perm[6], d.perm[7]);
30269       break;
30270     case 16:
30271       sorry ("vector permutation "
30272 	     "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)",
30273 	     d.perm[0], d.perm[1], d.perm[2], d.perm[3],
30274 	     d.perm[4], d.perm[5], d.perm[6], d.perm[7],
30275 	     d.perm[8], d.perm[9], d.perm[10], d.perm[11],
30276 	     d.perm[12], d.perm[13], d.perm[14], d.perm[15]);
30277       break;
30278     default:
30279       gcc_unreachable ();
30280     }
30281  exit_error:
30282   return CONST0_RTX (d.vmode);
30283 }
30284 
30285 /* Implement targetm.vectorize.builtin_vec_perm_ok.  */
30286 
30287 static bool
30288 ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
30289 {
30290   struct expand_vec_perm_d d;
30291   int vec_mask;
30292   bool ret, one_vec;
30293 
30294   d.vmode = TYPE_MODE (vec_type);
30295   d.nelt = GET_MODE_NUNITS (d.vmode);
30296   d.testing_p = true;
30297 
30298   /* Given sufficient ISA support we can just return true here
30299      for selected vector modes.  */
30300   if (GET_MODE_SIZE (d.vmode) == 16)
30301     {
30302       /* All implementable with a single vpperm insn.  */
30303       if (TARGET_XOP)
30304 	return true;
30305       /* All implementable with 2 pshufb + 1 ior.  */
30306       if (TARGET_SSSE3)
30307 	return true;
30308       /* All implementable with shufpd or unpck[lh]pd.  */
30309       if (d.nelt == 2)
30310 	return true;
30311     }
30312 
30313   vec_mask = extract_vec_perm_cst (&d, mask);
30314 
30315   /* This hook cannot be called in response to something that the
30316      user does (unlike the builtin expander), so we should never see
30317      an error generated from the extract.  */
30318   gcc_assert (vec_mask > 0 && vec_mask <= 3);
30319   one_vec = (vec_mask != 3);
30320 
30321   /* Implementable with shufps or pshufd.  */
30322   if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
30323     return true;
30324 
30325   /* Otherwise we have to go through the motions and see if we can
30326      figure out how to generate the requested permutation.  */
30327   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
30328   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
30329   if (!one_vec)
30330     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
30331 
30332   start_sequence ();
30333   ret = ix86_expand_vec_perm_builtin_1 (&d);
30334   end_sequence ();
30335 
30336   return ret;
30337 }
30338 
30339 void
30340 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
30341 {
30342   struct expand_vec_perm_d d;
30343   unsigned i, nelt;
30344 
30345   d.target = targ;
30346   d.op0 = op0;
30347   d.op1 = op1;
30348   d.vmode = GET_MODE (targ);
30349   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
30350   d.testing_p = false;
30351 
30352   for (i = 0; i < nelt; ++i)
30353     d.perm[i] = i * 2 + odd;
30354 
30355   /* We'll either be able to implement the permutation directly...  */
30356   if (expand_vec_perm_1 (&d))
30357     return;
30358 
30359   /* ... or we use the special-case patterns.  */
30360   expand_vec_perm_even_odd_1 (&d, odd);
30361 }
30362 
30363 /* This function returns the calling-ABI-specific va_list type node,
30364    i.e. the va_list type appropriate for FNDECL.  */
30365 
30366 tree
30367 ix86_fn_abi_va_list (tree fndecl)
30368 {
30369   if (!TARGET_64BIT)
30370     return va_list_type_node;
30371   gcc_assert (fndecl != NULL_TREE);
30372 
30373   if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
30374     return ms_va_list_type_node;
30375   else
30376     return sysv_va_list_type_node;
30377 }
30378 
30379 /* Returns the canonical va_list type specified by TYPE.  If there
30380    is no valid TYPE provided, it returns NULL_TREE.  */
30381 
30382 tree
30383 ix86_canonical_va_list_type (tree type)
30384 {
30385   tree wtype, htype;
30386 
30387   /* Resolve references and pointers to va_list type.  */
30388   if (INDIRECT_REF_P (type))
30389     type = TREE_TYPE (type);
30390   else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
30391     type = TREE_TYPE (type);
30392 
30393   if (TARGET_64BIT)
30394     {
30395       wtype = va_list_type_node;
30396       gcc_assert (wtype != NULL_TREE);
30397       htype = type;
30398       if (TREE_CODE (wtype) == ARRAY_TYPE)
30399 	{
30400 	  /* If va_list is an array type, the argument may have decayed
30401 	     to a pointer type, e.g. by being passed to another function.
30402 	     In that case, unwrap both types so that we can compare the
30403 	     underlying records.  */
30404 	  if (TREE_CODE (htype) == ARRAY_TYPE
30405 	      || POINTER_TYPE_P (htype))
30406 	    {
30407 	      wtype = TREE_TYPE (wtype);
30408 	      htype = TREE_TYPE (htype);
30409 	    }
30410 	}
30411       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
30412 	return va_list_type_node;
30413       wtype = sysv_va_list_type_node;
30414       gcc_assert (wtype != NULL_TREE);
30415       htype = type;
30416       if (TREE_CODE (wtype) == ARRAY_TYPE)
30417 	{
30418 	  /* If va_list is an array type, the argument may have decayed
30419 	     to a pointer type, e.g. by being passed to another function.
30420 	     In that case, unwrap both types so that we can compare the
30421 	     underlying records.  */
30422 	  if (TREE_CODE (htype) == ARRAY_TYPE
30423 	      || POINTER_TYPE_P (htype))
30424 	    {
30425 	      wtype = TREE_TYPE (wtype);
30426 	      htype = TREE_TYPE (htype);
30427 	    }
30428 	}
30429       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
30430 	return sysv_va_list_type_node;
30431       wtype = ms_va_list_type_node;
30432       gcc_assert (wtype != NULL_TREE);
30433       htype = type;
30434       if (TREE_CODE (wtype) == ARRAY_TYPE)
30435 	{
30436 	  /* If va_list is an array type, the argument may have decayed
30437 	     to a pointer type, e.g. by being passed to another function.
30438 	     In that case, unwrap both types so that we can compare the
30439 	     underlying records.  */
30440 	  if (TREE_CODE (htype) == ARRAY_TYPE
30441 	      || POINTER_TYPE_P (htype))
30442 	    {
30443 	      wtype = TREE_TYPE (wtype);
30444 	      htype = TREE_TYPE (htype);
30445 	    }
30446 	}
30447       if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
30448 	return ms_va_list_type_node;
30449       return NULL_TREE;
30450     }
30451   return std_canonical_va_list_type (type);
30452 }
30453 
30454 /* Iterate through the target-specific builtin types for va_list.
30455    IDX denotes the iterator, *PTREE is set to the result type of
30456    the va_list builtin, and *PNAME to its internal name.
30457    Returns zero if there is no element for this index; otherwise
30458    IDX should be increased for the next call.
30459    Note that the base builtin name, __builtin_va_list, is not enumerated.
30460    Used from c_common_nodes_and_builtins.  */
30461 
30462 int
30463 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
30464 {
30465   if (!TARGET_64BIT)
30466     return 0;
30467   switch (idx) {
30468   case 0:
30469     *ptree = ms_va_list_type_node;
30470     *pname = "__builtin_ms_va_list";
30471     break;
30472   case 1:
30473     *ptree = sysv_va_list_type_node;
30474     *pname = "__builtin_sysv_va_list";
30475     break;
30476   default:
30477     return 0;
30478   }
30479   return 1;
30480 }
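
/* Illustrative sketch, not part of the original source: a front end is
   expected to walk this enumeration until it returns zero.  The real call
   site is c_common_nodes_and_builtins; the loop shape and the helper name
   register_va_list_type below are assumptions made for the example.

     const char *pname;
     tree ptree;
     int i;

     for (i = 0; ix86_enum_va_list (i, &pname, &ptree); ++i)
       register_va_list_type (pname, ptree);   // hypothetical helper
*/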
30481 
30482 /* Initialize the GCC target structure.  */
30483 #undef TARGET_RETURN_IN_MEMORY
30484 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
30485 
30486 #undef TARGET_LEGITIMIZE_ADDRESS
30487 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
30488 
30489 #undef TARGET_ATTRIBUTE_TABLE
30490 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
30491 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
30492 #  undef TARGET_MERGE_DECL_ATTRIBUTES
30493 #  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
30494 #endif
30495 
30496 #undef TARGET_COMP_TYPE_ATTRIBUTES
30497 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
30498 
30499 #undef TARGET_INIT_BUILTINS
30500 #define TARGET_INIT_BUILTINS ix86_init_builtins
30501 #undef TARGET_BUILTIN_DECL
30502 #define TARGET_BUILTIN_DECL ix86_builtin_decl
30503 #undef TARGET_EXPAND_BUILTIN
30504 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
30505 
30506 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
30507 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
30508   ix86_builtin_vectorized_function
30509 
30510 #undef TARGET_VECTORIZE_BUILTIN_CONVERSION
30511 #define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion
30512 
30513 #undef TARGET_BUILTIN_RECIPROCAL
30514 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
30515 
30516 #undef TARGET_ASM_FUNCTION_EPILOGUE
30517 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
30518 
30519 #undef TARGET_ENCODE_SECTION_INFO
30520 #ifndef SUBTARGET_ENCODE_SECTION_INFO
30521 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
30522 #else
30523 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
30524 #endif
30525 
30526 #undef TARGET_ASM_OPEN_PAREN
30527 #define TARGET_ASM_OPEN_PAREN ""
30528 #undef TARGET_ASM_CLOSE_PAREN
30529 #define TARGET_ASM_CLOSE_PAREN ""
30530 
30531 #undef TARGET_ASM_BYTE_OP
30532 #define TARGET_ASM_BYTE_OP ASM_BYTE
30533 
30534 #undef TARGET_ASM_ALIGNED_HI_OP
30535 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
30536 #undef TARGET_ASM_ALIGNED_SI_OP
30537 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
30538 #ifdef ASM_QUAD
30539 #undef TARGET_ASM_ALIGNED_DI_OP
30540 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
30541 #endif
30542 
30543 #undef TARGET_ASM_UNALIGNED_HI_OP
30544 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
30545 #undef TARGET_ASM_UNALIGNED_SI_OP
30546 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
30547 #undef TARGET_ASM_UNALIGNED_DI_OP
30548 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
30549 
30550 #undef TARGET_SCHED_ADJUST_COST
30551 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
30552 #undef TARGET_SCHED_ISSUE_RATE
30553 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
30554 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30555 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30556   ia32_multipass_dfa_lookahead
30557 
30558 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30559 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
30560 
30561 #ifdef HAVE_AS_TLS
30562 #undef TARGET_HAVE_TLS
30563 #define TARGET_HAVE_TLS true
30564 #endif
30565 #undef TARGET_CANNOT_FORCE_CONST_MEM
30566 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
30567 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30568 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
30569 
30570 #undef TARGET_DELEGITIMIZE_ADDRESS
30571 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
30572 
30573 #undef TARGET_MS_BITFIELD_LAYOUT_P
30574 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
30575 
30576 #if TARGET_MACHO
30577 #undef TARGET_BINDS_LOCAL_P
30578 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
30579 #endif
30580 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
30581 #undef TARGET_BINDS_LOCAL_P
30582 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
30583 #endif
30584 
30585 #undef TARGET_ASM_OUTPUT_MI_THUNK
30586 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
30587 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30588 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
30589 
30590 #undef TARGET_ASM_FILE_START
30591 #define TARGET_ASM_FILE_START x86_file_start
30592 
30593 #undef TARGET_DEFAULT_TARGET_FLAGS
30594 #define TARGET_DEFAULT_TARGET_FLAGS	\
30595   (TARGET_DEFAULT			\
30596    | TARGET_SUBTARGET_DEFAULT		\
30597    | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT \
30598    | MASK_FUSED_MADD)
30599 
30600 #undef TARGET_HANDLE_OPTION
30601 #define TARGET_HANDLE_OPTION ix86_handle_option
30602 
30603 #undef TARGET_RTX_COSTS
30604 #define TARGET_RTX_COSTS ix86_rtx_costs
30605 #undef TARGET_ADDRESS_COST
30606 #define TARGET_ADDRESS_COST ix86_address_cost
30607 
30608 #undef TARGET_FIXED_CONDITION_CODE_REGS
30609 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
30610 #undef TARGET_CC_MODES_COMPATIBLE
30611 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
30612 
30613 #undef TARGET_MACHINE_DEPENDENT_REORG
30614 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
30615 
30616 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
30617 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
30618 
30619 #undef TARGET_BUILD_BUILTIN_VA_LIST
30620 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
30621 
30622 #undef TARGET_FN_ABI_VA_LIST
30623 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
30624 
30625 #undef TARGET_CANONICAL_VA_LIST_TYPE
30626 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
30627 
30628 #undef TARGET_EXPAND_BUILTIN_VA_START
30629 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
30630 
30631 #undef TARGET_MD_ASM_CLOBBERS
30632 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
30633 
30634 #undef TARGET_PROMOTE_PROTOTYPES
30635 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
30636 #undef TARGET_STRUCT_VALUE_RTX
30637 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
30638 #undef TARGET_SETUP_INCOMING_VARARGS
30639 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
30640 #undef TARGET_MUST_PASS_IN_STACK
30641 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
30642 #undef TARGET_PASS_BY_REFERENCE
30643 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
30644 #undef TARGET_INTERNAL_ARG_POINTER
30645 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
30646 #undef TARGET_UPDATE_STACK_BOUNDARY
30647 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
30648 #undef TARGET_GET_DRAP_RTX
30649 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
30650 #undef TARGET_STRICT_ARGUMENT_NAMING
30651 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
30652 #undef TARGET_STATIC_CHAIN
30653 #define TARGET_STATIC_CHAIN ix86_static_chain
30654 #undef TARGET_TRAMPOLINE_INIT
30655 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
30656 
30657 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30658 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
30659 
30660 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30661 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
30662 
30663 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30664 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
30665 
30666 #undef TARGET_C_MODE_FOR_SUFFIX
30667 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
30668 
30669 #ifdef HAVE_AS_TLS
30670 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
30671 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
30672 #endif
30673 
30674 #ifdef SUBTARGET_INSERT_ATTRIBUTES
30675 #undef TARGET_INSERT_ATTRIBUTES
30676 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
30677 #endif
30678 
30679 #undef TARGET_MANGLE_TYPE
30680 #define TARGET_MANGLE_TYPE ix86_mangle_type
30681 
30682 #undef TARGET_STACK_PROTECT_FAIL
30683 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
30684 
30685 #undef TARGET_FUNCTION_VALUE
30686 #define TARGET_FUNCTION_VALUE ix86_function_value
30687 
30688 #undef TARGET_SECONDARY_RELOAD
30689 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
30690 
30691 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30692 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30693   ix86_builtin_vectorization_cost
30694 #undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
30695 #define TARGET_VECTORIZE_BUILTIN_VEC_PERM \
30696   ix86_vectorize_builtin_vec_perm
30697 #undef TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK
30698 #define TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK \
30699   ix86_vectorize_builtin_vec_perm_ok
30700 
30701 #undef TARGET_SET_CURRENT_FUNCTION
30702 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
30703 
30704 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30705 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
30706 
30707 #undef TARGET_OPTION_SAVE
30708 #define TARGET_OPTION_SAVE ix86_function_specific_save
30709 
30710 #undef TARGET_OPTION_RESTORE
30711 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
30712 
30713 #undef TARGET_OPTION_PRINT
30714 #define TARGET_OPTION_PRINT ix86_function_specific_print
30715 
30716 #undef TARGET_CAN_INLINE_P
30717 #define TARGET_CAN_INLINE_P ix86_can_inline_p
30718 
30719 #undef TARGET_EXPAND_TO_RTL_HOOK
30720 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
30721 
30722 #undef TARGET_LEGITIMATE_ADDRESS_P
30723 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
30724 
30725 #undef TARGET_IRA_COVER_CLASSES
30726 #define TARGET_IRA_COVER_CLASSES i386_ira_cover_classes
30727 
30728 #undef TARGET_FRAME_POINTER_REQUIRED
30729 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
30730 
30731 #undef TARGET_CAN_ELIMINATE
30732 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
30733 
30734 #undef TARGET_ASM_CODE_END
30735 #define TARGET_ASM_CODE_END ix86_code_end
30736 
30737 #if TARGET_MACHO
30738 #undef TARGET_INIT_LIBFUNCS
30739 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
30740 #endif
30741 
30742 struct gcc_target targetm = TARGET_INITIALIZER;
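
/* Illustrative note, not part of the original source: TARGET_INITIALIZER
   (from target-def.h) expands to an aggregate initializer whose fields pick
   up the most recent #define for each TARGET_* hook above and fall back to
   the generic defaults for hooks this file does not override.  Code outside
   the backend reaches the i386 implementations only through the targetm
   vector, e.g. (sketch):

     if (targetm.calls.return_in_memory (type, fntype))
       // ... the caller passes a hidden return-slot pointer ...
*/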
30743 
30744 #include "gt-i386.h"
30745