xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/spu/spu.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /* Copyright (C) 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
2 
3    This file is free software; you can redistribute it and/or modify it under
4    the terms of the GNU General Public License as published by the Free
5    Software Foundation; either version 3 of the License, or (at your option)
6    any later version.
7 
8    This file is distributed in the hope that it will be useful, but WITHOUT
9    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11    for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with GCC; see the file COPYING3.  If not see
15    <http://www.gnu.org/licenses/>.  */
16 
17 #include "config.h"
18 #include "system.h"
19 #include "coretypes.h"
20 #include "tm.h"
21 #include "rtl.h"
22 #include "regs.h"
23 #include "hard-reg-set.h"
24 #include "real.h"
25 #include "insn-config.h"
26 #include "conditions.h"
27 #include "insn-attr.h"
28 #include "flags.h"
29 #include "recog.h"
30 #include "obstack.h"
31 #include "tree.h"
32 #include "expr.h"
33 #include "optabs.h"
34 #include "except.h"
35 #include "function.h"
36 #include "output.h"
37 #include "basic-block.h"
38 #include "integrate.h"
39 #include "toplev.h"
40 #include "ggc.h"
41 #include "hashtab.h"
42 #include "tm_p.h"
43 #include "target.h"
44 #include "target-def.h"
45 #include "langhooks.h"
46 #include "reload.h"
47 #include "cfglayout.h"
48 #include "sched-int.h"
49 #include "params.h"
50 #include "assert.h"
51 #include "machmode.h"
52 #include "gimple.h"
53 #include "tm-constrs.h"
54 #include "ddg.h"
55 #include "sbitmap.h"
56 #include "timevar.h"
57 #include "df.h"
58 
59 /* Builtin types, data and prototypes. */
60 
61 enum spu_builtin_type_index
62 {
63   SPU_BTI_END_OF_PARAMS,
64 
65   /* We create new type nodes for these. */
66   SPU_BTI_V16QI,
67   SPU_BTI_V8HI,
68   SPU_BTI_V4SI,
69   SPU_BTI_V2DI,
70   SPU_BTI_V4SF,
71   SPU_BTI_V2DF,
72   SPU_BTI_UV16QI,
73   SPU_BTI_UV8HI,
74   SPU_BTI_UV4SI,
75   SPU_BTI_UV2DI,
76 
77   /* A 16-byte type. (Implemented with V16QI_type_node) */
78   SPU_BTI_QUADWORD,
79 
80   /* These all correspond to intSI_type_node */
81   SPU_BTI_7,
82   SPU_BTI_S7,
83   SPU_BTI_U7,
84   SPU_BTI_S10,
85   SPU_BTI_S10_4,
86   SPU_BTI_U14,
87   SPU_BTI_16,
88   SPU_BTI_S16,
89   SPU_BTI_S16_2,
90   SPU_BTI_U16,
91   SPU_BTI_U16_2,
92   SPU_BTI_U18,
93 
94   /* These correspond to the standard types */
95   SPU_BTI_INTQI,
96   SPU_BTI_INTHI,
97   SPU_BTI_INTSI,
98   SPU_BTI_INTDI,
99 
100   SPU_BTI_UINTQI,
101   SPU_BTI_UINTHI,
102   SPU_BTI_UINTSI,
103   SPU_BTI_UINTDI,
104 
105   SPU_BTI_FLOAT,
106   SPU_BTI_DOUBLE,
107 
108   SPU_BTI_VOID,
109   SPU_BTI_PTR,
110 
111   SPU_BTI_MAX
112 };
113 
114 #define V16QI_type_node               (spu_builtin_types[SPU_BTI_V16QI])
115 #define V8HI_type_node                (spu_builtin_types[SPU_BTI_V8HI])
116 #define V4SI_type_node                (spu_builtin_types[SPU_BTI_V4SI])
117 #define V2DI_type_node                (spu_builtin_types[SPU_BTI_V2DI])
118 #define V4SF_type_node                (spu_builtin_types[SPU_BTI_V4SF])
119 #define V2DF_type_node                (spu_builtin_types[SPU_BTI_V2DF])
120 #define unsigned_V16QI_type_node      (spu_builtin_types[SPU_BTI_UV16QI])
121 #define unsigned_V8HI_type_node       (spu_builtin_types[SPU_BTI_UV8HI])
122 #define unsigned_V4SI_type_node       (spu_builtin_types[SPU_BTI_UV4SI])
123 #define unsigned_V2DI_type_node       (spu_builtin_types[SPU_BTI_UV2DI])
124 
125 static GTY(()) tree spu_builtin_types[SPU_BTI_MAX];
126 
127 struct spu_builtin_range
128 {
129   int low, high;
130 };
131 
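/* Range of values accepted for each immediate-operand builtin type,
   in the same order as the SPU_BTI_7 .. SPU_BTI_U18 entries above
   (see the per-entry comments in the initializer below).  */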
132 static struct spu_builtin_range spu_builtin_range[] = {
133   {-0x40ll, 0x7fll},		/* SPU_BTI_7     */
134   {-0x40ll, 0x3fll},		/* SPU_BTI_S7    */
135   {0ll, 0x7fll},		/* SPU_BTI_U7    */
136   {-0x200ll, 0x1ffll},		/* SPU_BTI_S10   */
137   {-0x2000ll, 0x1fffll},	/* SPU_BTI_S10_4 */
138   {0ll, 0x3fffll},		/* SPU_BTI_U14   */
139   {-0x8000ll, 0xffffll},	/* SPU_BTI_16    */
140   {-0x8000ll, 0x7fffll},	/* SPU_BTI_S16   */
141   {-0x20000ll, 0x1ffffll},	/* SPU_BTI_S16_2 */
142   {0ll, 0xffffll},		/* SPU_BTI_U16   */
143   {0ll, 0x3ffffll},		/* SPU_BTI_U16_2 */
144   {0ll, 0x3ffffll},		/* SPU_BTI_U18   */
145 };
146 
147 
148 /*  Target specific attribute specifications.  */
149 char regs_ever_allocated[FIRST_PSEUDO_REGISTER];
150 
151 /*  Prototypes and external defs.  */
152 static void spu_init_builtins (void);
153 static tree spu_builtin_decl (unsigned, bool);
154 static unsigned char spu_scalar_mode_supported_p (enum machine_mode mode);
155 static unsigned char spu_vector_mode_supported_p (enum machine_mode mode);
156 static bool spu_legitimate_address_p (enum machine_mode, rtx, bool);
157 static bool spu_addr_space_legitimate_address_p (enum machine_mode, rtx,
158 						 bool, addr_space_t);
159 static rtx adjust_operand (rtx op, HOST_WIDE_INT * start);
160 static rtx get_pic_reg (void);
161 static int need_to_save_reg (int regno, int saving);
162 static rtx frame_emit_store (int regno, rtx addr, HOST_WIDE_INT offset);
163 static rtx frame_emit_load (int regno, rtx addr, HOST_WIDE_INT offset);
164 static rtx frame_emit_add_imm (rtx dst, rtx src, HOST_WIDE_INT imm,
165 			       rtx scratch);
166 static void emit_nop_for_insn (rtx insn);
167 static bool insn_clobbers_hbr (rtx insn);
168 static void spu_emit_branch_hint (rtx before, rtx branch, rtx target,
169 				  int distance, sbitmap blocks);
170 static rtx spu_emit_vector_compare (enum rtx_code rcode, rtx op0, rtx op1,
171 	                            enum machine_mode dmode);
172 static rtx get_branch_target (rtx branch);
173 static void spu_machine_dependent_reorg (void);
174 static int spu_sched_issue_rate (void);
175 static int spu_sched_variable_issue (FILE * dump, int verbose, rtx insn,
176 				     int can_issue_more);
177 static int get_pipe (rtx insn);
178 static int spu_sched_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost);
179 static void spu_sched_init_global (FILE *, int, int);
180 static void spu_sched_init (FILE *, int, int);
181 static int spu_sched_reorder (FILE *, int, rtx *, int *, int);
182 static tree spu_handle_fndecl_attribute (tree * node, tree name, tree args,
183 					 int flags,
184 					 unsigned char *no_add_attrs);
185 static tree spu_handle_vector_attribute (tree * node, tree name, tree args,
186 					 int flags,
187 					 unsigned char *no_add_attrs);
188 static int spu_naked_function_p (tree func);
189 static unsigned char spu_pass_by_reference (CUMULATIVE_ARGS *cum, enum machine_mode mode,
190 					    const_tree type, unsigned char named);
191 static tree spu_build_builtin_va_list (void);
192 static void spu_va_start (tree, rtx);
193 static tree spu_gimplify_va_arg_expr (tree valist, tree type,
194 				      gimple_seq * pre_p, gimple_seq * post_p);
195 static int store_with_one_insn_p (rtx mem);
196 static int mem_is_padded_component_ref (rtx x);
197 static int reg_aligned_for_addr (rtx x);
198 static bool spu_assemble_integer (rtx x, unsigned int size, int aligned_p);
199 static void spu_asm_globalize_label (FILE * file, const char *name);
200 static unsigned char spu_rtx_costs (rtx x, int code, int outer_code,
201 				    int *total, bool speed);
202 static unsigned char spu_function_ok_for_sibcall (tree decl, tree exp);
203 static void spu_init_libfuncs (void);
204 static bool spu_return_in_memory (const_tree type, const_tree fntype);
205 static void fix_range (const char *);
206 static void spu_encode_section_info (tree, rtx, int);
207 static rtx spu_legitimize_address (rtx, rtx, enum machine_mode);
208 static rtx spu_addr_space_legitimize_address (rtx, rtx, enum machine_mode,
209 					      addr_space_t);
210 static tree spu_builtin_mul_widen_even (tree);
211 static tree spu_builtin_mul_widen_odd (tree);
212 static tree spu_builtin_mask_for_load (void);
213 static int spu_builtin_vectorization_cost (bool);
214 static bool spu_vector_alignment_reachable (const_tree, bool);
215 static tree spu_builtin_vec_perm (tree, tree *);
216 static enum machine_mode spu_addr_space_pointer_mode (addr_space_t);
217 static enum machine_mode spu_addr_space_address_mode (addr_space_t);
218 static bool spu_addr_space_subset_p (addr_space_t, addr_space_t);
219 static rtx spu_addr_space_convert (rtx, tree, tree);
220 static int spu_sms_res_mii (struct ddg *g);
221 static void asm_file_start (void);
222 static unsigned int spu_section_type_flags (tree, const char *, int);
223 static section *spu_select_section (tree, int, unsigned HOST_WIDE_INT);
224 static void spu_unique_section (tree, int);
225 static rtx spu_expand_load (rtx, rtx, rtx, int);
226 static void spu_trampoline_init (rtx, tree, rtx);
227 
228 extern const char *reg_names[];
229 
230 /* Which instruction set architecture to use.  */
231 int spu_arch;
232 /* Which cpu are we tuning for.  */
233 int spu_tune;
234 
235 /* The hardware requires 8 insns between a hint and the branch it
236    affects.  This variable describes how many rtl instructions the
237    compiler needs to see before inserting a hint, and then the compiler
238    will insert enough nops to make it at least 8 insns.  The default is
239    for the compiler to allow up to 2 nops to be emitted.  The nops are
240    inserted in pairs, so we round down. */
241 int spu_hint_dist = (8*4) - (2*4);
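/* For example: 8 insns * 4 bytes = 32 bytes between the hint and the
   branch; allowing the default of 2 nops (2 * 4 bytes) gives the
   initial value 32 - 8 = 24 above.  spu_override_options recomputes
   this from spu_max_nops.  */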
242 
243 /* Determines whether we run variable tracking in machine dependent
244    reorganization.  */
245 static int spu_flag_var_tracking;
246 
247 enum spu_immediate {
248   SPU_NONE,
249   SPU_IL,
250   SPU_ILA,
251   SPU_ILH,
252   SPU_ILHU,
253   SPU_ORI,
254   SPU_ORHI,
255   SPU_ORBI,
256   SPU_IOHL
257 };
258 enum immediate_class
259 {
260   IC_POOL,			/* constant pool */
261   IC_IL1,			/* one il* instruction */
262   IC_IL2,			/* both ilhu and iohl instructions */
263   IC_IL1s,			/* one il* instruction */
264   IC_IL2s,			/* both ilhu and iohl instructions */
265   IC_FSMBI,			/* the fsmbi instruction */
266   IC_CPAT,			/* one of the c*d instructions */
267   IC_FSMBI2			/* fsmbi plus 1 other instruction */
268 };
269 
270 static enum spu_immediate which_immediate_load (HOST_WIDE_INT val);
271 static enum spu_immediate which_logical_immediate (HOST_WIDE_INT val);
272 static int cpat_info(unsigned char *arr, int size, int *prun, int *pstart);
273 static enum immediate_class classify_immediate (rtx op,
274 						enum machine_mode mode);
275 
276 static enum machine_mode spu_unwind_word_mode (void);
277 
278 static enum machine_mode
279 spu_libgcc_cmp_return_mode (void);
280 
281 static enum machine_mode
282 spu_libgcc_shift_count_mode (void);
283 
284 /* Pointer mode for __ea references.  */
285 #define EAmode (spu_ea_model != 32 ? DImode : SImode)
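/* That is, __ea pointers are 64-bit (DImode) unless the 32-bit __ea
   model was selected (spu_ea_model == 32), in which case they are
   SImode.  */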
286 
287 
288 /*  Table of machine attributes.  */
289 static const struct attribute_spec spu_attribute_table[] =
290 {
291   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */
292   { "naked",          0, 0, true,  false, false, spu_handle_fndecl_attribute },
293   { "spu_vector",     0, 0, false, true,  false, spu_handle_vector_attribute },
294   { NULL,             0, 0, false, false, false, NULL }
295 };
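/* "naked" is only valid on declarations and is validated by
   spu_handle_fndecl_attribute; "spu_vector" applies to types and is
   handled by spu_handle_vector_attribute.  */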
296 
297 /*  TARGET overrides.  */
298 
299 #undef TARGET_ADDR_SPACE_POINTER_MODE
300 #define TARGET_ADDR_SPACE_POINTER_MODE spu_addr_space_pointer_mode
301 
302 #undef TARGET_ADDR_SPACE_ADDRESS_MODE
303 #define TARGET_ADDR_SPACE_ADDRESS_MODE spu_addr_space_address_mode
304 
305 #undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
306 #define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
307   spu_addr_space_legitimate_address_p
308 
309 #undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
310 #define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS spu_addr_space_legitimize_address
311 
312 #undef TARGET_ADDR_SPACE_SUBSET_P
313 #define TARGET_ADDR_SPACE_SUBSET_P spu_addr_space_subset_p
314 
315 #undef TARGET_ADDR_SPACE_CONVERT
316 #define TARGET_ADDR_SPACE_CONVERT spu_addr_space_convert
317 
318 #undef TARGET_INIT_BUILTINS
319 #define TARGET_INIT_BUILTINS spu_init_builtins
320 #undef TARGET_BUILTIN_DECL
321 #define TARGET_BUILTIN_DECL spu_builtin_decl
322 
323 #undef TARGET_EXPAND_BUILTIN
324 #define TARGET_EXPAND_BUILTIN spu_expand_builtin
325 
326 #undef TARGET_UNWIND_WORD_MODE
327 #define TARGET_UNWIND_WORD_MODE spu_unwind_word_mode
328 
329 #undef TARGET_LEGITIMIZE_ADDRESS
330 #define TARGET_LEGITIMIZE_ADDRESS spu_legitimize_address
331 
332 /* The current assembler doesn't like .4byte foo@ppu, so use the normal .long
333    and .quad for the debugger.  When it is known that the assembler is fixed,
334    these can be removed.  */
335 #undef TARGET_ASM_UNALIGNED_SI_OP
336 #define TARGET_ASM_UNALIGNED_SI_OP	"\t.long\t"
337 
338 #undef TARGET_ASM_ALIGNED_DI_OP
339 #define TARGET_ASM_ALIGNED_DI_OP	"\t.quad\t"
340 
341 /* The .8byte directive doesn't seem to work well for a 32 bit
342    architecture. */
343 #undef TARGET_ASM_UNALIGNED_DI_OP
344 #define TARGET_ASM_UNALIGNED_DI_OP NULL
345 
346 #undef TARGET_RTX_COSTS
347 #define TARGET_RTX_COSTS spu_rtx_costs
348 
349 #undef TARGET_ADDRESS_COST
350 #define TARGET_ADDRESS_COST hook_int_rtx_bool_0
351 
352 #undef TARGET_SCHED_ISSUE_RATE
353 #define TARGET_SCHED_ISSUE_RATE spu_sched_issue_rate
354 
355 #undef TARGET_SCHED_INIT_GLOBAL
356 #define TARGET_SCHED_INIT_GLOBAL spu_sched_init_global
357 
358 #undef TARGET_SCHED_INIT
359 #define TARGET_SCHED_INIT spu_sched_init
360 
361 #undef TARGET_SCHED_VARIABLE_ISSUE
362 #define TARGET_SCHED_VARIABLE_ISSUE spu_sched_variable_issue
363 
364 #undef TARGET_SCHED_REORDER
365 #define TARGET_SCHED_REORDER spu_sched_reorder
366 
367 #undef TARGET_SCHED_REORDER2
368 #define TARGET_SCHED_REORDER2 spu_sched_reorder
369 
370 #undef TARGET_SCHED_ADJUST_COST
371 #define TARGET_SCHED_ADJUST_COST spu_sched_adjust_cost
372 
373 #undef  TARGET_ATTRIBUTE_TABLE
374 #define TARGET_ATTRIBUTE_TABLE spu_attribute_table
375 
376 #undef TARGET_ASM_INTEGER
377 #define TARGET_ASM_INTEGER spu_assemble_integer
378 
379 #undef TARGET_SCALAR_MODE_SUPPORTED_P
380 #define TARGET_SCALAR_MODE_SUPPORTED_P	spu_scalar_mode_supported_p
381 
382 #undef TARGET_VECTOR_MODE_SUPPORTED_P
383 #define TARGET_VECTOR_MODE_SUPPORTED_P	spu_vector_mode_supported_p
384 
385 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
386 #define TARGET_FUNCTION_OK_FOR_SIBCALL spu_function_ok_for_sibcall
387 
388 #undef TARGET_ASM_GLOBALIZE_LABEL
389 #define TARGET_ASM_GLOBALIZE_LABEL spu_asm_globalize_label
390 
391 #undef TARGET_PASS_BY_REFERENCE
392 #define TARGET_PASS_BY_REFERENCE spu_pass_by_reference
393 
394 #undef TARGET_MUST_PASS_IN_STACK
395 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
396 
397 #undef TARGET_BUILD_BUILTIN_VA_LIST
398 #define TARGET_BUILD_BUILTIN_VA_LIST spu_build_builtin_va_list
399 
400 #undef TARGET_EXPAND_BUILTIN_VA_START
401 #define TARGET_EXPAND_BUILTIN_VA_START spu_va_start
402 
403 #undef TARGET_SETUP_INCOMING_VARARGS
404 #define TARGET_SETUP_INCOMING_VARARGS spu_setup_incoming_varargs
405 
406 #undef TARGET_MACHINE_DEPENDENT_REORG
407 #define TARGET_MACHINE_DEPENDENT_REORG spu_machine_dependent_reorg
408 
409 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
410 #define TARGET_GIMPLIFY_VA_ARG_EXPR spu_gimplify_va_arg_expr
411 
412 #undef TARGET_DEFAULT_TARGET_FLAGS
413 #define TARGET_DEFAULT_TARGET_FLAGS (TARGET_DEFAULT)
414 
415 #undef TARGET_INIT_LIBFUNCS
416 #define TARGET_INIT_LIBFUNCS spu_init_libfuncs
417 
418 #undef TARGET_RETURN_IN_MEMORY
419 #define TARGET_RETURN_IN_MEMORY spu_return_in_memory
420 
421 #undef  TARGET_ENCODE_SECTION_INFO
422 #define TARGET_ENCODE_SECTION_INFO spu_encode_section_info
423 
424 #undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN
425 #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN spu_builtin_mul_widen_even
426 
427 #undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD
428 #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD spu_builtin_mul_widen_odd
429 
430 #undef TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD
431 #define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD spu_builtin_mask_for_load
432 
433 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
434 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST spu_builtin_vectorization_cost
435 
436 #undef TARGET_VECTOR_ALIGNMENT_REACHABLE
437 #define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable
438 
439 #undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
440 #define TARGET_VECTORIZE_BUILTIN_VEC_PERM spu_builtin_vec_perm
441 
442 #undef TARGET_LIBGCC_CMP_RETURN_MODE
443 #define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode
444 
445 #undef TARGET_LIBGCC_SHIFT_COUNT_MODE
446 #define TARGET_LIBGCC_SHIFT_COUNT_MODE spu_libgcc_shift_count_mode
447 
448 #undef TARGET_SCHED_SMS_RES_MII
449 #define TARGET_SCHED_SMS_RES_MII spu_sms_res_mii
450 
451 #undef TARGET_ASM_FILE_START
452 #define TARGET_ASM_FILE_START asm_file_start
453 
454 #undef TARGET_SECTION_TYPE_FLAGS
455 #define TARGET_SECTION_TYPE_FLAGS spu_section_type_flags
456 
457 #undef TARGET_ASM_SELECT_SECTION
458 #define TARGET_ASM_SELECT_SECTION  spu_select_section
459 
460 #undef TARGET_ASM_UNIQUE_SECTION
461 #define TARGET_ASM_UNIQUE_SECTION  spu_unique_section
462 
463 #undef TARGET_LEGITIMATE_ADDRESS_P
464 #define TARGET_LEGITIMATE_ADDRESS_P spu_legitimate_address_p
465 
466 #undef TARGET_TRAMPOLINE_INIT
467 #define TARGET_TRAMPOLINE_INIT spu_trampoline_init
468 
469 struct gcc_target targetm = TARGET_INITIALIZER;
470 
471 void
472 spu_optimization_options (int level ATTRIBUTE_UNUSED, int size ATTRIBUTE_UNUSED)
473 {
474   /* Override some of the default param values.  With so many registers
475      larger values are better for these params.  */
476   MAX_PENDING_LIST_LENGTH = 128;
477 
478   /* With so many registers this is better on by default. */
479   flag_rename_registers = 1;
480 }
481 
482 /* Sometimes certain combinations of command options do not make sense
483    on a particular target machine.  You can define a macro
484    OVERRIDE_OPTIONS to take account of this. This macro, if defined, is
485    executed once just after all the command options have been parsed.  */
486 void
487 spu_override_options (void)
488 {
489   /* Small loops will be completely peeled (unrolled) at -O3.  For SPU
490      it is more important to keep code small by default.  */
491   if (!flag_unroll_loops && !flag_peel_loops
492       && !PARAM_SET_P (PARAM_MAX_COMPLETELY_PEEL_TIMES))
493     PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES) = 1;
494 
495   flag_omit_frame_pointer = 1;
496 
497   /* Functions must be 8-byte aligned so we correctly handle dual issue.  */
498   if (align_functions < 8)
499     align_functions = 8;
500 
501   spu_hint_dist = 8*4 - spu_max_nops*4;
502   if (spu_hint_dist < 0)
503     spu_hint_dist = 0;
504 
505   if (spu_fixed_range_string)
506     fix_range (spu_fixed_range_string);
507 
508   /* Determine processor architectural level.  */
509   if (spu_arch_string)
510     {
511       if (strcmp (&spu_arch_string[0], "cell") == 0)
512         spu_arch = PROCESSOR_CELL;
513       else if (strcmp (&spu_arch_string[0], "celledp") == 0)
514         spu_arch = PROCESSOR_CELLEDP;
515       else
516         error ("Unknown architecture '%s'", &spu_arch_string[0]);
517     }
518 
519   /* Determine processor to tune for.  */
520   if (spu_tune_string)
521     {
522       if (strcmp (&spu_tune_string[0], "cell") == 0)
523         spu_tune = PROCESSOR_CELL;
524       else if (strcmp (&spu_tune_string[0], "celledp") == 0)
525         spu_tune = PROCESSOR_CELLEDP;
526       else
527         error ("Unknown tune option '%s'", &spu_tune_string[0]);
528     }
529 
530   /* Change defaults according to the processor architecture.  */
531   if (spu_arch == PROCESSOR_CELLEDP)
532     {
533       /* If no command line option has been otherwise specified, change
534 	 the default to -mno-safe-hints on celledp -- only the original
535 	 Cell/B.E. processors require this workaround.  */
536       if (!(target_flags_explicit & MASK_SAFE_HINTS))
537 	target_flags &= ~MASK_SAFE_HINTS;
538     }
539 
540   REAL_MODE_FORMAT (SFmode) = &spu_single_format;
541 }
542 
543 /* Handle an attribute requiring a FUNCTION_DECL; arguments as in
544    struct attribute_spec.handler.  */
545 
546 /* True if MODE is valid for the target.  By "valid", we mean able to
547    be manipulated in non-trivial ways.  In particular, this means all
548    the arithmetic is supported.  */
549 static bool
550 spu_scalar_mode_supported_p (enum machine_mode mode)
551 {
552   switch (mode)
553     {
554     case QImode:
555     case HImode:
556     case SImode:
557     case SFmode:
558     case DImode:
559     case TImode:
560     case DFmode:
561       return true;
562 
563     default:
564       return false;
565     }
566 }
567 
568 /* Similarly for vector modes.  "Supported" here is less strict.  At
569    least some operations are supported; check optabs or builtins
570    for further details.  */
571 static bool
572 spu_vector_mode_supported_p (enum machine_mode mode)
573 {
574   switch (mode)
575     {
576     case V16QImode:
577     case V8HImode:
578     case V4SImode:
579     case V2DImode:
580     case V4SFmode:
581     case V2DFmode:
582       return true;
583 
584     default:
585       return false;
586     }
587 }
588 
589 /* GCC assumes that in a paradoxical SUBREG the inner mode occupies the
590    least significant bytes of the outer mode.  This function returns
591    TRUE for the SUBREGs where this is correct.  */
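/* For example, (subreg:SI (reg:HI ...) 0) and (subreg:V4SI (reg:TI ...) 0)
   satisfy the checks below, while (subreg:DI (reg:SI ...) 0) does not.  */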
592 int
593 valid_subreg (rtx op)
594 {
595   enum machine_mode om = GET_MODE (op);
596   enum machine_mode im = GET_MODE (SUBREG_REG (op));
597   return om != VOIDmode && im != VOIDmode
598     && (GET_MODE_SIZE (im) == GET_MODE_SIZE (om)
599 	|| (GET_MODE_SIZE (im) <= 4 && GET_MODE_SIZE (om) <= 4)
600 	|| (GET_MODE_SIZE (im) >= 16 && GET_MODE_SIZE (om) >= 16));
601 }
602 
603 /* When insv and ext[sz]v are passed a TI SUBREG, we want to strip it off
604    and adjust the start offset.  */
605 static rtx
606 adjust_operand (rtx op, HOST_WIDE_INT * start)
607 {
608   enum machine_mode mode;
609   int op_size;
610   /* Strip any paradoxical SUBREG.  */
611   if (GET_CODE (op) == SUBREG
612       && (GET_MODE_BITSIZE (GET_MODE (op))
613 	  > GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op)))))
614     {
615       if (start)
616 	*start -=
617 	  GET_MODE_BITSIZE (GET_MODE (op)) -
618 	  GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op)));
619       op = SUBREG_REG (op);
620     }
621   /* If it is smaller than SI, widen it to SI (via the SUBREG added below).  */
622   op_size = GET_MODE_BITSIZE (GET_MODE (op));
623   if (op_size < 32)
624     {
625       if (start)
626 	*start += 32 - op_size;
627       op_size = 32;
628     }
629   /* If it is not a MODE_INT (and/or it is smaller than SI) add a SUBREG. */
630   mode = mode_for_size (op_size, MODE_INT, 0);
631   if (mode != GET_MODE (op))
632     op = gen_rtx_SUBREG (mode, op, 0);
633   return op;
634 }
635 
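/* Expand a bit-field extract: place WIDTH bits of operand 1, starting at
   bit START, into the TImode register operand 0.  The source may be a
   MEM (loaded as one or two TImode quadwords depending on alignment), a
   SUBREG of an integer register, or a TImode register.  The field is
   rotated to the top of a TImode value and then shifted right, with the
   shift kind chosen by UNSIGNEDP.  */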
636 void
637 spu_expand_extv (rtx ops[], int unsignedp)
638 {
639   rtx dst = ops[0], src = ops[1];
640   HOST_WIDE_INT width = INTVAL (ops[2]);
641   HOST_WIDE_INT start = INTVAL (ops[3]);
642   HOST_WIDE_INT align_mask;
643   rtx s0, s1, mask, r0;
644 
645   gcc_assert (REG_P (dst) && GET_MODE (dst) == TImode);
646 
647   if (MEM_P (src))
648     {
649       /* First, determine if we need 1 TImode load or 2.  We need only 1
650          if the bits being extracted do not cross the alignment boundary
651          as determined by the MEM and its address. */
652 
653       align_mask = -MEM_ALIGN (src);
654       if ((start & align_mask) == ((start + width - 1) & align_mask))
655 	{
656 	  /* Alignment is sufficient for 1 load. */
657 	  s0 = gen_reg_rtx (TImode);
658 	  r0 = spu_expand_load (s0, 0, src, start / 8);
659 	  start &= 7;
660 	  if (r0)
661 	    emit_insn (gen_rotqby_ti (s0, s0, r0));
662 	}
663       else
664 	{
665 	  /* Need 2 loads. */
666 	  s0 = gen_reg_rtx (TImode);
667 	  s1 = gen_reg_rtx (TImode);
668 	  r0 = spu_expand_load (s0, s1, src, start / 8);
669 	  start &= 7;
670 
671 	  gcc_assert (start + width <= 128);
672 	  if (r0)
673 	    {
674 	      rtx r1 = gen_reg_rtx (SImode);
675 	      mask = gen_reg_rtx (TImode);
676 	      emit_move_insn (mask, GEN_INT (-1));
677 	      emit_insn (gen_rotqby_ti (s0, s0, r0));
678 	      emit_insn (gen_rotqby_ti (s1, s1, r0));
679 	      if (GET_CODE (r0) == CONST_INT)
680 		r1 = GEN_INT (INTVAL (r0) & 15);
681 	      else
682 		emit_insn (gen_andsi3 (r1, r0, GEN_INT (15)));
683 	      emit_insn (gen_shlqby_ti (mask, mask, r1));
684 	      emit_insn (gen_selb (s0, s1, s0, mask));
685 	    }
686 	}
687 
688     }
689   else if (GET_CODE (src) == SUBREG)
690     {
691       rtx r = SUBREG_REG (src);
692       gcc_assert (REG_P (r) && SCALAR_INT_MODE_P (GET_MODE (r)));
693       s0 = gen_reg_rtx (TImode);
694       if (GET_MODE_SIZE (GET_MODE (r)) < GET_MODE_SIZE (TImode))
695 	emit_insn (gen_rtx_SET (VOIDmode, s0, gen_rtx_ZERO_EXTEND (TImode, r)));
696       else
697 	emit_move_insn (s0, src);
698     }
699   else
700     {
701       gcc_assert (REG_P (src) && GET_MODE (src) == TImode);
702       s0 = gen_reg_rtx (TImode);
703       emit_move_insn (s0, src);
704     }
705 
706   /* Now s0 is TImode and contains the bits to extract at start. */
707 
708   if (start)
709     emit_insn (gen_rotlti3 (s0, s0, GEN_INT (start)));
710 
711   if (128 - width)
712     {
713       tree c = build_int_cst (NULL_TREE, 128 - width);
714       s0 = expand_shift (RSHIFT_EXPR, TImode, s0, c, s0, unsignedp);
715     }
716 
717   emit_move_insn (dst, s0);
718 }
719 
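/* Expand a bit-field insert: store the low WIDTH bits of operand 3 into
   operand 0 starting at bit START.  A mask covering the field is built,
   the source is shifted into position, and selb merges it into the
   destination; MEM destinations are rewritten as one or two TImode
   read-modify-write sequences depending on alignment.  */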
720 void
721 spu_expand_insv (rtx ops[])
722 {
723   HOST_WIDE_INT width = INTVAL (ops[1]);
724   HOST_WIDE_INT start = INTVAL (ops[2]);
725   HOST_WIDE_INT maskbits;
726   enum machine_mode dst_mode, src_mode;
727   rtx dst = ops[0], src = ops[3];
728   int dst_size, src_size;
729   rtx mask;
730   rtx shift_reg;
731   int shift;
732 
733 
734   if (GET_CODE (ops[0]) == MEM)
735     dst = gen_reg_rtx (TImode);
736   else
737     dst = adjust_operand (dst, &start);
738   dst_mode = GET_MODE (dst);
739   dst_size = GET_MODE_BITSIZE (GET_MODE (dst));
740 
741   if (CONSTANT_P (src))
742     {
743       enum machine_mode m =
744 	(width <= 32 ? SImode : width <= 64 ? DImode : TImode);
745       src = force_reg (m, convert_to_mode (m, src, 0));
746     }
747   src = adjust_operand (src, 0);
748   src_mode = GET_MODE (src);
749   src_size = GET_MODE_BITSIZE (GET_MODE (src));
750 
751   mask = gen_reg_rtx (dst_mode);
752   shift_reg = gen_reg_rtx (dst_mode);
753   shift = dst_size - start - width;
754 
755   /* It's not safe to use subreg here because the compiler assumes
756      that the SUBREG_REG is right justified in the SUBREG. */
757   convert_move (shift_reg, src, 1);
758 
759   if (shift > 0)
760     {
761       switch (dst_mode)
762 	{
763 	case SImode:
764 	  emit_insn (gen_ashlsi3 (shift_reg, shift_reg, GEN_INT (shift)));
765 	  break;
766 	case DImode:
767 	  emit_insn (gen_ashldi3 (shift_reg, shift_reg, GEN_INT (shift)));
768 	  break;
769 	case TImode:
770 	  emit_insn (gen_ashlti3 (shift_reg, shift_reg, GEN_INT (shift)));
771 	  break;
772 	default:
773 	  abort ();
774 	}
775     }
776   else if (shift < 0)
777     abort ();
778 
779   switch (dst_size)
780     {
781     case 32:
782       maskbits = (-1ll << (32 - width - start));
783       if (start)
784 	maskbits += (1ll << (32 - start));
785       emit_move_insn (mask, GEN_INT (maskbits));
786       break;
787     case 64:
788       maskbits = (-1ll << (64 - width - start));
789       if (start)
790 	maskbits += (1ll << (64 - start));
791       emit_move_insn (mask, GEN_INT (maskbits));
792       break;
793     case 128:
794       {
795 	unsigned char arr[16];
796 	int i = start / 8;
797 	memset (arr, 0, sizeof (arr));
798 	arr[i] = 0xff >> (start & 7);
799 	for (i++; i <= (start + width - 1) / 8; i++)
800 	  arr[i] = 0xff;
801 	arr[i - 1] &= 0xff << (7 - ((start + width - 1) & 7));
802 	emit_move_insn (mask, array_to_constant (TImode, arr));
803       }
804       break;
805     default:
806       abort ();
807     }
808   if (GET_CODE (ops[0]) == MEM)
809     {
810       rtx low = gen_reg_rtx (SImode);
811       rtx rotl = gen_reg_rtx (SImode);
812       rtx mask0 = gen_reg_rtx (TImode);
813       rtx addr;
814       rtx addr0;
815       rtx addr1;
816       rtx mem;
817 
818       addr = force_reg (Pmode, XEXP (ops[0], 0));
819       addr0 = gen_rtx_AND (Pmode, addr, GEN_INT (-16));
820       emit_insn (gen_andsi3 (low, addr, GEN_INT (15)));
821       emit_insn (gen_negsi2 (rotl, low));
822       emit_insn (gen_rotqby_ti (shift_reg, shift_reg, rotl));
823       emit_insn (gen_rotqmby_ti (mask0, mask, rotl));
824       mem = change_address (ops[0], TImode, addr0);
825       set_mem_alias_set (mem, 0);
826       emit_move_insn (dst, mem);
827       emit_insn (gen_selb (dst, dst, shift_reg, mask0));
828       if (start + width > MEM_ALIGN (ops[0]))
829 	{
830 	  rtx shl = gen_reg_rtx (SImode);
831 	  rtx mask1 = gen_reg_rtx (TImode);
832 	  rtx dst1 = gen_reg_rtx (TImode);
833 	  rtx mem1;
834 	  addr1 = plus_constant (addr, 16);
835 	  addr1 = gen_rtx_AND (Pmode, addr1, GEN_INT (-16));
836 	  emit_insn (gen_subsi3 (shl, GEN_INT (16), low));
837 	  emit_insn (gen_shlqby_ti (mask1, mask, shl));
838 	  mem1 = change_address (ops[0], TImode, addr1);
839 	  set_mem_alias_set (mem1, 0);
840 	  emit_move_insn (dst1, mem1);
841 	  emit_insn (gen_selb (dst1, dst1, shift_reg, mask1));
842 	  emit_move_insn (mem1, dst1);
843 	}
844       emit_move_insn (mem, dst);
845     }
846   else
847     emit_insn (gen_selb (dst, copy_rtx (dst), shift_reg, mask));
848 }
849 
850 
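/* Expand a constant-length block copy: ops[0] = destination MEM,
   ops[1] = source MEM, ops[2] = byte count, ops[3] = alignment.  Only
   small constant sizes with 16-byte alignment are expanded here, using
   V16QImode moves for whole quadwords and a selb merge for the tail.
   Returns 1 when the copy was expanded, 0 to let the caller fall back
   to the default expansion.  */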
851 int
852 spu_expand_block_move (rtx ops[])
853 {
854   HOST_WIDE_INT bytes, align, offset;
855   rtx src, dst, sreg, dreg, target;
856   int i;
857   if (GET_CODE (ops[2]) != CONST_INT
858       || GET_CODE (ops[3]) != CONST_INT
859       || INTVAL (ops[2]) > (HOST_WIDE_INT) (MOVE_RATIO (optimize_insn_for_speed_p ()) * 8))
860     return 0;
861 
862   bytes = INTVAL (ops[2]);
863   align = INTVAL (ops[3]);
864 
865   if (bytes <= 0)
866     return 1;
867 
868   dst = ops[0];
869   src = ops[1];
870 
871   if (align == 16)
872     {
873       for (offset = 0; offset + 16 <= bytes; offset += 16)
874 	{
875 	  dst = adjust_address (ops[0], V16QImode, offset);
876 	  src = adjust_address (ops[1], V16QImode, offset);
877 	  emit_move_insn (dst, src);
878 	}
879       if (offset < bytes)
880 	{
881 	  rtx mask;
882 	  unsigned char arr[16] = { 0 };
883 	  for (i = 0; i < bytes - offset; i++)
884 	    arr[i] = 0xff;
885 	  dst = adjust_address (ops[0], V16QImode, offset);
886 	  src = adjust_address (ops[1], V16QImode, offset);
887 	  mask = gen_reg_rtx (V16QImode);
888 	  sreg = gen_reg_rtx (V16QImode);
889 	  dreg = gen_reg_rtx (V16QImode);
890 	  target = gen_reg_rtx (V16QImode);
891 	  emit_move_insn (mask, array_to_constant (V16QImode, arr));
892 	  emit_move_insn (dreg, dst);
893 	  emit_move_insn (sreg, src);
894 	  emit_insn (gen_selb (target, dreg, sreg, mask));
895 	  emit_move_insn (dst, target);
896 	}
897       return 1;
898     }
899   return 0;
900 }
901 
902 enum spu_comp_code
903 { SPU_EQ, SPU_GT, SPU_GTU };
904 
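/* insn_codes for the ceq/cgt/clgt patterns, indexed first by operand
   mode (QI, HI, SI, DI, TI, SF, DF, V16QI, V8HI, V4SI, V4SF, V2DF, the
   same order as the mode switch in spu_emit_branch_or_set) and then by
   spu_comp_code.  A zero entry means no such comparison exists for that
   mode.  */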
905 int spu_comp_icode[12][3] = {
906  {CODE_FOR_ceq_qi, CODE_FOR_cgt_qi, CODE_FOR_clgt_qi},
907  {CODE_FOR_ceq_hi, CODE_FOR_cgt_hi, CODE_FOR_clgt_hi},
908  {CODE_FOR_ceq_si, CODE_FOR_cgt_si, CODE_FOR_clgt_si},
909  {CODE_FOR_ceq_di, CODE_FOR_cgt_di, CODE_FOR_clgt_di},
910  {CODE_FOR_ceq_ti, CODE_FOR_cgt_ti, CODE_FOR_clgt_ti},
911  {CODE_FOR_ceq_sf, CODE_FOR_cgt_sf, 0},
912  {CODE_FOR_ceq_df, CODE_FOR_cgt_df, 0},
913  {CODE_FOR_ceq_v16qi, CODE_FOR_cgt_v16qi, CODE_FOR_clgt_v16qi},
914  {CODE_FOR_ceq_v8hi,  CODE_FOR_cgt_v8hi,  CODE_FOR_clgt_v8hi},
915  {CODE_FOR_ceq_v4si,  CODE_FOR_cgt_v4si,  CODE_FOR_clgt_v4si},
916  {CODE_FOR_ceq_v4sf,  CODE_FOR_cgt_v4sf, 0},
917  {CODE_FOR_ceq_v2df,  CODE_FOR_cgt_v2df, 0},
918 };
919 
920 /* Generate a compare for CODE and emit insns that use the result: a
921    branch, a register set, or a select, depending on IS_SET.  GCC can
922    figure this out too if we don't provide all variations of compares,
923    but since GCC always wants to use WORD_MODE, we can generate better
924    code in most cases if we do it ourselves.  */
925 void
926 spu_emit_branch_or_set (int is_set, rtx cmp, rtx operands[])
927 {
928   int reverse_compare = 0;
929   int reverse_test = 0;
930   rtx compare_result, eq_result;
931   rtx comp_rtx, eq_rtx;
932   enum machine_mode comp_mode;
933   enum machine_mode op_mode;
934   enum spu_comp_code scode, eq_code;
935   enum insn_code ior_code;
936   enum rtx_code code = GET_CODE (cmp);
937   rtx op0 = XEXP (cmp, 0);
938   rtx op1 = XEXP (cmp, 1);
939   int index;
940   int eq_test = 0;
941 
942   /* When op1 is a CONST_INT change (X >= C) to (X > C-1),
943      and so on, to keep the constant in operand 1. */
944   if (GET_CODE (op1) == CONST_INT)
945     {
946       HOST_WIDE_INT val = INTVAL (op1) - 1;
947       if (trunc_int_for_mode (val, GET_MODE (op0)) == val)
948 	switch (code)
949 	  {
950 	  case GE:
951 	    op1 = GEN_INT (val);
952 	    code = GT;
953 	    break;
954 	  case LT:
955 	    op1 = GEN_INT (val);
956 	    code = LE;
957 	    break;
958 	  case GEU:
959 	    op1 = GEN_INT (val);
960 	    code = GTU;
961 	    break;
962 	  case LTU:
963 	    op1 = GEN_INT (val);
964 	    code = LEU;
965 	    break;
966 	  default:
967 	    break;
968 	  }
969     }
970 
971   comp_mode = SImode;
972   op_mode = GET_MODE (op0);
973 
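  /* The SPU only provides eq, gt and gtu compares, so map every rtx
     code onto one of those, possibly swapping the operands
     (reverse_compare) and/or inverting the result (reverse_test).  For
     ordered GE/LE on floating point an additional eq compare is ORed in
     (eq_test).  */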
974   switch (code)
975     {
976     case GE:
977       scode = SPU_GT;
978       if (HONOR_NANS (op_mode))
979 	{
980 	  reverse_compare = 0;
981 	  reverse_test = 0;
982 	  eq_test = 1;
983 	  eq_code = SPU_EQ;
984 	}
985       else
986 	{
987 	  reverse_compare = 1;
988 	  reverse_test = 1;
989 	}
990       break;
991     case LE:
992       scode = SPU_GT;
993       if (HONOR_NANS (op_mode))
994 	{
995 	  reverse_compare = 1;
996 	  reverse_test = 0;
997 	  eq_test = 1;
998 	  eq_code = SPU_EQ;
999 	}
1000       else
1001 	{
1002 	  reverse_compare = 0;
1003 	  reverse_test = 1;
1004 	}
1005       break;
1006     case LT:
1007       reverse_compare = 1;
1008       reverse_test = 0;
1009       scode = SPU_GT;
1010       break;
1011     case GEU:
1012       reverse_compare = 1;
1013       reverse_test = 1;
1014       scode = SPU_GTU;
1015       break;
1016     case LEU:
1017       reverse_compare = 0;
1018       reverse_test = 1;
1019       scode = SPU_GTU;
1020       break;
1021     case LTU:
1022       reverse_compare = 1;
1023       reverse_test = 0;
1024       scode = SPU_GTU;
1025       break;
1026     case NE:
1027       reverse_compare = 0;
1028       reverse_test = 1;
1029       scode = SPU_EQ;
1030       break;
1031 
1032     case EQ:
1033       scode = SPU_EQ;
1034       break;
1035     case GT:
1036       scode = SPU_GT;
1037       break;
1038     case GTU:
1039       scode = SPU_GTU;
1040       break;
1041     default:
1042       scode = SPU_EQ;
1043       break;
1044     }
1045 
1046   switch (op_mode)
1047     {
1048     case QImode:
1049       index = 0;
1050       comp_mode = QImode;
1051       break;
1052     case HImode:
1053       index = 1;
1054       comp_mode = HImode;
1055       break;
1056     case SImode:
1057       index = 2;
1058       break;
1059     case DImode:
1060       index = 3;
1061       break;
1062     case TImode:
1063       index = 4;
1064       break;
1065     case SFmode:
1066       index = 5;
1067       break;
1068     case DFmode:
1069       index = 6;
1070       break;
1071     case V16QImode:
1072       index = 7;
1073       comp_mode = op_mode;
1074       break;
1075     case V8HImode:
1076       index = 8;
1077       comp_mode = op_mode;
1078       break;
1079     case V4SImode:
1080       index = 9;
1081       comp_mode = op_mode;
1082       break;
1083     case V4SFmode:
1084       index = 10;
1085       comp_mode = V4SImode;
1086       break;
1087     case V2DFmode:
1088       index = 11;
1089       comp_mode = V2DImode;
1090       break;
1091     case V2DImode:
1092     default:
1093       abort ();
1094     }
1095 
1096   if (GET_MODE (op1) == DFmode
1097       && (scode != SPU_GT && scode != SPU_EQ))
1098     abort ();
1099 
1100   if (is_set == 0 && op1 == const0_rtx
1101       && (GET_MODE (op0) == SImode
1102 	  || GET_MODE (op0) == HImode) && scode == SPU_EQ)
1103     {
1104       /* Don't need to set a register with the result when we are
1105          comparing against zero and branching. */
1106       reverse_test = !reverse_test;
1107       compare_result = op0;
1108     }
1109   else
1110     {
1111       compare_result = gen_reg_rtx (comp_mode);
1112 
1113       if (reverse_compare)
1114 	{
1115 	  rtx t = op1;
1116 	  op1 = op0;
1117 	  op0 = t;
1118 	}
1119 
1120       if (spu_comp_icode[index][scode] == 0)
1121 	abort ();
1122 
1123       if (!(*insn_data[spu_comp_icode[index][scode]].operand[1].predicate)
1124 	  (op0, op_mode))
1125 	op0 = force_reg (op_mode, op0);
1126       if (!(*insn_data[spu_comp_icode[index][scode]].operand[2].predicate)
1127 	  (op1, op_mode))
1128 	op1 = force_reg (op_mode, op1);
1129       comp_rtx = GEN_FCN (spu_comp_icode[index][scode]) (compare_result,
1130 							 op0, op1);
1131       if (comp_rtx == 0)
1132 	abort ();
1133       emit_insn (comp_rtx);
1134 
1135       if (eq_test)
1136         {
1137           eq_result = gen_reg_rtx (comp_mode);
1138           eq_rtx = GEN_FCN (spu_comp_icode[index][eq_code]) (eq_result,
1139 							     op0, op1);
1140           if (eq_rtx == 0)
1141 	    abort ();
1142           emit_insn (eq_rtx);
1143           ior_code = ior_optab->handlers[(int)comp_mode].insn_code;
1144           gcc_assert (ior_code != CODE_FOR_nothing);
1145           emit_insn (GEN_FCN (ior_code)
1146 		     (compare_result, compare_result, eq_result));
1147         }
1148     }
1149 
1150   if (is_set == 0)
1151     {
1152       rtx bcomp;
1153       rtx loc_ref;
1154 
1155       /* We don't have branch on QI compare insns, so we convert the
1156          QI compare result to a HI result. */
1157       if (comp_mode == QImode)
1158 	{
1159 	  rtx old_res = compare_result;
1160 	  compare_result = gen_reg_rtx (HImode);
1161 	  comp_mode = HImode;
1162 	  emit_insn (gen_extendqihi2 (compare_result, old_res));
1163 	}
1164 
1165       if (reverse_test)
1166 	bcomp = gen_rtx_EQ (comp_mode, compare_result, const0_rtx);
1167       else
1168 	bcomp = gen_rtx_NE (comp_mode, compare_result, const0_rtx);
1169 
1170       loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[3]);
1171       emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
1172 				   gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
1173 							 loc_ref, pc_rtx)));
1174     }
1175   else if (is_set == 2)
1176     {
1177       rtx target = operands[0];
1178       int compare_size = GET_MODE_BITSIZE (comp_mode);
1179       int target_size = GET_MODE_BITSIZE (GET_MODE (target));
1180       enum machine_mode mode = mode_for_size (target_size, MODE_INT, 0);
1181       rtx select_mask;
1182       rtx op_t = operands[2];
1183       rtx op_f = operands[3];
1184 
1185       /* The result of the comparison can be SI, HI or QI mode.  Create a
1186          mask based on that result. */
1187       if (target_size > compare_size)
1188 	{
1189 	  select_mask = gen_reg_rtx (mode);
1190 	  emit_insn (gen_extend_compare (select_mask, compare_result));
1191 	}
1192       else if (target_size < compare_size)
1193 	select_mask =
1194 	  gen_rtx_SUBREG (mode, compare_result,
1195 			  (compare_size - target_size) / BITS_PER_UNIT);
1196       else if (comp_mode != mode)
1197 	select_mask = gen_rtx_SUBREG (mode, compare_result, 0);
1198       else
1199 	select_mask = compare_result;
1200 
1201       if (GET_MODE (target) != GET_MODE (op_t)
1202 	  || GET_MODE (target) != GET_MODE (op_f))
1203 	abort ();
1204 
1205       if (reverse_test)
1206 	emit_insn (gen_selb (target, op_t, op_f, select_mask));
1207       else
1208 	emit_insn (gen_selb (target, op_f, op_t, select_mask));
1209     }
1210   else
1211     {
1212       rtx target = operands[0];
1213       if (reverse_test)
1214 	emit_insn (gen_rtx_SET (VOIDmode, compare_result,
1215 				gen_rtx_NOT (comp_mode, compare_result)));
1216       if (GET_MODE (target) == SImode && GET_MODE (compare_result) == HImode)
1217 	emit_insn (gen_extendhisi2 (target, compare_result));
1218       else if (GET_MODE (target) == SImode
1219 	       && GET_MODE (compare_result) == QImode)
1220 	emit_insn (gen_extend_compare (target, compare_result));
1221       else
1222 	emit_move_insn (target, compare_result);
1223     }
1224 }
1225 
1226 HOST_WIDE_INT
1227 const_double_to_hwint (rtx x)
1228 {
1229   HOST_WIDE_INT val;
1230   REAL_VALUE_TYPE rv;
1231   if (GET_MODE (x) == SFmode)
1232     {
1233       REAL_VALUE_FROM_CONST_DOUBLE (rv, x);
1234       REAL_VALUE_TO_TARGET_SINGLE (rv, val);
1235     }
1236   else if (GET_MODE (x) == DFmode)
1237     {
1238       long l[2];
1239       REAL_VALUE_FROM_CONST_DOUBLE (rv, x);
1240       REAL_VALUE_TO_TARGET_DOUBLE (rv, l);
1241       val = l[0];
1242       val = (val << 32) | (l[1] & 0xffffffff);
1243     }
1244   else
1245     abort ();
1246   return val;
1247 }
1248 
1249 rtx
1250 hwint_to_const_double (enum machine_mode mode, HOST_WIDE_INT v)
1251 {
1252   long tv[2];
1253   REAL_VALUE_TYPE rv;
1254   gcc_assert (mode == SFmode || mode == DFmode);
1255 
1256   if (mode == SFmode)
1257     tv[0] = (v << 32) >> 32;
1258   else if (mode == DFmode)
1259     {
1260       tv[1] = (v << 32) >> 32;
1261       tv[0] = v >> 32;
1262     }
1263   real_from_target (&rv, tv, mode);
1264   return CONST_DOUBLE_FROM_REAL_VALUE (rv, mode);
1265 }
1266 
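/* Output assembly for address ADDR to FILE.  An (and ... -16) wrapper,
   which the port uses to force 16-byte alignment of loads and stores,
   is stripped before printing.  */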
1267 void
1268 print_operand_address (FILE * file, register rtx addr)
1269 {
1270   rtx reg;
1271   rtx offset;
1272 
1273   if (GET_CODE (addr) == AND
1274       && GET_CODE (XEXP (addr, 1)) == CONST_INT
1275       && INTVAL (XEXP (addr, 1)) == -16)
1276     addr = XEXP (addr, 0);
1277 
1278   switch (GET_CODE (addr))
1279     {
1280     case REG:
1281       fprintf (file, "0(%s)", reg_names[REGNO (addr)]);
1282       break;
1283 
1284     case PLUS:
1285       reg = XEXP (addr, 0);
1286       offset = XEXP (addr, 1);
1287       if (GET_CODE (offset) == REG)
1288 	{
1289 	  fprintf (file, "%s,%s", reg_names[REGNO (reg)],
1290 		   reg_names[REGNO (offset)]);
1291 	}
1292       else if (GET_CODE (offset) == CONST_INT)
1293 	{
1294 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC "(%s)",
1295 		   INTVAL (offset), reg_names[REGNO (reg)]);
1296 	}
1297       else
1298 	abort ();
1299       break;
1300 
1301     case CONST:
1302     case LABEL_REF:
1303     case SYMBOL_REF:
1304     case CONST_INT:
1305       output_addr_const (file, addr);
1306       break;
1307 
1308     default:
1309       debug_rtx (addr);
1310       abort ();
1311     }
1312 }
1313 
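/* Output operand X to FILE.  CODE is an SPU-specific output modifier
   letter (0 prints the operand itself); the meaning of each letter is
   described case by case below.  */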
1314 void
1315 print_operand (FILE * file, rtx x, int code)
1316 {
1317   enum machine_mode mode = GET_MODE (x);
1318   HOST_WIDE_INT val;
1319   unsigned char arr[16];
1320   int xcode = GET_CODE (x);
1321   int i, info;
1322   if (GET_MODE (x) == VOIDmode)
1323     switch (code)
1324       {
1325       case 'L':			/* 128 bits, signed */
1326       case 'm':			/* 128 bits, signed */
1327       case 'T':			/* 128 bits, signed */
1328       case 't':			/* 128 bits, signed */
1329 	mode = TImode;
1330 	break;
1331       case 'K':			/* 64 bits, signed */
1332       case 'k':			/* 64 bits, signed */
1333       case 'D':			/* 64 bits, signed */
1334       case 'd':			/* 64 bits, signed */
1335 	mode = DImode;
1336 	break;
1337       case 'J':			/* 32 bits, signed */
1338       case 'j':			/* 32 bits, signed */
1339       case 's':			/* 32 bits, signed */
1340       case 'S':			/* 32 bits, signed */
1341 	mode = SImode;
1342 	break;
1343       }
1344   switch (code)
1345     {
1346 
1347     case 'j':			/* 32 bits, signed */
1348     case 'k':			/* 64 bits, signed */
1349     case 'm':			/* 128 bits, signed */
1350       if (xcode == CONST_INT
1351 	  || xcode == CONST_DOUBLE || xcode == CONST_VECTOR)
1352 	{
1353 	  gcc_assert (logical_immediate_p (x, mode));
1354 	  constant_to_array (mode, x, arr);
1355 	  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1356 	  val = trunc_int_for_mode (val, SImode);
1357 	  switch (which_logical_immediate (val))
1358 	  {
1359 	  case SPU_ORI:
1360 	    break;
1361 	  case SPU_ORHI:
1362 	    fprintf (file, "h");
1363 	    break;
1364 	  case SPU_ORBI:
1365 	    fprintf (file, "b");
1366 	    break;
1367 	  default:
1368 	    gcc_unreachable();
1369 	  }
1370 	}
1371       else
1372 	gcc_unreachable();
1373       return;
1374 
1375     case 'J':			/* 32 bits, signed */
1376     case 'K':			/* 64 bits, signed */
1377     case 'L':			/* 128 bits, signed */
1378       if (xcode == CONST_INT
1379 	  || xcode == CONST_DOUBLE || xcode == CONST_VECTOR)
1380 	{
1381 	  gcc_assert (logical_immediate_p (x, mode)
1382 		      || iohl_immediate_p (x, mode));
1383 	  constant_to_array (mode, x, arr);
1384 	  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1385 	  val = trunc_int_for_mode (val, SImode);
1386 	  switch (which_logical_immediate (val))
1387 	  {
1388 	  case SPU_ORI:
1389 	  case SPU_IOHL:
1390 	    break;
1391 	  case SPU_ORHI:
1392 	    val = trunc_int_for_mode (val, HImode);
1393 	    break;
1394 	  case SPU_ORBI:
1395 	    val = trunc_int_for_mode (val, QImode);
1396 	    break;
1397 	  default:
1398 	    gcc_unreachable();
1399 	  }
1400 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
1401 	}
1402       else
1403 	gcc_unreachable();
1404       return;
1405 
1406     case 't':			/* 128 bits, signed */
1407     case 'd':			/* 64 bits, signed */
1408     case 's':			/* 32 bits, signed */
1409       if (CONSTANT_P (x))
1410 	{
1411 	  enum immediate_class c = classify_immediate (x, mode);
1412 	  switch (c)
1413 	    {
1414 	    case IC_IL1:
1415 	      constant_to_array (mode, x, arr);
1416 	      val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1417 	      val = trunc_int_for_mode (val, SImode);
1418 	      switch (which_immediate_load (val))
1419 		{
1420 		case SPU_IL:
1421 		  break;
1422 		case SPU_ILA:
1423 		  fprintf (file, "a");
1424 		  break;
1425 		case SPU_ILH:
1426 		  fprintf (file, "h");
1427 		  break;
1428 		case SPU_ILHU:
1429 		  fprintf (file, "hu");
1430 		  break;
1431 		default:
1432 		  gcc_unreachable ();
1433 		}
1434 	      break;
1435 	    case IC_CPAT:
1436 	      constant_to_array (mode, x, arr);
1437 	      cpat_info (arr, GET_MODE_SIZE (mode), &info, 0);
1438 	      if (info == 1)
1439 		fprintf (file, "b");
1440 	      else if (info == 2)
1441 		fprintf (file, "h");
1442 	      else if (info == 4)
1443 		fprintf (file, "w");
1444 	      else if (info == 8)
1445 		fprintf (file, "d");
1446 	      break;
1447 	    case IC_IL1s:
1448 	      if (xcode == CONST_VECTOR)
1449 		{
1450 		  x = CONST_VECTOR_ELT (x, 0);
1451 		  xcode = GET_CODE (x);
1452 		}
1453 	      if (xcode == SYMBOL_REF || xcode == LABEL_REF || xcode == CONST)
1454 		fprintf (file, "a");
1455 	      else if (xcode == HIGH)
1456 		fprintf (file, "hu");
1457 	      break;
1458 	    case IC_FSMBI:
1459 	    case IC_FSMBI2:
1460 	    case IC_IL2:
1461 	    case IC_IL2s:
1462 	    case IC_POOL:
1463 	      abort ();
1464 	    }
1465 	}
1466       else
1467 	gcc_unreachable ();
1468       return;
1469 
1470     case 'T':			/* 128 bits, signed */
1471     case 'D':			/* 64 bits, signed */
1472     case 'S':			/* 32 bits, signed */
1473       if (CONSTANT_P (x))
1474 	{
1475 	  enum immediate_class c = classify_immediate (x, mode);
1476 	  switch (c)
1477 	    {
1478 	    case IC_IL1:
1479 	      constant_to_array (mode, x, arr);
1480 	      val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1481 	      val = trunc_int_for_mode (val, SImode);
1482 	      switch (which_immediate_load (val))
1483 		{
1484 		case SPU_IL:
1485 		case SPU_ILA:
1486 		  break;
1487 		case SPU_ILH:
1488 		case SPU_ILHU:
1489 		  val = trunc_int_for_mode (((arr[0] << 8) | arr[1]), HImode);
1490 		  break;
1491 		default:
1492 		  gcc_unreachable ();
1493 		}
1494 	      fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
1495 	      break;
1496 	    case IC_FSMBI:
1497 	      constant_to_array (mode, x, arr);
1498 	      val = 0;
1499 	      for (i = 0; i < 16; i++)
1500 		{
1501 		  val <<= 1;
1502 		  val |= arr[i] & 1;
1503 		}
1504 	      print_operand (file, GEN_INT (val), 0);
1505 	      break;
1506 	    case IC_CPAT:
1507 	      constant_to_array (mode, x, arr);
1508 	      cpat_info (arr, GET_MODE_SIZE (mode), 0, &info);
1509 	      fprintf (file, HOST_WIDE_INT_PRINT_DEC, (HOST_WIDE_INT)info);
1510 	      break;
1511 	    case IC_IL1s:
1512 	      if (xcode == HIGH)
1513 		x = XEXP (x, 0);
1514 	      if (GET_CODE (x) == CONST_VECTOR)
1515 		x = CONST_VECTOR_ELT (x, 0);
1516 	      output_addr_const (file, x);
1517 	      if (xcode == HIGH)
1518 		fprintf (file, "@h");
1519 	      break;
1520 	    case IC_IL2:
1521 	    case IC_IL2s:
1522 	    case IC_FSMBI2:
1523 	    case IC_POOL:
1524 	      abort ();
1525 	    }
1526 	}
1527       else
1528 	gcc_unreachable ();
1529       return;
1530 
1531     case 'C':
1532       if (xcode == CONST_INT)
1533 	{
1534 	  /* Only the 4 least significant bits are relevant for generating
1535 	     control word instructions. */
1536 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x) & 15);
1537 	  return;
1538 	}
1539       break;
1540 
1541     case 'M':			/* print code for c*d */
1542       if (GET_CODE (x) == CONST_INT)
1543 	switch (INTVAL (x))
1544 	  {
1545 	  case 1:
1546 	    fprintf (file, "b");
1547 	    break;
1548 	  case 2:
1549 	    fprintf (file, "h");
1550 	    break;
1551 	  case 4:
1552 	    fprintf (file, "w");
1553 	    break;
1554 	  case 8:
1555 	    fprintf (file, "d");
1556 	    break;
1557 	  default:
1558 	    gcc_unreachable();
1559 	  }
1560       else
1561 	gcc_unreachable();
1562       return;
1563 
1564     case 'N':			/* Negate the operand */
1565       if (xcode == CONST_INT)
1566 	fprintf (file, HOST_WIDE_INT_PRINT_DEC, -INTVAL (x));
1567       else if (xcode == CONST_VECTOR)
1568 	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1569 		 -INTVAL (CONST_VECTOR_ELT (x, 0)));
1570       return;
1571 
1572     case 'I':			/* enable/disable interrupts */
1573       if (xcode == CONST_INT)
1574 	fprintf (file, "%s",  INTVAL (x) == 0 ? "d" : "e");
1575       return;
1576 
1577     case 'b':			/* branch modifiers */
1578       if (xcode == REG)
1579 	fprintf (file, "%s", GET_MODE (x) == HImode ? "h" : "");
1580       else if (COMPARISON_P (x))
1581 	fprintf (file, "%s", xcode == NE ? "n" : "");
1582       return;
1583 
1584     case 'i':			/* indirect call */
1585       if (xcode == MEM)
1586 	{
1587 	  if (GET_CODE (XEXP (x, 0)) == REG)
1588 	    /* Used in indirect function calls. */
1589 	    fprintf (file, "%s", reg_names[REGNO (XEXP (x, 0))]);
1590 	  else
1591 	    output_address (XEXP (x, 0));
1592 	}
1593       return;
1594 
1595     case 'p':			/* load/store */
1596       if (xcode == MEM)
1597 	{
1598 	  x = XEXP (x, 0);
1599 	  xcode = GET_CODE (x);
1600 	}
1601       if (xcode == AND)
1602 	{
1603 	  x = XEXP (x, 0);
1604 	  xcode = GET_CODE (x);
1605 	}
1606       if (xcode == REG)
1607 	fprintf (file, "d");
1608       else if (xcode == CONST_INT)
1609 	fprintf (file, "a");
1610       else if (xcode == CONST || xcode == SYMBOL_REF || xcode == LABEL_REF)
1611 	fprintf (file, "r");
1612       else if (xcode == PLUS || xcode == LO_SUM)
1613 	{
1614 	  if (GET_CODE (XEXP (x, 1)) == REG)
1615 	    fprintf (file, "x");
1616 	  else
1617 	    fprintf (file, "d");
1618 	}
1619       return;
1620 
1621     case 'e':
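    /* 'e', 'f' and 'g' print the constant masked to 3, 5 and 6 bits
       respectively, and 'h' prints it divided by 8 and masked to 5 bits;
       'E', 'F', 'G' and 'H' print the corresponding negated forms.  */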
1622       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1623       val &= 0x7;
1624       output_addr_const (file, GEN_INT (val));
1625       return;
1626 
1627     case 'f':
1628       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1629       val &= 0x1f;
1630       output_addr_const (file, GEN_INT (val));
1631       return;
1632 
1633     case 'g':
1634       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1635       val &= 0x3f;
1636       output_addr_const (file, GEN_INT (val));
1637       return;
1638 
1639     case 'h':
1640       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1641       val = (val >> 3) & 0x1f;
1642       output_addr_const (file, GEN_INT (val));
1643       return;
1644 
1645     case 'E':
1646       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1647       val = -val;
1648       val &= 0x7;
1649       output_addr_const (file, GEN_INT (val));
1650       return;
1651 
1652     case 'F':
1653       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1654       val = -val;
1655       val &= 0x1f;
1656       output_addr_const (file, GEN_INT (val));
1657       return;
1658 
1659     case 'G':
1660       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1661       val = -val;
1662       val &= 0x3f;
1663       output_addr_const (file, GEN_INT (val));
1664       return;
1665 
1666     case 'H':
1667       val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1668       val = -(val & -8ll);
1669       val = (val >> 3) & 0x1f;
1670       output_addr_const (file, GEN_INT (val));
1671       return;
1672 
1673     case 'v':
1674     case 'w':
1675       constant_to_array (mode, x, arr);
1676       val = (((arr[0] << 1) + (arr[1] >> 7)) & 0xff) - 127;
1677       output_addr_const (file, GEN_INT (code == 'w' ? -val : val));
1678       return;
1679 
1680     case 0:
1681       if (xcode == REG)
1682 	fprintf (file, "%s", reg_names[REGNO (x)]);
1683       else if (xcode == MEM)
1684 	output_address (XEXP (x, 0));
1685       else if (xcode == CONST_VECTOR)
1686 	print_operand (file, CONST_VECTOR_ELT (x, 0), 0);
1687       else
1688 	output_addr_const (file, x);
1689       return;
1690 
1691       /* unused letters
1692 	              o qr  u   yz
1693 	AB            OPQR  UVWXYZ */
1694     default:
1695       output_operand_lossage ("invalid %%xn code");
1696     }
1697   gcc_unreachable ();
1698 }
1699 
1700 extern char call_used_regs[];
1701 
1702 /* For PIC mode we've reserved PIC_OFFSET_TABLE_REGNUM, which is a
1703    caller saved register.  For leaf functions it is more efficient to
1704    use a volatile register because we won't need to save and restore the
1705    pic register.  This routine is only valid after register allocation
1706    is completed, so we can pick an unused register.  */
1707 static rtx
1708 get_pic_reg (void)
1709 {
1710   rtx pic_reg = pic_offset_table_rtx;
1711   if (!reload_completed && !reload_in_progress)
1712     abort ();
1713   if (current_function_is_leaf && !df_regs_ever_live_p (LAST_ARG_REGNUM))
1714     pic_reg = gen_rtx_REG (SImode, LAST_ARG_REGNUM);
1715   return pic_reg;
1716 }
1717 
1718 /* Split constant addresses to handle cases that are too large.
1719    Add in the pic register when in PIC mode.
1720    Split immediates that require more than 1 instruction. */
1721 int
1722 spu_split_immediate (rtx * ops)
1723 {
1724   enum machine_mode mode = GET_MODE (ops[0]);
1725   enum immediate_class c = classify_immediate (ops[1], mode);
1726 
1727   switch (c)
1728     {
1729     case IC_IL2:
1730       {
1731 	unsigned char arrhi[16];
1732 	unsigned char arrlo[16];
1733 	rtx to, temp, hi, lo;
1734 	int i;
1735 	enum machine_mode imode = mode;
1736 	/* We need to do reals as ints because the constant used in the
1737 	   IOR might not be a legitimate real constant. */
1738 	imode = int_mode_for_mode (mode);
1739 	constant_to_array (mode, ops[1], arrhi);
1740 	if (imode != mode)
1741 	  to = simplify_gen_subreg (imode, ops[0], mode, 0);
1742 	else
1743 	  to = ops[0];
1744 	temp = !can_create_pseudo_p () ? to : gen_reg_rtx (imode);
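	/* Split each 32-bit word of the constant into its high 16 bits
	   (kept in arrhi, loaded first) and its low 16 bits (moved to
	   arrlo, merged back in with the IOR below).  */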
1745 	for (i = 0; i < 16; i += 4)
1746 	  {
1747 	    arrlo[i + 2] = arrhi[i + 2];
1748 	    arrlo[i + 3] = arrhi[i + 3];
1749 	    arrlo[i + 0] = arrlo[i + 1] = 0;
1750 	    arrhi[i + 2] = arrhi[i + 3] = 0;
1751 	  }
1752 	hi = array_to_constant (imode, arrhi);
1753 	lo = array_to_constant (imode, arrlo);
1754 	emit_move_insn (temp, hi);
1755 	emit_insn (gen_rtx_SET
1756 		   (VOIDmode, to, gen_rtx_IOR (imode, temp, lo)));
1757 	return 1;
1758       }
1759     case IC_FSMBI2:
1760       {
1761 	unsigned char arr_fsmbi[16];
1762 	unsigned char arr_andbi[16];
1763 	rtx to, reg_fsmbi, reg_and;
1764 	int i;
1765 	enum machine_mode imode = mode;
1766 	/* We need to do reals as ints because the constant used in the
1767 	 * AND might not be a legitimate real constant. */
1768 	imode = int_mode_for_mode (mode);
1769 	constant_to_array (mode, ops[1], arr_fsmbi);
1770 	if (imode != mode)
1771 	  to = simplify_gen_subreg(imode, ops[0], GET_MODE (ops[0]), 0);
1772 	else
1773 	  to = ops[0];
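     	/* Build an fsmbi constant with 0xff in every byte position that
     	   is nonzero, and an AND mask that repeats the single nonzero
     	   byte value; fsmbi followed by the AND recreates the constant.  */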
1774 	for (i = 0; i < 16; i++)
1775 	  if (arr_fsmbi[i] != 0)
1776 	    {
1777 	      arr_andbi[0] = arr_fsmbi[i];
1778 	      arr_fsmbi[i] = 0xff;
1779 	    }
1780 	for (i = 1; i < 16; i++)
1781 	  arr_andbi[i] = arr_andbi[0];
1782 	reg_fsmbi = array_to_constant (imode, arr_fsmbi);
1783 	reg_and = array_to_constant (imode, arr_andbi);
1784 	emit_move_insn (to, reg_fsmbi);
1785 	emit_insn (gen_rtx_SET
1786 		   (VOIDmode, to, gen_rtx_AND (imode, to, reg_and)));
1787 	return 1;
1788       }
1789     case IC_POOL:
1790       if (reload_in_progress || reload_completed)
1791 	{
1792 	  rtx mem = force_const_mem (mode, ops[1]);
1793 	  if (TARGET_LARGE_MEM)
1794 	    {
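     	      /* With TARGET_LARGE_MEM the constant pool address may not
     	         be usable directly as a memory address, so load it into
     	         the destination register and access the pool through it.  */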
1795 	      rtx addr = gen_rtx_REG (Pmode, REGNO (ops[0]));
1796 	      emit_move_insn (addr, XEXP (mem, 0));
1797 	      mem = replace_equiv_address (mem, addr);
1798 	    }
1799 	  emit_move_insn (ops[0], mem);
1800 	  return 1;
1801 	}
1802       break;
1803     case IC_IL1s:
1804     case IC_IL2s:
1805       if (reload_completed && GET_CODE (ops[1]) != HIGH)
1806 	{
1807 	  if (c == IC_IL2s)
1808 	    {
1809 	      emit_move_insn (ops[0], gen_rtx_HIGH (mode, ops[1]));
1810 	      emit_move_insn (ops[0], gen_rtx_LO_SUM (mode, ops[0], ops[1]));
1811 	    }
1812 	  else if (flag_pic)
1813 	    emit_insn (gen_pic (ops[0], ops[1]));
1814 	  if (flag_pic)
1815 	    {
1816 	      rtx pic_reg = get_pic_reg ();
1817 	      emit_insn (gen_addsi3 (ops[0], ops[0], pic_reg));
1818 	      crtl->uses_pic_offset_table = 1;
1819 	    }
1820 	  return flag_pic || c == IC_IL2s;
1821 	}
1822       break;
1823     case IC_IL1:
1824     case IC_FSMBI:
1825     case IC_CPAT:
1826       break;
1827     }
1828   return 0;
1829 }
1830 
1831 /* SAVING is TRUE when we are generating the actual load and store
1832    instructions for REGNO.  When determining the size of the stack
1833    needed for saving registers we must allocate enough space for the
1834    worst case, because we don't always have the information early enough
1835    to not allocate it.  But we can at least eliminate the actual loads
1836    and stores during the prologue/epilogue.  */
1837 static int
1838 need_to_save_reg (int regno, int saving)
1839 {
1840   if (df_regs_ever_live_p (regno) && !call_used_regs[regno])
1841     return 1;
1842   if (flag_pic
1843       && regno == PIC_OFFSET_TABLE_REGNUM
1844       && (!saving || crtl->uses_pic_offset_table)
1845       && (!saving
1846 	  || !current_function_is_leaf || df_regs_ever_live_p (LAST_ARG_REGNUM)))
1847     return 1;
1848   return 0;
1849 }
1850 
1851 /* This function is only correct starting with local register
1852    allocation */
1853 int
1854 spu_saved_regs_size (void)
1855 {
1856   int reg_save_size = 0;
1857   int regno;
1858 
1859   for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; --regno)
1860     if (need_to_save_reg (regno, 0))
1861       reg_save_size += 0x10;
1862   return reg_save_size;
1863 }
1864 
1865 static rtx
1866 frame_emit_store (int regno, rtx addr, HOST_WIDE_INT offset)
1867 {
1868   rtx reg = gen_rtx_REG (V4SImode, regno);
1869   rtx mem =
1870     gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)));
1871   return emit_insn (gen_movv4si (mem, reg));
1872 }
1873 
1874 static rtx
1875 frame_emit_load (int regno, rtx addr, HOST_WIDE_INT offset)
1876 {
1877   rtx reg = gen_rtx_REG (V4SImode, regno);
1878   rtx mem =
1879     gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)));
1880   return emit_insn (gen_movv4si (reg, mem));
1881 }
1882 
1883 /* This happens after reload, so we need to expand it.  */
1884 static rtx
1885 frame_emit_add_imm (rtx dst, rtx src, HOST_WIDE_INT imm, rtx scratch)
1886 {
1887   rtx insn;
1888   if (satisfies_constraint_K (GEN_INT (imm)))
1889     {
1890       insn = emit_insn (gen_addsi3 (dst, src, GEN_INT (imm)));
1891     }
1892   else
1893     {
1894       emit_insn (gen_movsi (scratch, gen_int_mode (imm, SImode)));
1895       insn = emit_insn (gen_addsi3 (dst, src, scratch));
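           /* SCRATCH must be a different register than SRC, or the move
              above has already clobbered the value we are adding.  */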
1896       if (REGNO (src) == REGNO (scratch))
1897 	abort ();
1898     }
1899   return insn;
1900 }
1901 
1902 /* Return nonzero if this function is known to have a null epilogue.  */
1903 
1904 int
1905 direct_return (void)
1906 {
1907   if (reload_completed)
1908     {
1909       if (cfun->static_chain_decl == 0
1910 	  && (spu_saved_regs_size ()
1911 	      + get_frame_size ()
1912 	      + crtl->outgoing_args_size
1913 	      + crtl->args.pretend_args_size == 0)
1914 	  && current_function_is_leaf)
1915 	return 1;
1916     }
1917   return 0;
1918 }
1919 
1920 /*
1921    The stack frame looks like this:
1922          +-------------+
1923          |  incoming   |
1924          |    args     |
1925    AP -> +-------------+
1926          | $lr save    |
1927          +-------------+
1928  prev SP | back chain  |
1929          +-------------+
1930          |  var args   |
1931          |  reg save   | crtl->args.pretend_args_size bytes
1932          +-------------+
1933          |    ...      |
1934          | saved regs  | spu_saved_regs_size() bytes
1935    FP -> +-------------+
1936          |    ...      |
1937          |   vars      | get_frame_size()  bytes
1938   HFP -> +-------------+
1939          |    ...      |
1940          |  outgoing   |
1941          |    args     | crtl->outgoing_args_size bytes
1942          +-------------+
1943          | $lr of next |
1944          |   frame     |
1945          +-------------+
1946          | back chain  |
1947    SP -> +-------------+
1948 
1949 */
1950 void
1951 spu_expand_prologue (void)
1952 {
1953   HOST_WIDE_INT size = get_frame_size (), offset, regno;
1954   HOST_WIDE_INT total_size;
1955   HOST_WIDE_INT saved_regs_size;
1956   rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
1957   rtx scratch_reg_0, scratch_reg_1;
1958   rtx insn, real;
1959 
1960   if (flag_pic && optimize == 0)
1961     crtl->uses_pic_offset_table = 1;
1962 
1963   if (spu_naked_function_p (current_function_decl))
1964     return;
1965 
1966   scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1);
1967   scratch_reg_1 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 2);
1968 
1969   saved_regs_size = spu_saved_regs_size ();
1970   total_size = size + saved_regs_size
1971     + crtl->outgoing_args_size
1972     + crtl->args.pretend_args_size;
1973 
1974   if (!current_function_is_leaf
1975       || cfun->calls_alloca || total_size > 0)
1976     total_size += STACK_POINTER_OFFSET;
1977 
1978   /* Save this first because code after this might use the link
1979      register as a scratch register. */
1980   if (!current_function_is_leaf)
1981     {
1982       insn = frame_emit_store (LINK_REGISTER_REGNUM, sp_reg, 16);
1983       RTX_FRAME_RELATED_P (insn) = 1;
1984     }
1985 
1986   if (total_size > 0)
1987     {
1988       offset = -crtl->args.pretend_args_size;
1989       for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno)
1990 	if (need_to_save_reg (regno, 1))
1991 	  {
1992 	    offset -= 16;
1993 	    insn = frame_emit_store (regno, sp_reg, offset);
1994 	    RTX_FRAME_RELATED_P (insn) = 1;
1995 	  }
1996     }
1997 
1998   if (flag_pic && crtl->uses_pic_offset_table)
1999     {
2000       rtx pic_reg = get_pic_reg ();
2001       insn = emit_insn (gen_load_pic_offset (pic_reg, scratch_reg_0));
2002       insn = emit_insn (gen_subsi3 (pic_reg, pic_reg, scratch_reg_0));
2003     }
2004 
2005   if (total_size > 0)
2006     {
2007       if (flag_stack_check)
2008 	{
2009 	  /* We compare against total_size-1 because
2010 	     ($sp >= total_size) <=> ($sp > total_size-1) */
2011 	  rtx scratch_v4si = gen_rtx_REG (V4SImode, REGNO (scratch_reg_0));
2012 	  rtx sp_v4si = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM);
2013 	  rtx size_v4si = spu_const (V4SImode, total_size - 1);
2014 	  if (!satisfies_constraint_K (GEN_INT (total_size - 1)))
2015 	    {
2016 	      emit_move_insn (scratch_v4si, size_v4si);
2017 	      size_v4si = scratch_v4si;
2018 	    }
2019 	  emit_insn (gen_cgt_v4si (scratch_v4si, sp_v4si, size_v4si));
2020 	  emit_insn (gen_vec_extractv4si
2021 		     (scratch_reg_0, scratch_v4si, GEN_INT (1)));
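     	  /* Word 1 of $sp holds the number of bytes of available stack
     	     space (per the SPU ABI); halt via heq when the comparison
     	     shows there is not enough room for this frame.  */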
2022 	  emit_insn (gen_spu_heq (scratch_reg_0, GEN_INT (0)));
2023 	}
2024 
2025       /* Adjust the stack pointer, and make sure scratch_reg_0 contains
2026          the value of the previous $sp because we save it as the back
2027          chain. */
2028       if (total_size <= 2000)
2029 	{
2030 	  /* In this case we save the back chain first. */
2031 	  insn = frame_emit_store (STACK_POINTER_REGNUM, sp_reg, -total_size);
2032 	  insn =
2033 	    frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_0);
2034 	}
2035       else
2036 	{
2037 	  insn = emit_move_insn (scratch_reg_0, sp_reg);
2038 	  insn =
2039 	    frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_1);
2040 	}
2041       RTX_FRAME_RELATED_P (insn) = 1;
2042       real = gen_addsi3 (sp_reg, sp_reg, GEN_INT (-total_size));
2043       add_reg_note (insn, REG_FRAME_RELATED_EXPR, real);
2044 
2045       if (total_size > 2000)
2046 	{
2047 	  /* Save the back chain ptr */
2048 	  insn = frame_emit_store (REGNO (scratch_reg_0), sp_reg, 0);
2049 	}
2050 
2051       if (frame_pointer_needed)
2052 	{
2053 	  rtx fp_reg = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2054 	  HOST_WIDE_INT fp_offset = STACK_POINTER_OFFSET
2055 	    + crtl->outgoing_args_size;
2056 	  /* Set the new frame_pointer */
2057 	  insn = frame_emit_add_imm (fp_reg, sp_reg, fp_offset, scratch_reg_0);
2058 	  RTX_FRAME_RELATED_P (insn) = 1;
2059 	  real = gen_addsi3 (fp_reg, sp_reg, GEN_INT (fp_offset));
2060 	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, real);
2061           REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = STACK_BOUNDARY;
2062 	}
2063     }
2064 
2065 }
2066 
2067 void
2068 spu_expand_epilogue (bool sibcall_p)
2069 {
2070   int size = get_frame_size (), offset, regno;
2071   HOST_WIDE_INT saved_regs_size, total_size;
2072   rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2073   rtx jump, scratch_reg_0;
2074 
2075   if (spu_naked_function_p (current_function_decl))
2076     return;
2077 
2078   scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1);
2079 
2080   saved_regs_size = spu_saved_regs_size ();
2081   total_size = size + saved_regs_size
2082     + crtl->outgoing_args_size
2083     + crtl->args.pretend_args_size;
2084 
2085   if (!current_function_is_leaf
2086       || cfun->calls_alloca || total_size > 0)
2087     total_size += STACK_POINTER_OFFSET;
2088 
2089   if (total_size > 0)
2090     {
2091       if (cfun->calls_alloca)
2092 	frame_emit_load (STACK_POINTER_REGNUM, sp_reg, 0);
2093       else
2094 	frame_emit_add_imm (sp_reg, sp_reg, total_size, scratch_reg_0);
2095 
2096 
2097       if (saved_regs_size > 0)
2098 	{
2099 	  offset = -crtl->args.pretend_args_size;
2100 	  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno)
2101 	    if (need_to_save_reg (regno, 1))
2102 	      {
2103 		offset -= 0x10;
2104 		frame_emit_load (regno, sp_reg, offset);
2105 	      }
2106 	}
2107     }
2108 
2109   if (!current_function_is_leaf)
2110     frame_emit_load (LINK_REGISTER_REGNUM, sp_reg, 16);
2111 
2112   if (!sibcall_p)
2113     {
2114       emit_use (gen_rtx_REG (SImode, LINK_REGISTER_REGNUM));
2115       jump = emit_jump_insn (gen__return ());
2116       emit_barrier_after (jump);
2117     }
2118 
2119 }
2120 
2121 rtx
2122 spu_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
2123 {
2124   if (count != 0)
2125     return 0;
2126   /* This is inefficient because it ends up copying to a save-register
2127      which then gets saved even though $lr has already been saved.  But
2128      it does generate better code for leaf functions and we don't need
2129      to use RETURN_ADDRESS_POINTER_REGNUM to get it working.  It's only
2130      used for __builtin_return_address anyway, so maybe we don't care if
2131      it's inefficient. */
2132   return get_hard_reg_initial_val (Pmode, LINK_REGISTER_REGNUM);
2133 }
2134 
2135 
2136 /* Given VAL, generate a constant appropriate for MODE.
2137    If MODE is a vector mode, every element will be VAL.
2138    For TImode, VAL will be zero extended to 128 bits. */
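     /* For example, spu_const (V4SImode, 1) builds the vector constant
        (1, 1, 1, 1).  */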
2139 rtx
2140 spu_const (enum machine_mode mode, HOST_WIDE_INT val)
2141 {
2142   rtx inner;
2143   rtvec v;
2144   int units, i;
2145 
2146   gcc_assert (GET_MODE_CLASS (mode) == MODE_INT
2147 	      || GET_MODE_CLASS (mode) == MODE_FLOAT
2148 	      || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2149 	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
2150 
2151   if (GET_MODE_CLASS (mode) == MODE_INT)
2152     return immed_double_const (val, 0, mode);
2153 
2154   /* val is the bit representation of the float */
2155   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
2156     return hwint_to_const_double (mode, val);
2157 
2158   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2159     inner = immed_double_const (val, 0, GET_MODE_INNER (mode));
2160   else
2161     inner = hwint_to_const_double (GET_MODE_INNER (mode), val);
2162 
2163   units = GET_MODE_NUNITS (mode);
2164 
2165   v = rtvec_alloc (units);
2166 
2167   for (i = 0; i < units; ++i)
2168     RTVEC_ELT (v, i) = inner;
2169 
2170   return gen_rtx_CONST_VECTOR (mode, v);
2171 }
2172 
2173 /* Create a MODE vector constant from 4 ints. */
2174 rtx
2175 spu_const_from_ints(enum machine_mode mode, int a, int b, int c, int d)
2176 {
2177   unsigned char arr[16];
2178   arr[0] = (a >> 24) & 0xff;
2179   arr[1] = (a >> 16) & 0xff;
2180   arr[2] = (a >> 8) & 0xff;
2181   arr[3] = (a >> 0) & 0xff;
2182   arr[4] = (b >> 24) & 0xff;
2183   arr[5] = (b >> 16) & 0xff;
2184   arr[6] = (b >> 8) & 0xff;
2185   arr[7] = (b >> 0) & 0xff;
2186   arr[8] = (c >> 24) & 0xff;
2187   arr[9] = (c >> 16) & 0xff;
2188   arr[10] = (c >> 8) & 0xff;
2189   arr[11] = (c >> 0) & 0xff;
2190   arr[12] = (d >> 24) & 0xff;
2191   arr[13] = (d >> 16) & 0xff;
2192   arr[14] = (d >> 8) & 0xff;
2193   arr[15] = (d >> 0) & 0xff;
2194   return array_to_constant(mode, arr);
2195 }
2196 
2197 /* branch hint stuff */
2198 
2199 /* An array of these is used to propagate hints to predecessor blocks. */
2200 struct spu_bb_info
2201 {
2202   rtx prop_jump; /* propagated from another block */
2203   int bb_index;  /* the original block. */
2204 };
2205 static struct spu_bb_info *spu_bb_info;
2206 
2207 #define STOP_HINT_P(INSN) \
2208 		(GET_CODE(INSN) == CALL_INSN \
2209 		 || INSN_CODE(INSN) == CODE_FOR_divmodsi4 \
2210 		 || INSN_CODE(INSN) == CODE_FOR_udivmodsi4)
2211 
2212 /* 1 when RTX is a hinted branch or its target.  We keep track of
2213    what has been hinted so the safe-hint code can test it easily.  */
2214 #define HINTED_P(RTX)						\
2215   (RTL_FLAG_CHECK3("HINTED_P", (RTX), CODE_LABEL, JUMP_INSN, CALL_INSN)->unchanging)
2216 
2217 /* 1 when RTX is an insn that must be scheduled on an even boundary. */
2218 #define SCHED_ON_EVEN_P(RTX)						\
2219   (RTL_FLAG_CHECK2("SCHED_ON_EVEN_P", (RTX), JUMP_INSN, CALL_INSN)->in_struct)
2220 
2221 /* Emit a nop for INSN such that the two will dual issue.  This assumes
2222    INSN is 8-byte aligned.  When INSN is inline asm we emit an lnop.
2223    We check for TImode to handle a MULTI1 insn which has dual issued its
2224    first instruction.  get_pipe returns -1 for MULTI0, inline asm, or
2225    ADDR_VEC insns. */
2226 static void
2227 emit_nop_for_insn (rtx insn)
2228 {
2229   int p;
2230   rtx new_insn;
2231   p = get_pipe (insn);
2232   if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
2233     new_insn = emit_insn_after (gen_lnop (), insn);
2234   else if (p == 1 && GET_MODE (insn) == TImode)
2235     {
2236       new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn);
2237       PUT_MODE (new_insn, TImode);
2238       PUT_MODE (insn, VOIDmode);
2239     }
2240   else
2241     new_insn = emit_insn_after (gen_lnop (), insn);
2242   recog_memoized (new_insn);
2243 }
2244 
2245 /* Insert nops in basic blocks to meet dual issue alignment
2246    requirements.  Also make sure hbrp and hint instructions are at least
2247    one cycle apart, possibly inserting a nop.  */
2248 static void
2249 pad_bb(void)
2250 {
2251   rtx insn, next_insn, prev_insn, hbr_insn = 0;
2252   int length;
2253   int addr;
2254 
2255   /* This sets up INSN_ADDRESSES. */
2256   shorten_branches (get_insns ());
2257 
2258   /* Keep track of length added by nops. */
2259   length = 0;
2260 
2261   prev_insn = 0;
2262   insn = get_insns ();
2263   if (!active_insn_p (insn))
2264     insn = next_active_insn (insn);
2265   for (; insn; insn = next_insn)
2266     {
2267       next_insn = next_active_insn (insn);
2268       if (INSN_CODE (insn) == CODE_FOR_iprefetch
2269 	  || INSN_CODE (insn) == CODE_FOR_hbr)
2270 	{
2271 	  if (hbr_insn)
2272 	    {
2273 	      int a0 = INSN_ADDRESSES (INSN_UID (hbr_insn));
2274 	      int a1 = INSN_ADDRESSES (INSN_UID (insn));
2275 	      if ((a1 - a0 == 8 && GET_MODE (insn) != TImode)
2276 		  || (a1 - a0 == 4))
2277 		{
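     		  /* This hint is too close to the previous one; add an lnop
     		     so the two end up at least one cycle apart.  */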
2278 		  prev_insn = emit_insn_before (gen_lnop (), insn);
2279 		  PUT_MODE (prev_insn, GET_MODE (insn));
2280 		  PUT_MODE (insn, TImode);
2281 		  length += 4;
2282 		}
2283 	    }
2284 	  hbr_insn = insn;
2285 	}
2286       if (INSN_CODE (insn) == CODE_FOR_blockage)
2287 	{
2288 	  if (GET_MODE (insn) == TImode)
2289 	    PUT_MODE (next_insn, TImode);
2290 	  insn = next_insn;
2291 	  next_insn = next_active_insn (insn);
2292 	}
2293       addr = INSN_ADDRESSES (INSN_UID (insn));
2294       if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
2295 	{
2296 	  if (((addr + length) & 7) != 0)
2297 	    {
2298 	      emit_nop_for_insn (prev_insn);
2299 	      length += 4;
2300 	    }
2301 	}
2302       else if (GET_MODE (insn) == TImode
2303 	       && ((next_insn && GET_MODE (next_insn) != TImode)
2304 		   || get_attr_type (insn) == TYPE_MULTI0)
2305 	       && ((addr + length) & 7) != 0)
2306 	{
2307 	  /* prev_insn will always be set because the first insn is
2308 	     always 8-byte aligned. */
2309 	  emit_nop_for_insn (prev_insn);
2310 	  length += 4;
2311 	}
2312       prev_insn = insn;
2313     }
2314 }
2315 
2316 
2317 /* Routines for branch hints. */
2318 
2319 static void
2320 spu_emit_branch_hint (rtx before, rtx branch, rtx target,
2321 		      int distance, sbitmap blocks)
2322 {
2323   rtx branch_label = 0;
2324   rtx hint;
2325   rtx insn;
2326   rtx table;
2327 
2328   if (before == 0 || branch == 0 || target == 0)
2329     return;
2330 
2331   /* While scheduling we require hints to be no further than 600, so
2332      we need to enforce that here too */
2333   if (distance > 600)
2334     return;
2335 
2336   /* If we have a Basic block note, emit it after the basic block note.  */
2337   /* If BEFORE is a basic block note, emit the hint after the note.  */
2338     before = NEXT_INSN (before);
2339 
2340   branch_label = gen_label_rtx ();
2341   LABEL_NUSES (branch_label)++;
2342   LABEL_PRESERVE_P (branch_label) = 1;
2343   insn = emit_label_before (branch_label, branch);
2344   branch_label = gen_rtx_LABEL_REF (VOIDmode, branch_label);
2345   SET_BIT (blocks, BLOCK_FOR_INSN (branch)->index);
2346 
2347   hint = emit_insn_before (gen_hbr (branch_label, target), before);
2348   recog_memoized (hint);
2349   HINTED_P (branch) = 1;
2350 
2351   if (GET_CODE (target) == LABEL_REF)
2352     HINTED_P (XEXP (target, 0)) = 1;
2353   else if (tablejump_p (branch, 0, &table))
2354     {
2355       rtvec vec;
2356       int j;
2357       if (GET_CODE (PATTERN (table)) == ADDR_VEC)
2358 	vec = XVEC (PATTERN (table), 0);
2359       else
2360 	vec = XVEC (PATTERN (table), 1);
2361       for (j = GET_NUM_ELEM (vec) - 1; j >= 0; --j)
2362 	HINTED_P (XEXP (RTVEC_ELT (vec, j), 0)) = 1;
2363     }
2364 
2365   if (distance >= 588)
2366     {
2367       /* Make sure the hint isn't scheduled any earlier than this point,
2368 	         which could make it too far for the branch offset to fit */
2369       recog_memoized (emit_insn_before (gen_blockage (), hint));
2370     }
2371   else if (distance <= 8 * 4)
2372     {
2373       /* To guarantee at least 8 insns between the hint and branch we
2374          insert nops. */
2375       int d;
2376       for (d = distance; d < 8 * 4; d += 4)
2377 	{
2378 	  insn =
2379 	    emit_insn_after (gen_nopn_nv (gen_rtx_REG (SImode, 127)), hint);
2380 	  recog_memoized (insn);
2381 	}
2382 
2383       /* Make sure any nops inserted aren't scheduled before the hint. */
2384       recog_memoized (emit_insn_after (gen_blockage (), hint));
2385 
2386       /* Make sure any nops inserted aren't scheduled after the call. */
2387       if (CALL_P (branch) && distance < 8 * 4)
2388 	recog_memoized (emit_insn_before (gen_blockage (), branch));
2389     }
2390 }
2391 
2392 /* Returns 0 if we don't want a hint for this branch.  Otherwise return
2393    the rtx for the branch target. */
2394 static rtx
2395 get_branch_target (rtx branch)
2396 {
2397   if (GET_CODE (branch) == JUMP_INSN)
2398     {
2399       rtx set, src;
2400 
2401       /* Return statements */
2402       if (GET_CODE (PATTERN (branch)) == RETURN)
2403 	return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM);
2404 
2405       /* jump table */
2406       if (GET_CODE (PATTERN (branch)) == ADDR_VEC
2407 	  || GET_CODE (PATTERN (branch)) == ADDR_DIFF_VEC)
2408 	return 0;
2409 
2410       /* ASM GOTOs. */
2411       if (extract_asm_operands (PATTERN (branch)) != NULL)
2412 	return NULL;
2413 
2414       set = single_set (branch);
2415       src = SET_SRC (set);
2416       if (GET_CODE (SET_DEST (set)) != PC)
2417 	abort ();
2418 
2419       if (GET_CODE (src) == IF_THEN_ELSE)
2420 	{
2421 	  rtx lab = 0;
2422 	  rtx note = find_reg_note (branch, REG_BR_PROB, 0);
2423 	  if (note)
2424 	    {
2425 	      /* If the more probable case is not a fall through, then
2426 	         try a branch hint.  */
2427 	      HOST_WIDE_INT prob = INTVAL (XEXP (note, 0));
2428 	      if (prob > (REG_BR_PROB_BASE * 6 / 10)
2429 		  && GET_CODE (XEXP (src, 1)) != PC)
2430 		lab = XEXP (src, 1);
2431 	      else if (prob < (REG_BR_PROB_BASE * 4 / 10)
2432 		       && GET_CODE (XEXP (src, 2)) != PC)
2433 		lab = XEXP (src, 2);
2434 	    }
2435 	  if (lab)
2436 	    {
2437 	      if (GET_CODE (lab) == RETURN)
2438 		return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM);
2439 	      return lab;
2440 	    }
2441 	  return 0;
2442 	}
2443 
2444       return src;
2445     }
2446   else if (GET_CODE (branch) == CALL_INSN)
2447     {
2448       rtx call;
2449       /* All of our call patterns are in a PARALLEL and the CALL is
2450          the first pattern in the PARALLEL. */
2451       if (GET_CODE (PATTERN (branch)) != PARALLEL)
2452 	abort ();
2453       call = XVECEXP (PATTERN (branch), 0, 0);
2454       if (GET_CODE (call) == SET)
2455 	call = SET_SRC (call);
2456       if (GET_CODE (call) != CALL)
2457 	abort ();
2458       return XEXP (XEXP (call, 0), 0);
2459     }
2460   return 0;
2461 }
2462 
2463 /* The special $hbr register is used to prevent the insn scheduler from
2464    moving hbr insns across instructions which invalidate them.  It
2465    should only be used in a clobber, and this function searches for
2466    insns which clobber it.  */
2467 static bool
2468 insn_clobbers_hbr (rtx insn)
2469 {
2470   if (INSN_P (insn)
2471       && GET_CODE (PATTERN (insn)) == PARALLEL)
2472     {
2473       rtx parallel = PATTERN (insn);
2474       rtx clobber;
2475       int j;
2476       for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--)
2477 	{
2478 	  clobber = XVECEXP (parallel, 0, j);
2479 	  if (GET_CODE (clobber) == CLOBBER
2480 	      && GET_CODE (XEXP (clobber, 0)) == REG
2481 	      && REGNO (XEXP (clobber, 0)) == HBR_REGNUM)
2482 	    return 1;
2483 	}
2484     }
2485   return 0;
2486 }
2487 
2488 /* Search up to 32 insns starting at FIRST:
2489    - at any kind of hinted branch, just return
2490    - at any unconditional branch in the first 15 insns, just return
2491    - at a call or indirect branch, after the first 15 insns, force it to
2492      an even address and return
2493    - at any unconditional branch, after the first 15 insns, force it to
2494      an even address.
2495    At then end of the search, insert an hbrp within 4 insns of FIRST,
2496    At the end of the search, insert an hbrp within 4 insns of FIRST,
2497  */
2498 static void
2499 insert_hbrp_for_ilb_runout (rtx first)
2500 {
2501   rtx insn, before_4 = 0, before_16 = 0;
2502   int addr = 0, length, first_addr = -1;
2503   int hbrp_addr0 = 128 * 4, hbrp_addr1 = 128 * 4;
2504   int insert_lnop_after = 0;
2505   for (insn = first; insn; insn = NEXT_INSN (insn))
2506     if (INSN_P (insn))
2507       {
2508 	if (first_addr == -1)
2509 	  first_addr = INSN_ADDRESSES (INSN_UID (insn));
2510 	addr = INSN_ADDRESSES (INSN_UID (insn)) - first_addr;
2511 	length = get_attr_length (insn);
2512 
2513 	if (before_4 == 0 && addr + length >= 4 * 4)
2514 	  before_4 = insn;
2515 	/* We test for 14 instructions because the first hbrp will add
2516 	   up to 2 instructions. */
2517 	if (before_16 == 0 && addr + length >= 14 * 4)
2518 	  before_16 = insn;
2519 
2520 	if (INSN_CODE (insn) == CODE_FOR_hbr)
2521 	  {
2522 	    /* Make sure an hbrp is at least 2 cycles away from a hint.
2523 	       Insert an lnop after the hbrp when necessary. */
2524 	    if (before_4 == 0 && addr > 0)
2525 	      {
2526 		before_4 = insn;
2527 		insert_lnop_after |= 1;
2528 	      }
2529 	    else if (before_4 && addr <= 4 * 4)
2530 	      insert_lnop_after |= 1;
2531 	    if (before_16 == 0 && addr > 10 * 4)
2532 	      {
2533 		before_16 = insn;
2534 		insert_lnop_after |= 2;
2535 	      }
2536 	    else if (before_16 && addr <= 14 * 4)
2537 	      insert_lnop_after |= 2;
2538 	  }
2539 
2540 	if (INSN_CODE (insn) == CODE_FOR_iprefetch)
2541 	  {
2542 	    if (addr < hbrp_addr0)
2543 	      hbrp_addr0 = addr;
2544 	    else if (addr < hbrp_addr1)
2545 	      hbrp_addr1 = addr;
2546 	  }
2547 
2548 	if (CALL_P (insn) || JUMP_P (insn))
2549 	  {
2550 	    if (HINTED_P (insn))
2551 	      return;
2552 
2553 	    /* Any branch after the first 15 insns should be on an even
2554 	       address to avoid a special case branch.  There might be
2555 	       some nops and/or hbrps inserted, so we test after 10
2556 	       insns. */
2557 	    if (addr > 10 * 4)
2558 	      SCHED_ON_EVEN_P (insn) = 1;
2559 	  }
2560 
2561 	if (CALL_P (insn) || tablejump_p (insn, 0, 0))
2562 	  return;
2563 
2564 
2565 	if (addr + length >= 32 * 4)
2566 	  {
2567 	    gcc_assert (before_4 && before_16);
2568 	    if (hbrp_addr0 > 4 * 4)
2569 	      {
2570 		insn =
2571 		  emit_insn_before (gen_iprefetch (GEN_INT (1)), before_4);
2572 		recog_memoized (insn);
2573 		INSN_ADDRESSES_NEW (insn,
2574 				    INSN_ADDRESSES (INSN_UID (before_4)));
2575 		PUT_MODE (insn, GET_MODE (before_4));
2576 		PUT_MODE (before_4, TImode);
2577 		if (insert_lnop_after & 1)
2578 		  {
2579 		    insn = emit_insn_before (gen_lnop (), before_4);
2580 		    recog_memoized (insn);
2581 		    INSN_ADDRESSES_NEW (insn,
2582 					INSN_ADDRESSES (INSN_UID (before_4)));
2583 		    PUT_MODE (insn, TImode);
2584 		  }
2585 	      }
2586 	    if ((hbrp_addr0 <= 4 * 4 || hbrp_addr0 > 16 * 4)
2587 		&& hbrp_addr1 > 16 * 4)
2588 	      {
2589 		insn =
2590 		  emit_insn_before (gen_iprefetch (GEN_INT (2)), before_16);
2591 		recog_memoized (insn);
2592 		INSN_ADDRESSES_NEW (insn,
2593 				    INSN_ADDRESSES (INSN_UID (before_16)));
2594 		PUT_MODE (insn, GET_MODE (before_16));
2595 		PUT_MODE (before_16, TImode);
2596 		if (insert_lnop_after & 2)
2597 		  {
2598 		    insn = emit_insn_before (gen_lnop (), before_16);
2599 		    recog_memoized (insn);
2600 		    INSN_ADDRESSES_NEW (insn,
2601 					INSN_ADDRESSES (INSN_UID
2602 							(before_16)));
2603 		    PUT_MODE (insn, TImode);
2604 		  }
2605 	      }
2606 	    return;
2607 	  }
2608       }
2609     else if (BARRIER_P (insn))
2610       return;
2611 
2612 }
2613 
2614 /* The SPU might hang when it executes 48 inline instructions after a
2615    hinted branch jumps to its hinted target.  The beginning of a
2616    function and the return from a call might have been hinted, and must
2617    be handled as well.  To prevent a hang we insert 2 hbrps.  The first
2618    should be within 6 insns of the branch target.  The second should be
2619    within 22 insns of the branch target.  When determining if hbrps are
2620    necessary, we look for only 32 inline instructions, because up to
2621    12 nops and 4 hbrps could be inserted.  Similarly, when inserting
2622    new hbrps, we insert them within 4 and 16 insns of the target.  */
2623 static void
2624 insert_hbrp (void)
2625 {
2626   rtx insn;
2627   if (TARGET_SAFE_HINTS)
2628     {
2629       shorten_branches (get_insns ());
2630       /* Insert hbrp at beginning of function */
2631       insn = next_active_insn (get_insns ());
2632       if (insn)
2633 	insert_hbrp_for_ilb_runout (insn);
2634       /* Insert hbrp after hinted targets. */
2635       for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
2636 	if ((LABEL_P (insn) && HINTED_P (insn)) || CALL_P (insn))
2637 	  insert_hbrp_for_ilb_runout (next_active_insn (insn));
2638     }
2639 }
2640 
2641 static int in_spu_reorg;
2642 
2643 /* Insert branch hints.  There are no branch optimizations after this
2644    pass, so it's safe to set our branch hints now. */
2645 static void
2646 spu_machine_dependent_reorg (void)
2647 {
2648   sbitmap blocks;
2649   basic_block bb;
2650   rtx branch, insn;
2651   rtx branch_target = 0;
2652   int branch_addr = 0, insn_addr, required_dist = 0;
2653   int i;
2654   unsigned int j;
2655 
2656   if (!TARGET_BRANCH_HINTS || optimize == 0)
2657     {
2658       /* We still do it for unoptimized code because an external
2659          function might have hinted a call or return. */
2660       insert_hbrp ();
2661       pad_bb ();
2662       return;
2663     }
2664 
2665   blocks = sbitmap_alloc (last_basic_block);
2666   sbitmap_zero (blocks);
2667 
2668   in_spu_reorg = 1;
2669   compute_bb_for_insn ();
2670 
2671   compact_blocks ();
2672 
2673   spu_bb_info =
2674     (struct spu_bb_info *) xcalloc (n_basic_blocks,
2675 				    sizeof (struct spu_bb_info));
2676 
2677   /* We need exact insn addresses and lengths.  */
2678   shorten_branches (get_insns ());
2679 
2680   for (i = n_basic_blocks - 1; i >= 0; i--)
2681     {
2682       bb = BASIC_BLOCK (i);
2683       branch = 0;
2684       if (spu_bb_info[i].prop_jump)
2685 	{
2686 	  branch = spu_bb_info[i].prop_jump;
2687 	  branch_target = get_branch_target (branch);
2688 	  branch_addr = INSN_ADDRESSES (INSN_UID (branch));
2689 	  required_dist = spu_hint_dist;
2690 	}
2691       /* Search from end of a block to beginning.  In this loop, find
2692          jumps which need a branch hint and emit the hint only when:
2693          - it's an indirect branch and we're at the insn which sets
2694          the register
2695          - we're at an insn that will invalidate the hint. e.g., a
2696          call, another hint insn, inline asm that clobbers $hbr, and
2697          some inlined operations (divmodsi4).  Don't consider jumps
2698          because they are only at the end of a block and are
2699          considered when we are deciding whether to propagate
2700          - we're getting too far away from the branch.  The hbr insns
2701          only have a signed 10 bit offset
2702          We go back as far as possible so the branch will be considered
2703          for propagation when we get to the beginning of the block.  */
2704       for (insn = BB_END (bb); insn; insn = PREV_INSN (insn))
2705 	{
2706 	  if (INSN_P (insn))
2707 	    {
2708 	      insn_addr = INSN_ADDRESSES (INSN_UID (insn));
2709 	      if (branch
2710 		  && ((GET_CODE (branch_target) == REG
2711 		       && set_of (branch_target, insn) != NULL_RTX)
2712 		      || insn_clobbers_hbr (insn)
2713 		      || branch_addr - insn_addr > 600))
2714 		{
2715 		  rtx next = NEXT_INSN (insn);
2716 		  int next_addr = INSN_ADDRESSES (INSN_UID (next));
2717 		  if (insn != BB_END (bb)
2718 		      && branch_addr - next_addr >= required_dist)
2719 		    {
2720 		      if (dump_file)
2721 			fprintf (dump_file,
2722 				 "hint for %i in block %i before %i\n",
2723 				 INSN_UID (branch), bb->index,
2724 				 INSN_UID (next));
2725 		      spu_emit_branch_hint (next, branch, branch_target,
2726 					    branch_addr - next_addr, blocks);
2727 		    }
2728 		  branch = 0;
2729 		}
2730 
2731 	      /* JUMP_P will only be true at the end of a block.  When
2732 	         branch is already set it means we've previously decided
2733 	         to propagate a hint for that branch into this block. */
2734 	      if (CALL_P (insn) || (JUMP_P (insn) && !branch))
2735 		{
2736 		  branch = 0;
2737 		  if ((branch_target = get_branch_target (insn)))
2738 		    {
2739 		      branch = insn;
2740 		      branch_addr = insn_addr;
2741 		      required_dist = spu_hint_dist;
2742 		    }
2743 		}
2744 	    }
2745 	  if (insn == BB_HEAD (bb))
2746 	    break;
2747 	}
2748 
2749       if (branch)
2750 	{
2751 	  /* If we haven't emitted a hint for this branch yet, it might
2752 	     be profitable to emit it in one of the predecessor blocks,
2753 	     especially for loops.  */
2754 	  rtx bbend;
2755 	  basic_block prev = 0, prop = 0, prev2 = 0;
2756 	  int loop_exit = 0, simple_loop = 0;
2757 	  int next_addr = INSN_ADDRESSES (INSN_UID (NEXT_INSN (insn)));
2758 
2759 	  for (j = 0; j < EDGE_COUNT (bb->preds); j++)
2760 	    if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU)
2761 	      prev = EDGE_PRED (bb, j)->src;
2762 	    else
2763 	      prev2 = EDGE_PRED (bb, j)->src;
2764 
2765 	  for (j = 0; j < EDGE_COUNT (bb->succs); j++)
2766 	    if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT)
2767 	      loop_exit = 1;
2768 	    else if (EDGE_SUCC (bb, j)->dest == bb)
2769 	      simple_loop = 1;
2770 
2771 	  /* If this branch is a loop exit then propagate to previous
2772 	     fallthru block. This catches the cases when it is a simple
2773 	     loop or when there is an initial branch into the loop. */
2774 	  if (prev && (loop_exit || simple_loop)
2775 	      && prev->loop_depth <= bb->loop_depth)
2776 	    prop = prev;
2777 
2778 	  /* If there is only one adjacent predecessor, don't propagate
2779 	     outside this loop.  This loop_depth test isn't perfect, but
2780 	     I'm not sure the loop_father member is valid at this point.  */
2781 	  else if (prev && single_pred_p (bb)
2782 		   && prev->loop_depth == bb->loop_depth)
2783 	    prop = prev;
2784 
2785 	  /* If this is the JOIN block of a simple IF-THEN then
2786 	     propagate the hint to the HEADER block. */
2787 	  else if (prev && prev2
2788 		   && EDGE_COUNT (bb->preds) == 2
2789 		   && EDGE_COUNT (prev->preds) == 1
2790 		   && EDGE_PRED (prev, 0)->src == prev2
2791 		   && prev2->loop_depth == bb->loop_depth
2792 		   && GET_CODE (branch_target) != REG)
2793 	    prop = prev;
2794 
2795 	  /* Don't propagate when:
2796 	     - this is a simple loop and the hint would be too far
2797 	     - this is not a simple loop and there are 16 insns in
2798 	     this block already
2799 	     - the predecessor block ends in a branch that will be
2800 	     hinted
2801 	     - the predecessor block ends in an insn that invalidates
2802 	     the hint */
2803 	  if (prop
2804 	      && prop->index >= 0
2805 	      && (bbend = BB_END (prop))
2806 	      && branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) <
2807 	      (simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0
2808 	      && (JUMP_P (bbend) || !insn_clobbers_hbr (bbend)))
2809 	    {
2810 	      if (dump_file)
2811 		fprintf (dump_file, "propagate from %i to %i (loop depth %i) "
2812 			 "for %i (loop_exit %i simple_loop %i dist %i)\n",
2813 			 bb->index, prop->index, bb->loop_depth,
2814 			 INSN_UID (branch), loop_exit, simple_loop,
2815 			 branch_addr - INSN_ADDRESSES (INSN_UID (bbend)));
2816 
2817 	      spu_bb_info[prop->index].prop_jump = branch;
2818 	      spu_bb_info[prop->index].bb_index = i;
2819 	    }
2820 	  else if (branch_addr - next_addr >= required_dist)
2821 	    {
2822 	      if (dump_file)
2823 		fprintf (dump_file, "hint for %i in block %i before %i\n",
2824 			 INSN_UID (branch), bb->index,
2825 			 INSN_UID (NEXT_INSN (insn)));
2826 	      spu_emit_branch_hint (NEXT_INSN (insn), branch, branch_target,
2827 				    branch_addr - next_addr, blocks);
2828 	    }
2829 	  branch = 0;
2830 	}
2831     }
2832   free (spu_bb_info);
2833 
2834   if (!sbitmap_empty_p (blocks))
2835     find_many_sub_basic_blocks (blocks);
2836 
2837   /* We have to schedule to make sure alignment is ok. */
2838   FOR_EACH_BB (bb) bb->flags &= ~BB_DISABLE_SCHEDULE;
2839 
2840   /* The hints need to be scheduled, so call it again. */
2841   schedule_insns ();
2842 
2843   insert_hbrp ();
2844 
2845   pad_bb ();
2846 
2847   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
2848     if (NONJUMP_INSN_P (insn) && INSN_CODE (insn) == CODE_FOR_hbr)
2849       {
2850 	/* Adjust the LABEL_REF in a hint when we have inserted a nop
2851 	   between its branch label and the branch.  We don't move the
2852 	   label because GCC expects it at the beginning of the block. */
2853 	rtx unspec = SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
2854 	rtx label_ref = XVECEXP (unspec, 0, 0);
2855 	rtx label = XEXP (label_ref, 0);
2856 	rtx branch;
2857 	int offset = 0;
2858 	for (branch = NEXT_INSN (label);
2859 	     !JUMP_P (branch) && !CALL_P (branch);
2860 	     branch = NEXT_INSN (branch))
2861 	  if (NONJUMP_INSN_P (branch))
2862 	    offset += get_attr_length (branch);
2863 	if (offset > 0)
2864 	  XVECEXP (unspec, 0, 0) = plus_constant (label_ref, offset);
2865       }
2866 
2867   if (spu_flag_var_tracking)
2868     {
2869       df_analyze ();
2870       timevar_push (TV_VAR_TRACKING);
2871       variable_tracking_main ();
2872       timevar_pop (TV_VAR_TRACKING);
2873       df_finish_pass (false);
2874     }
2875 
2876   free_bb_for_insn ();
2877 
2878   in_spu_reorg = 0;
2879 }
2880 
2881 
2882 /* Insn scheduling routines, primarily for dual issue. */
2883 static int
2884 spu_sched_issue_rate (void)
2885 {
2886   return 2;
2887 }
2888 
2889 static int
2890 uses_ls_unit(rtx insn)
2891 {
2892   rtx set = single_set (insn);
2893   if (set != 0
2894       && (GET_CODE (SET_DEST (set)) == MEM
2895 	  || GET_CODE (SET_SRC (set)) == MEM))
2896     return 1;
2897   return 0;
2898 }
2899 
2900 static int
2901 get_pipe (rtx insn)
2902 {
2903   enum attr_type t;
2904   /* Handle inline asm */
2905   if (INSN_CODE (insn) == -1)
2906     return -1;
2907   t = get_attr_type (insn);
2908   switch (t)
2909     {
2910     case TYPE_CONVERT:
2911       return -2;
2912     case TYPE_MULTI0:
2913       return -1;
2914 
2915     case TYPE_FX2:
2916     case TYPE_FX3:
2917     case TYPE_SPR:
2918     case TYPE_NOP:
2919     case TYPE_FXB:
2920     case TYPE_FPD:
2921     case TYPE_FP6:
2922     case TYPE_FP7:
2923       return 0;
2924 
2925     case TYPE_LNOP:
2926     case TYPE_SHUF:
2927     case TYPE_LOAD:
2928     case TYPE_STORE:
2929     case TYPE_BR:
2930     case TYPE_MULTI1:
2931     case TYPE_HBR:
2932     case TYPE_IPREFETCH:
2933       return 1;
2934     default:
2935       abort ();
2936     }
2937 }
2938 
2939 
2940 /* haifa-sched.c has a static variable that keeps track of the current
2941    cycle.  It is passed to spu_sched_reorder, and we record it here for
2942    use by spu_sched_variable_issue.  It won't be accurate if the
2943    scheduler updates its clock_var between the two calls. */
2944 static int clock_var;
2945 
2946 /* This is used to keep track of insn alignment.  Set to 0 at the
2947    beginning of each block and increased by the "length" attr of each
2948    insn scheduled. */
2949 static int spu_sched_length;
2950 
2951 /* Record when we've issued pipe0 and pipe1 insns so we can reorder the
2952    ready list appropriately in spu_sched_reorder(). */
2953 static int pipe0_clock;
2954 static int pipe1_clock;
2955 
2956 static int prev_clock_var;
2957 
2958 static int prev_priority;
2959 
2960 /* The SPU needs to load the next ilb sometime during the execution of
2961    the previous ilb.  There is a potential conflict if every cycle has a
2962    load or store.  To avoid the conflict we make sure the load/store
2963    unit is free for at least one cycle during the execution of insns in
2964    the previous ilb. */
2965 static int spu_ls_first;
2966 static int prev_ls_clock;
2967 
2968 static void
2969 spu_sched_init_global (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
2970 		       int max_ready ATTRIBUTE_UNUSED)
2971 {
2972   spu_sched_length = 0;
2973 }
2974 
2975 static void
2976 spu_sched_init (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
2977 		int max_ready ATTRIBUTE_UNUSED)
2978 {
2979   if (align_labels > 4 || align_loops > 4 || align_jumps > 4)
2980     {
2981       /* When any block might be at least 8-byte aligned, assume they
2982          will all be at least 8-byte aligned to make sure dual issue
2983          works out correctly. */
2984       spu_sched_length = 0;
2985     }
2986   spu_ls_first = INT_MAX;
2987   clock_var = -1;
2988   prev_ls_clock = -1;
2989   pipe0_clock = -1;
2990   pipe1_clock = -1;
2991   prev_clock_var = -1;
2992   prev_priority = -1;
2993 }
2994 
2995 static int
2996 spu_sched_variable_issue (FILE *file ATTRIBUTE_UNUSED,
2997 			  int verbose ATTRIBUTE_UNUSED, rtx insn, int more)
2998 {
2999   int len;
3000   int p;
3001   if (GET_CODE (PATTERN (insn)) == USE
3002       || GET_CODE (PATTERN (insn)) == CLOBBER
3003       || (len = get_attr_length (insn)) == 0)
3004     return more;
3005 
3006   spu_sched_length += len;
3007 
3008   /* Reset on inline asm */
3009   if (INSN_CODE (insn) == -1)
3010     {
3011       spu_ls_first = INT_MAX;
3012       pipe0_clock = -1;
3013       pipe1_clock = -1;
3014       return 0;
3015     }
3016   p = get_pipe (insn);
3017   if (p == 0)
3018     pipe0_clock = clock_var;
3019   else
3020     pipe1_clock = clock_var;
3021 
3022   if (in_spu_reorg)
3023     {
3024       if (clock_var - prev_ls_clock > 1
3025 	  || INSN_CODE (insn) == CODE_FOR_iprefetch)
3026 	spu_ls_first = INT_MAX;
3027       if (uses_ls_unit (insn))
3028 	{
3029 	  if (spu_ls_first == INT_MAX)
3030 	    spu_ls_first = spu_sched_length;
3031 	  prev_ls_clock = clock_var;
3032 	}
3033 
3034       /* The scheduler hasn't inserted the nop, but we will later on.
3035          Include those nops in spu_sched_length. */
3036       if (prev_clock_var == clock_var && (spu_sched_length & 7))
3037 	spu_sched_length += 4;
3038       prev_clock_var = clock_var;
3039 
3040       /* more is -1 when called from spu_sched_reorder for new insns
3041          that don't have INSN_PRIORITY */
3042       if (more >= 0)
3043 	prev_priority = INSN_PRIORITY (insn);
3044     }
3045 
3046   /* Always try issuing more insns.  spu_sched_reorder will decide
3047      when the cycle should be advanced. */
3048   return 1;
3049 }
3050 
3051 /* This function is called for both TARGET_SCHED_REORDER and
3052    TARGET_SCHED_REORDER2.  */
3053 static int
3054 spu_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
3055 		   rtx *ready, int *nreadyp, int clock)
3056 {
3057   int i, nready = *nreadyp;
3058   int pipe_0, pipe_1, pipe_hbrp, pipe_ls, schedule_i;
3059   rtx insn;
3060 
3061   clock_var = clock;
3062 
3063   if (nready <= 0 || pipe1_clock >= clock)
3064     return 0;
3065 
3066   /* Find any rtl insns that don't generate assembly insns and schedule
3067      them first. */
3068   for (i = nready - 1; i >= 0; i--)
3069     {
3070       insn = ready[i];
3071       if (INSN_CODE (insn) == -1
3072 	  || INSN_CODE (insn) == CODE_FOR_blockage
3073 	  || (INSN_P (insn) && get_attr_length (insn) == 0))
3074 	{
3075 	  ready[i] = ready[nready - 1];
3076 	  ready[nready - 1] = insn;
3077 	  return 1;
3078 	}
3079     }
3080 
3081   pipe_0 = pipe_1 = pipe_hbrp = pipe_ls = schedule_i = -1;
3082   for (i = 0; i < nready; i++)
3083     if (INSN_CODE (ready[i]) != -1)
3084       {
3085 	insn = ready[i];
3086 	switch (get_attr_type (insn))
3087 	  {
3088 	  default:
3089 	  case TYPE_MULTI0:
3090 	  case TYPE_CONVERT:
3091 	  case TYPE_FX2:
3092 	  case TYPE_FX3:
3093 	  case TYPE_SPR:
3094 	  case TYPE_NOP:
3095 	  case TYPE_FXB:
3096 	  case TYPE_FPD:
3097 	  case TYPE_FP6:
3098 	  case TYPE_FP7:
3099 	    pipe_0 = i;
3100 	    break;
3101 	  case TYPE_LOAD:
3102 	  case TYPE_STORE:
3103 	    pipe_ls = i;
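     	    /* Fall through: loads and stores also issue on pipe 1.  */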
3104 	  case TYPE_LNOP:
3105 	  case TYPE_SHUF:
3106 	  case TYPE_BR:
3107 	  case TYPE_MULTI1:
3108 	  case TYPE_HBR:
3109 	    pipe_1 = i;
3110 	    break;
3111 	  case TYPE_IPREFETCH:
3112 	    pipe_hbrp = i;
3113 	    break;
3114 	  }
3115       }
3116 
3117   /* In the first scheduling phase, schedule loads and stores together
3118      to increase the chance they will get merged during postreload CSE. */
3119   if (!reload_completed && pipe_ls >= 0)
3120     {
3121       insn = ready[pipe_ls];
3122       ready[pipe_ls] = ready[nready - 1];
3123       ready[nready - 1] = insn;
3124       return 1;
3125     }
3126 
3127   /* If there is an hbrp ready, prefer it over other pipe 1 insns. */
3128   if (pipe_hbrp >= 0)
3129     pipe_1 = pipe_hbrp;
3130 
3131   /* When we have loads/stores in every cycle of the last 15 insns and
3132      we are about to schedule another load/store, emit an hbrp insn
3133      instead. */
3134   if (in_spu_reorg
3135       && spu_sched_length - spu_ls_first >= 4 * 15
3136       && !(pipe0_clock < clock && pipe_0 >= 0) && pipe_1 == pipe_ls)
3137     {
3138       insn = sched_emit_insn (gen_iprefetch (GEN_INT (3)));
3139       recog_memoized (insn);
3140       if (pipe0_clock < clock)
3141 	PUT_MODE (insn, TImode);
3142       spu_sched_variable_issue (file, verbose, insn, -1);
3143       return 0;
3144     }
3145 
3146   /* In general, we want to emit nops to increase dual issue, but dual
3147      issue isn't faster when one of the insns could be scheduled later
3148      without affecting the critical path.  We look at INSN_PRIORITY to
3149      make a good guess, but it isn't perfect so -mdual-nops=n can be
3150      used to affect it. */
3151   if (in_spu_reorg && spu_dual_nops < 10)
3152     {
3153       /* When we are at an even address and we are not issuing nops to
3154          improve scheduling then we need to advance the cycle.  */
3155       if ((spu_sched_length & 7) == 0 && prev_clock_var == clock
3156 	  && (spu_dual_nops == 0
3157 	      || (pipe_1 != -1
3158 		  && prev_priority >
3159 		  INSN_PRIORITY (ready[pipe_1]) + spu_dual_nops)))
3160 	return 0;
3161 
3162       /* When at an odd address, schedule the highest priority insn
3163          without considering pipeline. */
3164       if ((spu_sched_length & 7) == 4 && prev_clock_var != clock
3165 	  && (spu_dual_nops == 0
3166 	      || (prev_priority >
3167 		  INSN_PRIORITY (ready[nready - 1]) + spu_dual_nops)))
3168 	return 1;
3169     }
3170 
3171 
3172   /* We haven't issued a pipe0 insn yet this cycle; if there is a
3173      pipe0 insn in the ready list, schedule it. */
3174   if (pipe0_clock < clock && pipe_0 >= 0)
3175     schedule_i = pipe_0;
3176 
3177   /* Either we've scheduled a pipe0 insn already or there is no pipe0
3178      insn to schedule.  Put a pipe1 insn at the front of the ready list. */
3179   else
3180     schedule_i = pipe_1;
3181 
3182   if (schedule_i > -1)
3183     {
3184       insn = ready[schedule_i];
3185       ready[schedule_i] = ready[nready - 1];
3186       ready[nready - 1] = insn;
3187       return 1;
3188     }
3189   return 0;
3190 }
3191 
3192 /* INSN is dependent on DEP_INSN. */
3193 static int
3194 spu_sched_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
3195 {
3196   rtx set;
3197 
3198   /* The blockage pattern is used to prevent instructions from being
3199      moved across it and has no cost. */
3200   if (INSN_CODE (insn) == CODE_FOR_blockage
3201       || INSN_CODE (dep_insn) == CODE_FOR_blockage)
3202     return 0;
3203 
3204   if ((INSN_P (insn) && get_attr_length (insn) == 0)
3205       || (INSN_P (dep_insn) && get_attr_length (dep_insn) == 0))
3206     return 0;
3207 
3208   /* Make sure hbrps are spread out. */
3209   if (INSN_CODE (insn) == CODE_FOR_iprefetch
3210       && INSN_CODE (dep_insn) == CODE_FOR_iprefetch)
3211     return 8;
3212 
3213   /* Make sure hints and hbrps are 2 cycles apart. */
3214   if ((INSN_CODE (insn) == CODE_FOR_iprefetch
3215        || INSN_CODE (insn) == CODE_FOR_hbr)
3216        && (INSN_CODE (dep_insn) == CODE_FOR_iprefetch
3217 	   || INSN_CODE (dep_insn) == CODE_FOR_hbr))
3218     return 2;
3219 
3220   /* An hbrp has no real dependency on other insns. */
3221   if (INSN_CODE (insn) == CODE_FOR_iprefetch
3222       || INSN_CODE (dep_insn) == CODE_FOR_iprefetch)
3223     return 0;
3224 
3225   /* Assuming that it is unlikely an argument register will be used in
3226      the first cycle of the called function, we reduce the cost for
3227      slightly better scheduling of dep_insn.  When not hinted, the
3228      mispredicted branch would hide the cost as well.  */
3229   if (CALL_P (insn))
3230   {
3231     rtx target = get_branch_target (insn);
3232     if (GET_CODE (target) != REG || !set_of (target, insn))
3233       return cost - 2;
3234     return cost;
3235   }
3236 
3237   /* And when returning from a function, let's assume the return values
3238      are completed sooner too. */
3239   if (CALL_P (dep_insn))
3240     return cost - 2;
3241 
3242   /* Make sure an instruction that loads from the back chain is scheduled
3243      away from the return instruction so a hint is more likely to get
3244      issued. */
3245   if (INSN_CODE (insn) == CODE_FOR__return
3246       && (set = single_set (dep_insn))
3247       && GET_CODE (SET_DEST (set)) == REG
3248       && REGNO (SET_DEST (set)) == LINK_REGISTER_REGNUM)
3249     return 20;
3250 
3251   /* The dfa scheduler sets cost to 0 for all anti-dependencies and the
3252      scheduler makes every insn in a block anti-dependent on the final
3253      jump_insn.  We adjust here so higher cost insns will get scheduled
3254      earlier. */
3255   if (JUMP_P (insn) && REG_NOTE_KIND (link) == REG_DEP_ANTI)
3256     return insn_cost (dep_insn) - 3;
3257 
3258   return cost;
3259 }
3260 
3261 /* Create a CONST_DOUBLE from a string.  */
3262 struct rtx_def *
3263 spu_float_const (const char *string, enum machine_mode mode)
3264 {
3265   REAL_VALUE_TYPE value;
3266   value = REAL_VALUE_ATOF (string, mode);
3267   return CONST_DOUBLE_FROM_REAL_VALUE (value, mode);
3268 }
3269 
3270 int
3271 spu_constant_address_p (rtx x)
3272 {
3273   return (GET_CODE (x) == LABEL_REF || GET_CODE (x) == SYMBOL_REF
3274 	  || GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST
3275 	  || GET_CODE (x) == HIGH);
3276 }
3277 
3278 static enum spu_immediate
3279 which_immediate_load (HOST_WIDE_INT val)
3280 {
3281   gcc_assert (val == trunc_int_for_mode (val, SImode));
3282 
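       /* il takes a signed 16-bit immediate, ila an unsigned 18-bit one,
          ilh replicates a 16-bit immediate into both halfwords of each
          word, and ilhu loads the upper halfword and zeroes the lower.  */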
3283   if (val >= -0x8000 && val <= 0x7fff)
3284     return SPU_IL;
3285   if (val >= 0 && val <= 0x3ffff)
3286     return SPU_ILA;
3287   if ((val & 0xffff) == ((val >> 16) & 0xffff))
3288     return SPU_ILH;
3289   if ((val & 0xffff) == 0)
3290     return SPU_ILHU;
3291 
3292   return SPU_NONE;
3293 }
3294 
3295 /* Return true when OP can be loaded by one of the il instructions, or,
3296    before epilogue_completed, when OP can be loaded using ilhu and iohl. */
3297 int
3298 immediate_load_p (rtx op, enum machine_mode mode)
3299 {
3300   if (CONSTANT_P (op))
3301     {
3302       enum immediate_class c = classify_immediate (op, mode);
3303       return c == IC_IL1 || c == IC_IL1s
3304 	     || (!epilogue_completed && (c == IC_IL2 || c == IC_IL2s));
3305     }
3306   return 0;
3307 }
3308 
3309 /* Return true if the first SIZE bytes of arr is a constant that can be
3310 /* Return true if the first SIZE bytes of ARR form a constant that can be
3311    represent the size and offset of the instruction to use. */
3312 static int
3313 cpat_info(unsigned char *arr, int size, int *prun, int *pstart)
3314 {
3315   int cpat, run, i, start;
3316   cpat = 1;
3317   run = 0;
3318   start = -1;
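       /* The patterns generated by cbd/chd/cwd/cdd are the identity
          shuffle (byte I is 0x10 + I) except for one naturally aligned
          run of 1, 2, 4 or 8 bytes holding the insertion indices
          (e.g. 0x00..0x03 for cwd).  Look for exactly one such run.  */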
3319   for (i = 0; i < size && cpat; i++)
3320     if (arr[i] != i+16)
3321       {
3322 	if (!run)
3323 	  {
3324 	    start = i;
3325 	    if (arr[i] == 3)
3326 	      run = 1;
3327 	    else if (arr[i] == 2 && arr[i+1] == 3)
3328 	      run = 2;
3329 	    else if (arr[i] == 0)
3330 	      {
3331 		while (arr[i+run] == run && i+run < 16)
3332 		  run++;
3333 		if (run != 4 && run != 8)
3334 		  cpat = 0;
3335 	      }
3336 	    else
3337 	      cpat = 0;
3338 	    if ((i & (run-1)) != 0)
3339 	      cpat = 0;
3340 	    i += run;
3341 	  }
3342 	else
3343 	  cpat = 0;
3344       }
3345   if (cpat && (run || size < 16))
3346     {
3347       if (run == 0)
3348 	run = 1;
3349       if (prun)
3350 	*prun = run;
3351       if (pstart)
3352 	*pstart = start == -1 ? 16-run : start;
3353       return 1;
3354     }
3355   return 0;
3356 }
3357 
3358 /* OP is a CONSTANT_P.  Determine what instructions can be used to load
3359    it into a register.  MODE is only valid when OP is a CONST_INT. */
3360 static enum immediate_class
3361 classify_immediate (rtx op, enum machine_mode mode)
3362 {
3363   HOST_WIDE_INT val;
3364   unsigned char arr[16];
3365   int i, j, repeated, fsmbi, repeat;
3366 
3367   gcc_assert (CONSTANT_P (op));
3368 
3369   if (GET_MODE (op) != VOIDmode)
3370     mode = GET_MODE (op);
3371 
3372   /* A V4SI const_vector with all identical symbols is ok. */
3373   if (!flag_pic
3374       && mode == V4SImode
3375       && GET_CODE (op) == CONST_VECTOR
3376       && GET_CODE (CONST_VECTOR_ELT (op, 0)) != CONST_INT
3377       && GET_CODE (CONST_VECTOR_ELT (op, 0)) != CONST_DOUBLE
3378       && CONST_VECTOR_ELT (op, 0) == CONST_VECTOR_ELT (op, 1)
3379       && CONST_VECTOR_ELT (op, 1) == CONST_VECTOR_ELT (op, 2)
3380       && CONST_VECTOR_ELT (op, 2) == CONST_VECTOR_ELT (op, 3))
3381     op = CONST_VECTOR_ELT (op, 0);
3382 
3383   switch (GET_CODE (op))
3384     {
3385     case SYMBOL_REF:
3386     case LABEL_REF:
3387       return TARGET_LARGE_MEM ? IC_IL2s : IC_IL1s;
3388 
3389     case CONST:
3390       /* We can never know if the resulting address fits in 18 bits and can be
3391 	 loaded with ila.  For now, assume the address will not overflow if
3392 	 the displacement is "small" (fits 'K' constraint).  */
3393       if (!TARGET_LARGE_MEM && GET_CODE (XEXP (op, 0)) == PLUS)
3394 	{
3395 	  rtx sym = XEXP (XEXP (op, 0), 0);
3396 	  rtx cst = XEXP (XEXP (op, 0), 1);
3397 
3398 	  if (GET_CODE (sym) == SYMBOL_REF
3399 	      && GET_CODE (cst) == CONST_INT
3400 	      && satisfies_constraint_K (cst))
3401 	    return IC_IL1s;
3402 	}
3403       return IC_IL2s;
3404 
3405     case HIGH:
3406       return IC_IL1s;
3407 
3408     case CONST_VECTOR:
3409       for (i = 0; i < GET_MODE_NUNITS (mode); i++)
3410 	if (GET_CODE (CONST_VECTOR_ELT (op, i)) != CONST_INT
3411 	    && GET_CODE (CONST_VECTOR_ELT (op, i)) != CONST_DOUBLE)
3412 	  return IC_POOL;
3413       /* Fall through. */
3414 
3415     case CONST_INT:
3416     case CONST_DOUBLE:
3417       constant_to_array (mode, op, arr);
3418 
3419       /* Check that each 4-byte slot is identical. */
3420       repeated = 1;
3421       for (i = 4; i < 16; i += 4)
3422 	for (j = 0; j < 4; j++)
3423 	  if (arr[j] != arr[i + j])
3424 	    repeated = 0;
3425 
3426       if (repeated)
3427 	{
3428 	  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
3429 	  val = trunc_int_for_mode (val, SImode);
3430 
3431 	  if (which_immediate_load (val) != SPU_NONE)
3432 	    return IC_IL1;
3433 	}
3434 
3435       /* Any mode of 2 bytes or smaller can be loaded with an il
3436          instruction. */
3437       gcc_assert (GET_MODE_SIZE (mode) > 2);
3438 
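           /* fsmbi can only produce bytes of 0x00 or 0xff, so IC_FSMBI
              requires every nonzero byte to be 0xff.  When the nonzero
              bytes all share some other single value, IC_FSMBI2 loads the
              constant with fsmbi followed by an and (see
              spu_split_immediate).  */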
3439       fsmbi = 1;
3440       repeat = 0;
3441       for (i = 0; i < 16 && fsmbi; i++)
3442 	if (arr[i] != 0 && repeat == 0)
3443 	  repeat = arr[i];
3444 	else if (arr[i] != 0 && arr[i] != repeat)
3445 	  fsmbi = 0;
3446       if (fsmbi)
3447 	return repeat == 0xff ? IC_FSMBI : IC_FSMBI2;
3448 
3449       if (cpat_info (arr, GET_MODE_SIZE (mode), 0, 0))
3450 	return IC_CPAT;
3451 
3452       if (repeated)
3453 	return IC_IL2;
3454 
3455       return IC_POOL;
3456     default:
3457       break;
3458     }
3459   gcc_unreachable ();
3460 }
3461 
3462 static enum spu_immediate
3463 which_logical_immediate (HOST_WIDE_INT val)
3464 {
3465   gcc_assert (val == trunc_int_for_mode (val, SImode));
3466 
3467   if (val >= -0x200 && val <= 0x1ff)
3468     return SPU_ORI;
3469   if (val >= 0 && val <= 0xffff)
3470     return SPU_IOHL;
3471   if ((val & 0xffff) == ((val >> 16) & 0xffff))
3472     {
3473       val = trunc_int_for_mode (val, HImode);
3474       if (val >= -0x200 && val <= 0x1ff)
3475 	return SPU_ORHI;
3476       if ((val & 0xff) == ((val >> 8) & 0xff))
3477 	{
3478 	  val = trunc_int_for_mode (val, QImode);
3479 	  if (val >= -0x200 && val <= 0x1ff)
3480 	    return SPU_ORBI;
3481 	}
3482     }
3483   return SPU_NONE;
3484 }
3485 
3486 /* Return TRUE when X, a CONST_VECTOR, only contains CONST_INTs or
3487    CONST_DOUBLEs. */
3488 static int
3489 const_vector_immediate_p (rtx x)
3490 {
3491   int i;
3492   gcc_assert (GET_CODE (x) == CONST_VECTOR);
3493   for (i = 0; i < GET_MODE_NUNITS (GET_MODE (x)); i++)
3494     if (GET_CODE (CONST_VECTOR_ELT (x, i)) != CONST_INT
3495 	&& GET_CODE (CONST_VECTOR_ELT (x, i)) != CONST_DOUBLE)
3496       return 0;
3497   return 1;
3498 }
3499 
3500 int
3501 logical_immediate_p (rtx op, enum machine_mode mode)
3502 {
3503   HOST_WIDE_INT val;
3504   unsigned char arr[16];
3505   int i, j;
3506 
3507   gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3508 	      || GET_CODE (op) == CONST_VECTOR);
3509 
3510   if (GET_CODE (op) == CONST_VECTOR
3511       && !const_vector_immediate_p (op))
3512     return 0;
3513 
3514   if (GET_MODE (op) != VOIDmode)
3515     mode = GET_MODE (op);
3516 
3517   constant_to_array (mode, op, arr);
3518 
3519   /* Check that bytes are repeated. */
3520   for (i = 4; i < 16; i += 4)
3521     for (j = 0; j < 4; j++)
3522       if (arr[j] != arr[i + j])
3523 	return 0;
3524 
3525   val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
3526   val = trunc_int_for_mode (val, SImode);
3527 
3528   i = which_logical_immediate (val);
3529   return i != SPU_NONE && i != SPU_IOHL;
3530 }
3531 
3532 int
3533 iohl_immediate_p (rtx op, enum machine_mode mode)
3534 {
3535   HOST_WIDE_INT val;
3536   unsigned char arr[16];
3537   int i, j;
3538 
3539   gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3540 	      || GET_CODE (op) == CONST_VECTOR);
3541 
3542   if (GET_CODE (op) == CONST_VECTOR
3543       && !const_vector_immediate_p (op))
3544     return 0;
3545 
3546   if (GET_MODE (op) != VOIDmode)
3547     mode = GET_MODE (op);
3548 
3549   constant_to_array (mode, op, arr);
3550 
3551   /* Check that bytes are repeated. */
3552   for (i = 4; i < 16; i += 4)
3553     for (j = 0; j < 4; j++)
3554       if (arr[j] != arr[i + j])
3555 	return 0;
3556 
3557   val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
3558   val = trunc_int_for_mode (val, SImode);
3559 
3560   return val >= 0 && val <= 0xffff;
3561 }
3562 
3563 int
3564 arith_immediate_p (rtx op, enum machine_mode mode,
3565 		   HOST_WIDE_INT low, HOST_WIDE_INT high)
3566 {
3567   HOST_WIDE_INT val;
3568   unsigned char arr[16];
3569   int bytes, i, j;
3570 
3571   gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3572 	      || GET_CODE (op) == CONST_VECTOR);
3573 
3574   if (GET_CODE (op) == CONST_VECTOR
3575       && !const_vector_immediate_p (op))
3576     return 0;
3577 
3578   if (GET_MODE (op) != VOIDmode)
3579     mode = GET_MODE (op);
3580 
3581   constant_to_array (mode, op, arr);
3582 
3583   if (VECTOR_MODE_P (mode))
3584     mode = GET_MODE_INNER (mode);
3585 
3586   bytes = GET_MODE_SIZE (mode);
3587   mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
3588 
3589   /* Check that bytes are repeated. */
3590   for (i = bytes; i < 16; i += bytes)
3591     for (j = 0; j < bytes; j++)
3592       if (arr[j] != arr[i + j])
3593 	return 0;
3594 
3595   val = arr[0];
3596   for (j = 1; j < bytes; j++)
3597     val = (val << 8) | arr[j];
3598 
3599   val = trunc_int_for_mode (val, mode);
3600 
3601   return val >= low && val <= high;
3602 }
3603 
3604 /* Return TRUE when OP is an immediate and an exact power of 2, i.e.
3605    OP == 2^SCALE with SCALE >= LOW && SCALE <= HIGH.  When OP is a
3606    vector, all entries must be the same. */
3607 bool
3608 exp2_immediate_p (rtx op, enum machine_mode mode, int low, int high)
3609 {
3610   enum machine_mode int_mode;
3611   HOST_WIDE_INT val;
3612   unsigned char arr[16];
3613   int bytes, i, j;
3614 
3615   gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3616 	      || GET_CODE (op) == CONST_VECTOR);
3617 
3618   if (GET_CODE (op) == CONST_VECTOR
3619       && !const_vector_immediate_p (op))
3620     return 0;
3621 
3622   if (GET_MODE (op) != VOIDmode)
3623     mode = GET_MODE (op);
3624 
3625   constant_to_array (mode, op, arr);
3626 
3627   if (VECTOR_MODE_P (mode))
3628     mode = GET_MODE_INNER (mode);
3629 
3630   bytes = GET_MODE_SIZE (mode);
3631   int_mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
3632 
3633   /* Check that bytes are repeated. */
3634   for (i = bytes; i < 16; i += bytes)
3635     for (j = 0; j < bytes; j++)
3636       if (arr[j] != arr[i + j])
3637 	return 0;
3638 
3639   val = arr[0];
3640   for (j = 1; j < bytes; j++)
3641     val = (val << 8) | arr[j];
3642 
3643   val = trunc_int_for_mode (val, int_mode);
3644 
3645   /* Currently, we only handle SFmode.  */
3646   gcc_assert (mode == SFmode);
3647   if (mode == SFmode)
3648     {
3649       int exp = (val >> 23) - 127;
3650       return val > 0 && (val & 0x007fffff) == 0
3651 	     &&  exp >= low && exp <= high;
3652     }
3653   return FALSE;
3654 }
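
/* Illustrative, standalone sketch (not part of this back end) of the same
   IEEE-754 single-precision test exp2_immediate_p applies to the raw bit
   pattern, written against a host 'float'.  The example_* name is
   hypothetical; compile separately.  */
#if 0
#include <stdio.h>
#include <string.h>

/* Return 1 when F is exactly 2^scale with LOW <= scale <= HIGH.  A
   positive float is a power of two iff its 23 fraction bits are zero;
   the unbiased exponent is then (bits >> 23) - 127.  */
static int
example_exp2_p (float f, int low, int high)
{
  unsigned int bits;
  int exp;
  memcpy (&bits, &f, sizeof bits);
  if (bits == 0 || (bits & 0x80000000u) || (bits & 0x007fffffu))
    return 0;
  exp = (int) (bits >> 23) - 127;
  return exp >= low && exp <= high;
}

int
main (void)
{
  printf ("%d %d %d\n",
	  example_exp2_p (8.0f, 0, 127),    /* 1: 8 == 2^3 */
	  example_exp2_p (0.25f, -3, 3),    /* 1: 0.25 == 2^-2 */
	  example_exp2_p (6.0f, 0, 127));   /* 0: not a power of two */
  return 0;
}
#endif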
3655 
3656 /* Return true if X is a SYMBOL_REF to an __ea qualified variable.  */
3657 
3658 static int
3659 ea_symbol_ref (rtx *px, void *data ATTRIBUTE_UNUSED)
3660 {
3661   rtx x = *px;
3662   tree decl;
3663 
3664   if (GET_CODE (x) == CONST && GET_CODE (XEXP (x, 0)) == PLUS)
3665     {
3666       rtx plus = XEXP (x, 0);
3667       rtx op0 = XEXP (plus, 0);
3668       rtx op1 = XEXP (plus, 1);
3669       if (GET_CODE (op1) == CONST_INT)
3670 	x = op0;
3671     }
3672 
3673   return (GET_CODE (x) == SYMBOL_REF
3674  	  && (decl = SYMBOL_REF_DECL (x)) != 0
3675  	  && TREE_CODE (decl) == VAR_DECL
3676  	  && TYPE_ADDR_SPACE (TREE_TYPE (decl)));
3677 }
3678 
3679 /* We accept:
3680    - any 32-bit constant (SImode, SFmode)
3681    - any constant that can be generated with fsmbi (any mode)
3682    - a 64-bit constant where the high and low bits are identical
3683      (DImode, DFmode)
3684    - a 128-bit constant where the four 32-bit words match.  */
3685 int
3686 spu_legitimate_constant_p (rtx x)
3687 {
3688   if (GET_CODE (x) == HIGH)
3689     x = XEXP (x, 0);
3690 
3691   /* Reject any __ea qualified reference.  These can't appear in
3692      instructions but must be forced to the constant pool.  */
3693   if (for_each_rtx (&x, ea_symbol_ref, 0))
3694     return 0;
3695 
3696   /* V4SI with all identical symbols is valid. */
3697   if (!flag_pic
3698       && GET_MODE (x) == V4SImode
3699       && (GET_CODE (CONST_VECTOR_ELT (x, 0)) == SYMBOL_REF
3700 	  || GET_CODE (CONST_VECTOR_ELT (x, 0)) == LABEL_REF
3701 	  || GET_CODE (CONST_VECTOR_ELT (x, 0)) == CONST))
3702     return CONST_VECTOR_ELT (x, 0) == CONST_VECTOR_ELT (x, 1)
3703 	   && CONST_VECTOR_ELT (x, 1) == CONST_VECTOR_ELT (x, 2)
3704 	   && CONST_VECTOR_ELT (x, 2) == CONST_VECTOR_ELT (x, 3);
3705 
3706   if (GET_CODE (x) == CONST_VECTOR
3707       && !const_vector_immediate_p (x))
3708     return 0;
3709   return 1;
3710 }
3711 
3712 /* Valid addresses are:
3713    - symbol_ref, label_ref, const
3714    - reg
3715    - reg + const_int, where const_int is 16 byte aligned
3716    - reg + reg, alignment doesn't matter
3717   The alignment matters in the reg+const case because lqd and stqd
3718   ignore the 4 least significant bits of the const.  We only care about
3719   16 byte modes because the expand phase will change all smaller MEM
3720   references to TImode.  */
3721 static bool
3722 spu_legitimate_address_p (enum machine_mode mode,
3723 			  rtx x, bool reg_ok_strict)
3724 {
3725   int aligned = GET_MODE_SIZE (mode) >= 16;
3726   if (aligned
3727       && GET_CODE (x) == AND
3728       && GET_CODE (XEXP (x, 1)) == CONST_INT
3729       && INTVAL (XEXP (x, 1)) == (HOST_WIDE_INT) - 16)
3730     x = XEXP (x, 0);
3731   switch (GET_CODE (x))
3732     {
3733     case LABEL_REF:
3734       return !TARGET_LARGE_MEM;
3735 
3736     case SYMBOL_REF:
3737     case CONST:
3738       /* Keep __ea references until reload so that spu_expand_mov can see them
3739 	 in MEMs.  */
3740       if (ea_symbol_ref (&x, 0))
3741 	return !reload_in_progress && !reload_completed;
3742       return !TARGET_LARGE_MEM;
3743 
3744     case CONST_INT:
3745       return INTVAL (x) >= 0 && INTVAL (x) <= 0x3ffff;
3746 
3747     case SUBREG:
3748       x = XEXP (x, 0);
3749       if (!REG_P (x))
3750 	return 0;
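      /* Fall through. */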
3751 
3752     case REG:
3753       return INT_REG_OK_FOR_BASE_P (x, reg_ok_strict);
3754 
3755     case PLUS:
3756     case LO_SUM:
3757       {
3758 	rtx op0 = XEXP (x, 0);
3759 	rtx op1 = XEXP (x, 1);
3760 	if (GET_CODE (op0) == SUBREG)
3761 	  op0 = XEXP (op0, 0);
3762 	if (GET_CODE (op1) == SUBREG)
3763 	  op1 = XEXP (op1, 0);
3764 	if (GET_CODE (op0) == REG
3765 	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
3766 	    && GET_CODE (op1) == CONST_INT
3767 	    && INTVAL (op1) >= -0x2000
3768 	    && INTVAL (op1) <= 0x1fff
3769 	    && (!aligned || (INTVAL (op1) & 15) == 0))
3770 	  return TRUE;
3771 	if (GET_CODE (op0) == REG
3772 	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
3773 	    && GET_CODE (op1) == REG
3774 	    && INT_REG_OK_FOR_INDEX_P (op1, reg_ok_strict))
3775 	  return TRUE;
3776       }
3777       break;
3778 
3779     default:
3780       break;
3781     }
3782   return FALSE;
3783 }
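
/* Illustrative, standalone sketch (not part of this back end) of the
   reg + const_int rule spelled out in the comment above: the displacement
   must fit the signed 14-bit lqd/stqd field and, for 16-byte accesses,
   keep its low 4 bits clear because the hardware ignores them.  The
   example_* name is hypothetical.  */
#if 0
static int
example_dform_offset_ok (long offset, int access_size)
{
  if (offset < -0x2000 || offset > 0x1fff)
    return 0;				/* outside the 14-bit field */
  if (access_size >= 16 && (offset & 15) != 0)
    return 0;				/* lqd/stqd drop the low 4 bits */
  return 1;
}
#endif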
3784 
3785 /* Like spu_legitimate_address_p, except with named addresses.  */
3786 static bool
3787 spu_addr_space_legitimate_address_p (enum machine_mode mode, rtx x,
3788 				     bool reg_ok_strict, addr_space_t as)
3789 {
3790   if (as == ADDR_SPACE_EA)
3791     return (REG_P (x) && (GET_MODE (x) == EAmode));
3792 
3793   else if (as != ADDR_SPACE_GENERIC)
3794     gcc_unreachable ();
3795 
3796   return spu_legitimate_address_p (mode, x, reg_ok_strict);
3797 }
3798 
3799 /* When the address is reg + const_int, force the const_int into a
3800    register.  */
3801 rtx
3802 spu_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
3803 			enum machine_mode mode ATTRIBUTE_UNUSED)
3804 {
3805   rtx op0, op1;
3806   /* Make sure both operands are registers.  */
3807   if (GET_CODE (x) == PLUS)
3808     {
3809       op0 = XEXP (x, 0);
3810       op1 = XEXP (x, 1);
3811       if (ALIGNED_SYMBOL_REF_P (op0))
3812 	{
3813 	  op0 = force_reg (Pmode, op0);
3814 	  mark_reg_pointer (op0, 128);
3815 	}
3816       else if (GET_CODE (op0) != REG)
3817 	op0 = force_reg (Pmode, op0);
3818       if (ALIGNED_SYMBOL_REF_P (op1))
3819 	{
3820 	  op1 = force_reg (Pmode, op1);
3821 	  mark_reg_pointer (op1, 128);
3822 	}
3823       else if (GET_CODE (op1) != REG)
3824 	op1 = force_reg (Pmode, op1);
3825       x = gen_rtx_PLUS (Pmode, op0, op1);
3826     }
3827   return x;
3828 }
3829 
3830 /* Like spu_legitimate_address, except with named address support.  */
3831 static rtx
3832 spu_addr_space_legitimize_address (rtx x, rtx oldx, enum machine_mode mode,
3833 				   addr_space_t as)
3834 {
3835   if (as != ADDR_SPACE_GENERIC)
3836     return x;
3837 
3838   return spu_legitimize_address (x, oldx, mode);
3839 }
3840 
3841 /* Handle an attribute requiring a FUNCTION_DECL; arguments as in
3842    struct attribute_spec.handler.  */
3843 static tree
3844 spu_handle_fndecl_attribute (tree * node,
3845 			     tree name,
3846 			     tree args ATTRIBUTE_UNUSED,
3847 			     int flags ATTRIBUTE_UNUSED, bool * no_add_attrs)
3848 {
3849   if (TREE_CODE (*node) != FUNCTION_DECL)
3850     {
3851       warning (0, "%qE attribute only applies to functions",
3852 	       name);
3853       *no_add_attrs = true;
3854     }
3855 
3856   return NULL_TREE;
3857 }
3858 
3859 /* Handle the "vector" attribute.  */
3860 static tree
3861 spu_handle_vector_attribute (tree * node, tree name,
3862 			     tree args ATTRIBUTE_UNUSED,
3863 			     int flags ATTRIBUTE_UNUSED, bool * no_add_attrs)
3864 {
3865   tree type = *node, result = NULL_TREE;
3866   enum machine_mode mode;
3867   int unsigned_p;
3868 
3869   while (POINTER_TYPE_P (type)
3870 	 || TREE_CODE (type) == FUNCTION_TYPE
3871 	 || TREE_CODE (type) == METHOD_TYPE || TREE_CODE (type) == ARRAY_TYPE)
3872     type = TREE_TYPE (type);
3873 
3874   mode = TYPE_MODE (type);
3875 
3876   unsigned_p = TYPE_UNSIGNED (type);
3877   switch (mode)
3878     {
3879     case DImode:
3880       result = (unsigned_p ? unsigned_V2DI_type_node : V2DI_type_node);
3881       break;
3882     case SImode:
3883       result = (unsigned_p ? unsigned_V4SI_type_node : V4SI_type_node);
3884       break;
3885     case HImode:
3886       result = (unsigned_p ? unsigned_V8HI_type_node : V8HI_type_node);
3887       break;
3888     case QImode:
3889       result = (unsigned_p ? unsigned_V16QI_type_node : V16QI_type_node);
3890       break;
3891     case SFmode:
3892       result = V4SF_type_node;
3893       break;
3894     case DFmode:
3895       result = V2DF_type_node;
3896       break;
3897     default:
3898       break;
3899     }
3900 
3901   /* Propagate qualifiers attached to the element type
3902      onto the vector type.  */
3903   if (result && result != type && TYPE_QUALS (type))
3904     result = build_qualified_type (result, TYPE_QUALS (type));
3905 
3906   *no_add_attrs = true;		/* No need to hang on to the attribute.  */
3907 
3908   if (!result)
3909     warning (0, "%qE attribute ignored", name);
3910   else
3911     *node = lang_hooks.types.reconstruct_complex_type (*node, result);
3912 
3913   return NULL_TREE;
3914 }
3915 
3916 /* Return nonzero if FUNC is a naked function.  */
3917 static int
3918 spu_naked_function_p (tree func)
3919 {
3920   tree a;
3921 
3922   if (TREE_CODE (func) != FUNCTION_DECL)
3923     abort ();
3924 
3925   a = lookup_attribute ("naked", DECL_ATTRIBUTES (func));
3926   return a != NULL_TREE;
3927 }
3928 
3929 int
3930 spu_initial_elimination_offset (int from, int to)
3931 {
3932   int saved_regs_size = spu_saved_regs_size ();
3933   int sp_offset = 0;
3934   if (!current_function_is_leaf || crtl->outgoing_args_size
3935       || get_frame_size () || saved_regs_size)
3936     sp_offset = STACK_POINTER_OFFSET;
3937   if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
3938     return get_frame_size () + crtl->outgoing_args_size + sp_offset;
3939   else if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
3940     return get_frame_size ();
3941   else if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
3942     return sp_offset + crtl->outgoing_args_size
3943       + get_frame_size () + saved_regs_size + STACK_POINTER_OFFSET;
3944   else if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
3945     return get_frame_size () + saved_regs_size + sp_offset;
3946   else
3947     gcc_unreachable ();
3948 }
3949 
3950 rtx
3951 spu_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED)
3952 {
3953   enum machine_mode mode = TYPE_MODE (type);
3954   int byte_size = ((mode == BLKmode)
3955 		   ? int_size_in_bytes (type) : GET_MODE_SIZE (mode));
3956 
3957   /* Make sure small structs are left justified in a register. */
3958   if ((mode == BLKmode || (type && AGGREGATE_TYPE_P (type)))
3959       && byte_size <= UNITS_PER_WORD * MAX_REGISTER_RETURN && byte_size > 0)
3960     {
3961       enum machine_mode smode;
3962       rtvec v;
3963       int i;
3964       int nregs = (byte_size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3965       int n = byte_size / UNITS_PER_WORD;
3966       v = rtvec_alloc (nregs);
3967       for (i = 0; i < n; i++)
3968 	{
3969 	  RTVEC_ELT (v, i) = gen_rtx_EXPR_LIST (VOIDmode,
3970 						gen_rtx_REG (TImode,
3971 							     FIRST_RETURN_REGNUM
3972 							     + i),
3973 						GEN_INT (UNITS_PER_WORD * i));
3974 	  byte_size -= UNITS_PER_WORD;
3975 	}
3976 
3977       if (n < nregs)
3978 	{
3979 	  if (byte_size < 4)
3980 	    byte_size = 4;
3981 	  smode =
3982 	    smallest_mode_for_size (byte_size * BITS_PER_UNIT, MODE_INT);
3983 	  RTVEC_ELT (v, n) =
3984 	    gen_rtx_EXPR_LIST (VOIDmode,
3985 			       gen_rtx_REG (smode, FIRST_RETURN_REGNUM + n),
3986 			       GEN_INT (UNITS_PER_WORD * n));
3987 	}
3988       return gen_rtx_PARALLEL (mode, v);
3989     }
3990   return gen_rtx_REG (mode, FIRST_RETURN_REGNUM);
3991 }
3992 
3993 rtx
3994 spu_function_arg (CUMULATIVE_ARGS cum,
3995 		  enum machine_mode mode,
3996 		  tree type, int named ATTRIBUTE_UNUSED)
3997 {
3998   int byte_size;
3999 
4000   if (cum >= MAX_REGISTER_ARGS)
4001     return 0;
4002 
4003   byte_size = ((mode == BLKmode)
4004 	       ? int_size_in_bytes (type) : GET_MODE_SIZE (mode));
4005 
4006   /* The ABI does not allow parameters to be passed partially in
4007      reg and partially in stack. */
4008   if ((cum + (byte_size + 15) / 16) > MAX_REGISTER_ARGS)
4009     return 0;
4010 
4011   /* Make sure small structs are left justified in a register. */
4012   if ((mode == BLKmode || (type && AGGREGATE_TYPE_P (type)))
4013       && byte_size < UNITS_PER_WORD && byte_size > 0)
4014     {
4015       enum machine_mode smode;
4016       rtx gr_reg;
4017       if (byte_size < 4)
4018 	byte_size = 4;
4019       smode = smallest_mode_for_size (byte_size * BITS_PER_UNIT, MODE_INT);
4020       gr_reg = gen_rtx_EXPR_LIST (VOIDmode,
4021 				  gen_rtx_REG (smode, FIRST_ARG_REGNUM + cum),
4022 				  const0_rtx);
4023       return gen_rtx_PARALLEL (mode, gen_rtvec (1, gr_reg));
4024     }
4025   else
4026     return gen_rtx_REG (mode, FIRST_ARG_REGNUM + cum);
4027 }
4028 
4029 /* Variable sized types are passed by reference.  */
4030 static bool
4031 spu_pass_by_reference (CUMULATIVE_ARGS * cum ATTRIBUTE_UNUSED,
4032 		       enum machine_mode mode ATTRIBUTE_UNUSED,
4033 		       const_tree type, bool named ATTRIBUTE_UNUSED)
4034 {
4035   return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST;
4036 }
4037 
4038 
4039 /* Var args. */
4040 
4041 /* Create and return the va_list datatype.
4042 
4043    On SPU, va_list is an array type equivalent to
4044 
4045       typedef struct __va_list_tag
4046         {
4047             void *__args __attribute__((__aligned(16)));
4048             void *__skip __attribute__((__aligned(16)));
4049 
4050         } va_list[1];
4051 
4052    where __args points to the arg that will be returned by the next
4053    va_arg(), and __skip points to the previous stack frame such that
4054    when __args == __skip we should advance __args by 32 bytes. */
4055 static tree
4056 spu_build_builtin_va_list (void)
4057 {
4058   tree f_args, f_skip, record, type_decl;
4059   bool owp;
4060 
4061   record = (*lang_hooks.types.make_type) (RECORD_TYPE);
4062 
4063   type_decl =
4064     build_decl (BUILTINS_LOCATION,
4065 		TYPE_DECL, get_identifier ("__va_list_tag"), record);
4066 
4067   f_args = build_decl (BUILTINS_LOCATION,
4068 		       FIELD_DECL, get_identifier ("__args"), ptr_type_node);
4069   f_skip = build_decl (BUILTINS_LOCATION,
4070 		       FIELD_DECL, get_identifier ("__skip"), ptr_type_node);
4071 
4072   DECL_FIELD_CONTEXT (f_args) = record;
4073   DECL_ALIGN (f_args) = 128;
4074   DECL_USER_ALIGN (f_args) = 1;
4075 
4076   DECL_FIELD_CONTEXT (f_skip) = record;
4077   DECL_ALIGN (f_skip) = 128;
4078   DECL_USER_ALIGN (f_skip) = 1;
4079 
4080   TREE_CHAIN (record) = type_decl;
4081   TYPE_NAME (record) = type_decl;
4082   TYPE_FIELDS (record) = f_args;
4083   TREE_CHAIN (f_args) = f_skip;
4084 
4085   /* We know this is being padded and we want it that way.  It is an internal
4086      type so hide the warnings from the user. */
4087   owp = warn_padded;
4088   warn_padded = false;
4089 
4090   layout_type (record);
4091 
4092   warn_padded = owp;
4093 
4094   /* The correct type is an array type of one element.  */
4095   return build_array_type (record, build_index_type (size_zero_node));
4096 }
4097 
4098 /* Implement va_start by filling the va_list structure VALIST.
4099    NEXTARG points to the first anonymous stack argument.
4100 
4101    The following global variables are used to initialize
4102    the va_list structure:
4103 
4104      crtl->args.info;
4105        the CUMULATIVE_ARGS for this function
4106 
4107      crtl->args.arg_offset_rtx:
4108        holds the offset of the first anonymous stack argument
4109        (relative to the virtual arg pointer).  */
4110 
4111 static void
4112 spu_va_start (tree valist, rtx nextarg)
4113 {
4114   tree f_args, f_skip;
4115   tree args, skip, t;
4116 
4117   f_args = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4118   f_skip = TREE_CHAIN (f_args);
4119 
4120   valist = build_va_arg_indirect_ref (valist);
4121   args =
4122     build3 (COMPONENT_REF, TREE_TYPE (f_args), valist, f_args, NULL_TREE);
4123   skip =
4124     build3 (COMPONENT_REF, TREE_TYPE (f_skip), valist, f_skip, NULL_TREE);
4125 
4126   /* Find the __args area.  */
4127   t = make_tree (TREE_TYPE (args), nextarg);
4128   if (crtl->args.pretend_args_size > 0)
4129     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (args), t,
4130 		size_int (-STACK_POINTER_OFFSET));
4131   t = build2 (MODIFY_EXPR, TREE_TYPE (args), args, t);
4132   TREE_SIDE_EFFECTS (t) = 1;
4133   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4134 
4135   /* Find the __skip area.  */
4136   t = make_tree (TREE_TYPE (skip), virtual_incoming_args_rtx);
4137   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (skip), t,
4138 	      size_int (crtl->args.pretend_args_size
4139 			 - STACK_POINTER_OFFSET));
4140   t = build2 (MODIFY_EXPR, TREE_TYPE (skip), skip, t);
4141   TREE_SIDE_EFFECTS (t) = 1;
4142   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4143 }
4144 
4145 /* Gimplify va_arg by updating the va_list structure
4146    VALIST as required to retrieve an argument of type
4147    TYPE, and returning that argument.
4148 
4149    ret = va_arg(VALIST, TYPE);
4150 
4151    generates code equivalent to:
4152 
4153     paddedsize = (sizeof(TYPE) + 15) & -16;
4154     if (VALIST.__args + paddedsize > VALIST.__skip
4155 	&& VALIST.__args <= VALIST.__skip)
4156       addr = VALIST.__skip + 32;
4157     else
4158       addr = VALIST.__args;
4159     VALIST.__args = addr + paddedsize;
4160     ret = *(TYPE *)addr;
4161  */
4162 static tree
4163 spu_gimplify_va_arg_expr (tree valist, tree type, gimple_seq * pre_p,
4164 			  gimple_seq * post_p ATTRIBUTE_UNUSED)
4165 {
4166   tree f_args, f_skip;
4167   tree args, skip;
4168   HOST_WIDE_INT size, rsize;
4169   tree paddedsize, addr, tmp;
4170   bool pass_by_reference_p;
4171 
4172   f_args = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4173   f_skip = TREE_CHAIN (f_args);
4174 
4175   valist = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (valist)), valist);
4176   args =
4177     build3 (COMPONENT_REF, TREE_TYPE (f_args), valist, f_args, NULL_TREE);
4178   skip =
4179     build3 (COMPONENT_REF, TREE_TYPE (f_skip), valist, f_skip, NULL_TREE);
4180 
4181   addr = create_tmp_var (ptr_type_node, "va_arg");
4182 
4183   /* if an object is dynamically sized, a pointer to it is passed
4184      instead of the object itself. */
4185   pass_by_reference_p = spu_pass_by_reference (NULL, TYPE_MODE (type), type,
4186 					       false);
4187   if (pass_by_reference_p)
4188     type = build_pointer_type (type);
4189   size = int_size_in_bytes (type);
4190   rsize = ((size + UNITS_PER_WORD - 1) / UNITS_PER_WORD) * UNITS_PER_WORD;
4191 
4192   /* build conditional expression to calculate addr. The expression
4193      will be gimplified later. */
4194   paddedsize = size_int (rsize);
4195   tmp = build2 (POINTER_PLUS_EXPR, ptr_type_node, unshare_expr (args), paddedsize);
4196   tmp = build2 (TRUTH_AND_EXPR, boolean_type_node,
4197 		build2 (GT_EXPR, boolean_type_node, tmp, unshare_expr (skip)),
4198 		build2 (LE_EXPR, boolean_type_node, unshare_expr (args),
4199 		unshare_expr (skip)));
4200 
4201   tmp = build3 (COND_EXPR, ptr_type_node, tmp,
4202 		build2 (POINTER_PLUS_EXPR, ptr_type_node, unshare_expr (skip),
4203 			size_int (32)), unshare_expr (args));
4204 
4205   gimplify_assign (addr, tmp, pre_p);
4206 
4207   /* update VALIST.__args */
4208   tmp = build2 (POINTER_PLUS_EXPR, ptr_type_node, addr, paddedsize);
4209   gimplify_assign (unshare_expr (args), tmp, pre_p);
4210 
4211   addr = fold_convert (build_pointer_type_for_mode (type, ptr_mode, true),
4212 		       addr);
4213 
4214   if (pass_by_reference_p)
4215     addr = build_va_arg_indirect_ref (addr);
4216 
4217   return build_va_arg_indirect_ref (addr);
4218 }
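
/* Illustrative, standalone sketch (not part of this back end) of the
   __args/__skip update described in the comment above, written with plain
   pointers.  The example_* names and the struct are hypothetical stand-ins
   for the pseudocode, not a real runtime structure.  */
#if 0
struct example_va_list
{
  char *args;			/* next argument slot */
  char *skip;			/* start of the caller's skip area */
};

static void *
example_va_arg (struct example_va_list *ap, unsigned long type_size)
{
  unsigned long paddedsize = (type_size + 15) & ~15UL;
  char *addr;

  /* If the next slot would run into the skip area, jump 32 bytes past
     __skip, exactly as in the pseudocode above.  */
  if (ap->args + paddedsize > ap->skip && ap->args <= ap->skip)
    addr = ap->skip + 32;
  else
    addr = ap->args;

  ap->args = addr + paddedsize;
  return addr;			/* caller reads *(TYPE *) here */
}
#endif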
4219 
4220 /* Save parameter registers starting with the register that corresponds
4221    to the first unnamed parameter.  If the first unnamed parameter is
4222    on the stack then save no registers.  Set pretend_args_size to the
4223    amount of space needed to save the registers. */
4224 void
4225 spu_setup_incoming_varargs (CUMULATIVE_ARGS * cum, enum machine_mode mode,
4226 			    tree type, int *pretend_size, int no_rtl)
4227 {
4228   if (!no_rtl)
4229     {
4230       rtx tmp;
4231       int regno;
4232       int offset;
4233       int ncum = *cum;
4234 
4235       /* cum currently points to the last named argument; we want to
4236          start at the next argument. */
4237       FUNCTION_ARG_ADVANCE (ncum, mode, type, 1);
4238 
4239       offset = -STACK_POINTER_OFFSET;
4240       for (regno = ncum; regno < MAX_REGISTER_ARGS; regno++)
4241 	{
4242 	  tmp = gen_frame_mem (V4SImode,
4243 			       plus_constant (virtual_incoming_args_rtx,
4244 					      offset));
4245 	  emit_move_insn (tmp,
4246 			  gen_rtx_REG (V4SImode, FIRST_ARG_REGNUM + regno));
4247 	  offset += 16;
4248 	}
4249       *pretend_size = offset + STACK_POINTER_OFFSET;
4250     }
4251 }
4252 
4253 void
4254 spu_conditional_register_usage (void)
4255 {
4256   if (flag_pic)
4257     {
4258       fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
4259       call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
4260     }
4261 }
4262 
4263 /* This is called any time we inspect the alignment of a register for
4264    addresses.  */
4265 static int
4266 reg_aligned_for_addr (rtx x)
4267 {
4268   int regno =
4269     REGNO (x) < FIRST_PSEUDO_REGISTER ? ORIGINAL_REGNO (x) : REGNO (x);
4270   return REGNO_POINTER_ALIGN (regno) >= 128;
4271 }
4272 
4273 /* Encode symbol attributes (local vs. global, tls model) of a SYMBOL_REF
4274    into its SYMBOL_REF_FLAGS.  */
4275 static void
4276 spu_encode_section_info (tree decl, rtx rtl, int first)
4277 {
4278   default_encode_section_info (decl, rtl, first);
4279 
4280   /* If a variable has a forced alignment to < 16 bytes, mark it with
4281      SYMBOL_FLAG_ALIGN1.  */
4282   if (TREE_CODE (decl) == VAR_DECL
4283       && DECL_USER_ALIGN (decl) && DECL_ALIGN (decl) < 128)
4284     SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_ALIGN1;
4285 }
4286 
4287 /* Return TRUE if we are certain the mem refers to a complete object
4288    which is both 16-byte aligned and padded to a 16-byte boundary.  This
4289    would make it safe to store with a single instruction.
4290    We guarantee the alignment and padding for static objects by aligning
4291    all of them to 16-bytes. (DATA_ALIGNMENT and CONSTANT_ALIGNMENT.)
4292    FIXME: We currently cannot guarantee this for objects on the stack
4293    because assign_parm_setup_stack calls assign_stack_local with the
4294    alignment of the parameter mode and in that case the alignment never
4295    gets adjusted by LOCAL_ALIGNMENT. */
4296 static int
4297 store_with_one_insn_p (rtx mem)
4298 {
4299   enum machine_mode mode = GET_MODE (mem);
4300   rtx addr = XEXP (mem, 0);
4301   if (mode == BLKmode)
4302     return 0;
4303   if (GET_MODE_SIZE (mode) >= 16)
4304     return 1;
4305   /* Only static objects. */
4306   if (GET_CODE (addr) == SYMBOL_REF)
4307     {
4308       /* We use the associated declaration to make sure the access is
4309          referring to the whole object.
4310          We check both MEM_EXPR and SYMBOL_REF_DECL.  I'm not sure
4311          if it is necessary.  Will there be cases where one exists, and
4312          the other does not?  Will there be cases where both exist, but
4313          have different types?  */
4314       tree decl = MEM_EXPR (mem);
4315       if (decl
4316 	  && TREE_CODE (decl) == VAR_DECL
4317 	  && GET_MODE (mem) == TYPE_MODE (TREE_TYPE (decl)))
4318 	return 1;
4319       decl = SYMBOL_REF_DECL (addr);
4320       if (decl
4321 	  && TREE_CODE (decl) == VAR_DECL
4322 	  && GET_MODE (mem) == TYPE_MODE (TREE_TYPE (decl)))
4323 	return 1;
4324     }
4325   return 0;
4326 }
4327 
4328 /* Return 1 when the address is not valid for a simple load and store as
4329    required by the '_mov*' patterns.  We could make this less strict
4330    for loads, but we prefer MEMs to look the same so they are more
4331    likely to be merged.  */
4332 static int
4333 address_needs_split (rtx mem)
4334 {
4335   if (GET_MODE_SIZE (GET_MODE (mem)) < 16
4336       && (GET_MODE_SIZE (GET_MODE (mem)) < 4
4337 	  || !(store_with_one_insn_p (mem)
4338 	       || mem_is_padded_component_ref (mem))))
4339     return 1;
4340 
4341   return 0;
4342 }
4343 
4344 static GTY(()) rtx cache_fetch;		  /* __cache_fetch function */
4345 static GTY(()) rtx cache_fetch_dirty;	  /* __cache_fetch_dirty function */
4346 static alias_set_type ea_alias_set = -1;  /* alias set for __ea memory */
4347 
4348 /* MEM is known to be an __ea qualified memory access.  Emit a call to
4349    fetch the PPU memory to local store, and return its address in local
4350    store.  */
4351 
4352 static void
4353 ea_load_store (rtx mem, bool is_store, rtx ea_addr, rtx data_addr)
4354 {
4355   if (is_store)
4356     {
4357       rtx ndirty = GEN_INT (GET_MODE_SIZE (GET_MODE (mem)));
4358       if (!cache_fetch_dirty)
4359 	cache_fetch_dirty = init_one_libfunc ("__cache_fetch_dirty");
4360       emit_library_call_value (cache_fetch_dirty, data_addr, LCT_NORMAL, Pmode,
4361 			       2, ea_addr, EAmode, ndirty, SImode);
4362     }
4363   else
4364     {
4365       if (!cache_fetch)
4366 	cache_fetch = init_one_libfunc ("__cache_fetch");
4367       emit_library_call_value (cache_fetch, data_addr, LCT_NORMAL, Pmode,
4368 			       1, ea_addr, EAmode);
4369     }
4370 }
4371 
4372 /* Like ea_load_store, but do the cache tag comparison and, for stores,
4373    dirty bit marking, inline.
4374 
4375    The cache control data structure is an array of
4376 
4377    struct __cache_tag_array
4378      {
4379         unsigned int tag_lo[4];
4380         unsigned int tag_hi[4];
4381         void *data_pointer[4];
4382         int reserved[4];
4383         vector unsigned short dirty_bits[4];
4384      }  */
4385 
4386 static void
4387 ea_load_store_inline (rtx mem, bool is_store, rtx ea_addr, rtx data_addr)
4388 {
4389   rtx ea_addr_si;
4390   HOST_WIDE_INT v;
4391   rtx tag_size_sym = gen_rtx_SYMBOL_REF (Pmode, "__cache_tag_array_size");
4392   rtx tag_arr_sym = gen_rtx_SYMBOL_REF (Pmode, "__cache_tag_array");
4393   rtx index_mask = gen_reg_rtx (SImode);
4394   rtx tag_arr = gen_reg_rtx (Pmode);
4395   rtx splat_mask = gen_reg_rtx (TImode);
4396   rtx splat = gen_reg_rtx (V4SImode);
4397   rtx splat_hi = NULL_RTX;
4398   rtx tag_index = gen_reg_rtx (Pmode);
4399   rtx block_off = gen_reg_rtx (SImode);
4400   rtx tag_addr = gen_reg_rtx (Pmode);
4401   rtx tag = gen_reg_rtx (V4SImode);
4402   rtx cache_tag = gen_reg_rtx (V4SImode);
4403   rtx cache_tag_hi = NULL_RTX;
4404   rtx cache_ptrs = gen_reg_rtx (TImode);
4405   rtx cache_ptrs_si = gen_reg_rtx (SImode);
4406   rtx tag_equal = gen_reg_rtx (V4SImode);
4407   rtx tag_equal_hi = NULL_RTX;
4408   rtx tag_eq_pack = gen_reg_rtx (V4SImode);
4409   rtx tag_eq_pack_si = gen_reg_rtx (SImode);
4410   rtx eq_index = gen_reg_rtx (SImode);
4411   rtx bcomp, hit_label, hit_ref, cont_label, insn;
4412 
4413   if (spu_ea_model != 32)
4414     {
4415       splat_hi = gen_reg_rtx (V4SImode);
4416       cache_tag_hi = gen_reg_rtx (V4SImode);
4417       tag_equal_hi = gen_reg_rtx (V4SImode);
4418     }
4419 
4420   emit_move_insn (index_mask, plus_constant (tag_size_sym, -128));
4421   emit_move_insn (tag_arr, tag_arr_sym);
4422   v = 0x0001020300010203LL;
4423   emit_move_insn (splat_mask, immed_double_const (v, v, TImode));
4424   ea_addr_si = ea_addr;
4425   if (spu_ea_model != 32)
4426     ea_addr_si = convert_to_mode (SImode, ea_addr, 1);
4427 
4428   /* tag_index = ea_addr & (tag_array_size - 128)  */
4429   emit_insn (gen_andsi3 (tag_index, ea_addr_si, index_mask));
4430 
4431   /* splat ea_addr to all 4 slots.  */
4432   emit_insn (gen_shufb (splat, ea_addr_si, ea_addr_si, splat_mask));
4433   /* Similarly for high 32 bits of ea_addr.  */
4434   if (spu_ea_model != 32)
4435     emit_insn (gen_shufb (splat_hi, ea_addr, ea_addr, splat_mask));
4436 
4437   /* block_off = ea_addr & 127  */
4438   emit_insn (gen_andsi3 (block_off, ea_addr_si, spu_const (SImode, 127)));
4439 
4440   /* tag_addr = tag_arr + tag_index  */
4441   emit_insn (gen_addsi3 (tag_addr, tag_arr, tag_index));
4442 
4443   /* Read cache tags.  */
4444   emit_move_insn (cache_tag, gen_rtx_MEM (V4SImode, tag_addr));
4445   if (spu_ea_model != 32)
4446     emit_move_insn (cache_tag_hi, gen_rtx_MEM (V4SImode,
4447 					       plus_constant (tag_addr, 16)));
4448 
4449   /* tag = ea_addr & -128  */
4450   emit_insn (gen_andv4si3 (tag, splat, spu_const (V4SImode, -128)));
4451 
4452   /* Read all four cache data pointers.  */
4453   emit_move_insn (cache_ptrs, gen_rtx_MEM (TImode,
4454 					   plus_constant (tag_addr, 32)));
4455 
4456   /* Compare tags.  */
4457   emit_insn (gen_ceq_v4si (tag_equal, tag, cache_tag));
4458   if (spu_ea_model != 32)
4459     {
4460       emit_insn (gen_ceq_v4si (tag_equal_hi, splat_hi, cache_tag_hi));
4461       emit_insn (gen_andv4si3 (tag_equal, tag_equal, tag_equal_hi));
4462     }
4463 
4464   /* At most one of the tags compares equal, so tag_equal has one
4465      32-bit slot set to all 1's, with the other slots all zero.
4466      gbb picks off low bit from each byte in the 128-bit registers,
4467      so tag_eq_pack is one of 0xf000, 0x0f00, 0x00f0, 0x000f, assuming
4468      we have a hit.  */
4469   emit_insn (gen_spu_gbb (tag_eq_pack, spu_gen_subreg (V16QImode, tag_equal)));
4470   emit_insn (gen_spu_convert (tag_eq_pack_si, tag_eq_pack));
4471 
4472   /* So counting leading zeros will set eq_index to 16, 20, 24 or 28.  */
4473   emit_insn (gen_clzsi2 (eq_index, tag_eq_pack_si));
4474 
4475   /* That lets us rotate the corresponding cache data pointer to slot 0
4476      (rotating by eq_index mod 16 bytes).  */
4477   emit_insn (gen_rotqby_ti (cache_ptrs, cache_ptrs, eq_index));
4478   emit_insn (gen_spu_convert (cache_ptrs_si, cache_ptrs));
4479 
4480   /* Add block offset to form final data address.  */
4481   emit_insn (gen_addsi3 (data_addr, cache_ptrs_si, block_off));
4482 
4483   /* Check that we did hit.  */
4484   hit_label = gen_label_rtx ();
4485   hit_ref = gen_rtx_LABEL_REF (VOIDmode, hit_label);
4486   bcomp = gen_rtx_NE (SImode, tag_eq_pack_si, const0_rtx);
4487   insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
4488 				      gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
4489 							    hit_ref, pc_rtx)));
4490   /* Say that this branch is very likely to happen.  */
4491   v = REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100 - 1;
4492   REG_NOTES (insn)
4493     = gen_rtx_EXPR_LIST (REG_BR_PROB, GEN_INT (v), REG_NOTES (insn));
4494 
4495   ea_load_store (mem, is_store, ea_addr, data_addr);
4496   cont_label = gen_label_rtx ();
4497   emit_jump_insn (gen_jump (cont_label));
4498   emit_barrier ();
4499 
4500   emit_label (hit_label);
4501 
4502   if (is_store)
4503     {
4504       HOST_WIDE_INT v_hi;
4505       rtx dirty_bits = gen_reg_rtx (TImode);
4506       rtx dirty_off = gen_reg_rtx (SImode);
4507       rtx dirty_128 = gen_reg_rtx (TImode);
4508       rtx neg_block_off = gen_reg_rtx (SImode);
4509 
4510       /* Set up mask with one dirty bit per byte of the mem we are
4511 	 writing, starting from top bit.  */
4512       v_hi = v = -1;
4513       v <<= (128 - GET_MODE_SIZE (GET_MODE (mem))) & 63;
4514       if ((128 - GET_MODE_SIZE (GET_MODE (mem))) >= 64)
4515 	{
4516 	  v_hi = v;
4517 	  v = 0;
4518 	}
4519       emit_move_insn (dirty_bits, immed_double_const (v, v_hi, TImode));
4520 
4521       /* Form index into cache dirty_bits.  eq_index is one of
4522 	 0x10, 0x14, 0x18 or 0x1c.  Multiplying by 4 gives us
4523 	 0x40, 0x50, 0x60 or 0x70 which just happens to be the
4524 	 offset to each of the four dirty_bits elements.  */
4525       emit_insn (gen_ashlsi3 (dirty_off, eq_index, spu_const (SImode, 2)));
4526 
4527       emit_insn (gen_spu_lqx (dirty_128, tag_addr, dirty_off));
4528 
4529       /* Rotate bit mask to proper bit.  */
4530       emit_insn (gen_negsi2 (neg_block_off, block_off));
4531       emit_insn (gen_rotqbybi_ti (dirty_bits, dirty_bits, neg_block_off));
4532       emit_insn (gen_rotqbi_ti (dirty_bits, dirty_bits, neg_block_off));
4533 
4534       /* Or in the new dirty bits.  */
4535       emit_insn (gen_iorti3 (dirty_128, dirty_bits, dirty_128));
4536 
4537       /* Store.  */
4538       emit_insn (gen_spu_stqx (dirty_128, tag_addr, dirty_off));
4539     }
4540 
4541   emit_label (cont_label);
4542 }
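
/* Illustrative, standalone sketch (not part of this back end) of the tag
   lookup the RTL above performs, written as scalar C over a hypothetical
   copy of the __cache_tag_array layout from the earlier comment (32-bit
   __ea addresses, loops instead of the SIMD compare/rotate sequence).
   All example_* names are hypothetical.  */
#if 0
struct example_cache_set
{
  unsigned int tag_lo[4];
  unsigned int tag_hi[4];
  void *data_pointer[4];
  int reserved[4];
  unsigned short dirty_bits[4][8];
};

/* Return the local-store address for EA_ADDR on a hit, or 0 on a miss
   (where the real code falls back to the __cache_fetch call).  */
static void *
example_cache_lookup (struct example_cache_set *tag_array,
		      unsigned int tag_array_size, unsigned int ea_addr)
{
  unsigned int set_off = ea_addr & (tag_array_size - 128);
  unsigned int block_off = ea_addr & 127;
  unsigned int tag = ea_addr & ~127u;
  struct example_cache_set *set
    = (struct example_cache_set *) ((char *) tag_array + set_off);
  int way;

  /* At most one of the four ways can match the block address.  */
  for (way = 0; way < 4; way++)
    if (set->tag_lo[way] == tag)
      return (char *) set->data_pointer[way] + block_off;
  return 0;
}
#endif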
4543 
4544 static rtx
4545 expand_ea_mem (rtx mem, bool is_store)
4546 {
4547   rtx ea_addr;
4548   rtx data_addr = gen_reg_rtx (Pmode);
4549   rtx new_mem;
4550 
4551   ea_addr = force_reg (EAmode, XEXP (mem, 0));
4552   if (optimize_size || optimize == 0)
4553     ea_load_store (mem, is_store, ea_addr, data_addr);
4554   else
4555     ea_load_store_inline (mem, is_store, ea_addr, data_addr);
4556 
4557   if (ea_alias_set == -1)
4558     ea_alias_set = new_alias_set ();
4559 
4560   /* We generate a new MEM RTX to refer to the copy of the data
4561      in the cache.  We do not copy memory attributes (except the
4562      alignment) from the original MEM, as they may no longer apply
4563      to the cache copy.  */
4564   new_mem = gen_rtx_MEM (GET_MODE (mem), data_addr);
4565   set_mem_alias_set (new_mem, ea_alias_set);
4566   set_mem_align (new_mem, MIN (MEM_ALIGN (mem), 128 * 8));
4567 
4568   return new_mem;
4569 }
4570 
4571 int
4572 spu_expand_mov (rtx * ops, enum machine_mode mode)
4573 {
4574   if (GET_CODE (ops[0]) == SUBREG && !valid_subreg (ops[0]))
4575     abort ();
4576 
4577   if (GET_CODE (ops[1]) == SUBREG && !valid_subreg (ops[1]))
4578     {
4579       rtx from = SUBREG_REG (ops[1]);
4580       enum machine_mode imode = int_mode_for_mode (GET_MODE (from));
4581 
4582       gcc_assert (GET_MODE_CLASS (mode) == MODE_INT
4583 		  && GET_MODE_CLASS (imode) == MODE_INT
4584 		  && subreg_lowpart_p (ops[1]));
4585 
4586       if (GET_MODE_SIZE (imode) < 4)
4587 	imode = SImode;
4588       if (imode != GET_MODE (from))
4589 	from = gen_rtx_SUBREG (imode, from, 0);
4590 
4591       if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (imode))
4592 	{
4593 	  enum insn_code icode = convert_optab_handler (trunc_optab, mode, imode)->insn_code;
4594 	  emit_insn (GEN_FCN (icode) (ops[0], from));
4595 	}
4596       else
4597 	emit_insn (gen_extend_insn (ops[0], from, mode, imode, 1));
4598       return 1;
4599     }
4600 
4601   /* At least one of the operands needs to be a register. */
4602   if ((reload_in_progress | reload_completed) == 0
4603       && !register_operand (ops[0], mode) && !register_operand (ops[1], mode))
4604     {
4605       rtx temp = force_reg (mode, ops[1]);
4606       emit_move_insn (ops[0], temp);
4607       return 1;
4608     }
4609   if (reload_in_progress || reload_completed)
4610     {
4611       if (CONSTANT_P (ops[1]))
4612 	return spu_split_immediate (ops);
4613       return 0;
4614     }
4615 
4616   /* Catch the SImode immediates greater than 0x7fffffff, and sign
4617      extend them. */
4618   if (GET_CODE (ops[1]) == CONST_INT)
4619     {
4620       HOST_WIDE_INT val = trunc_int_for_mode (INTVAL (ops[1]), mode);
4621       if (val != INTVAL (ops[1]))
4622 	{
4623 	  emit_move_insn (ops[0], GEN_INT (val));
4624 	  return 1;
4625 	}
4626     }
4627   if (MEM_P (ops[0]))
4628     {
4629       if (MEM_ADDR_SPACE (ops[0]))
4630 	ops[0] = expand_ea_mem (ops[0], true);
4631       return spu_split_store (ops);
4632     }
4633   if (MEM_P (ops[1]))
4634     {
4635       if (MEM_ADDR_SPACE (ops[1]))
4636 	ops[1] = expand_ea_mem (ops[1], false);
4637       return spu_split_load (ops);
4638     }
4639 
4640   return 0;
4641 }
4642 
4643 static void
4644 spu_convert_move (rtx dst, rtx src)
4645 {
4646   enum machine_mode mode = GET_MODE (dst);
4647   enum machine_mode int_mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
4648   rtx reg;
4649   gcc_assert (GET_MODE (src) == TImode);
4650   reg = int_mode != mode ? gen_reg_rtx (int_mode) : dst;
4651   emit_insn (gen_rtx_SET (VOIDmode, reg,
4652 	       gen_rtx_TRUNCATE (int_mode,
4653 		 gen_rtx_LSHIFTRT (TImode, src,
4654 		   GEN_INT (int_mode == DImode ? 64 : 96)))));
4655   if (int_mode != mode)
4656     {
4657       reg = simplify_gen_subreg (mode, reg, int_mode, 0);
4658       emit_move_insn (dst, reg);
4659     }
4660 }
4661 
4662 /* Load TImode values into DST0 and DST1 (when it is non-NULL) using
4663    the address from SRC and SRC+16.  Return a REG or CONST_INT that
4664    specifies how many bytes to rotate the loaded registers, plus any
4665    extra from EXTRA_ROTQBY.  The address and rotate amounts are
4666    normalized to improve merging of loads and rotate computations. */
4667 static rtx
4668 spu_expand_load (rtx dst0, rtx dst1, rtx src, int extra_rotby)
4669 {
4670   rtx addr = XEXP (src, 0);
4671   rtx p0, p1, rot, addr0, addr1;
4672   int rot_amt;
4673 
4674   rot = 0;
4675   rot_amt = 0;
4676 
4677   if (MEM_ALIGN (src) >= 128)
4678     /* Address is already aligned; simply perform a TImode load.  */ ;
4679   else if (GET_CODE (addr) == PLUS)
4680     {
4681       /* 8 cases:
4682          aligned reg   + aligned reg     => lqx
4683          aligned reg   + unaligned reg   => lqx, rotqby
4684          aligned reg   + aligned const   => lqd
4685          aligned reg   + unaligned const => lqd, rotqbyi
4686          unaligned reg + aligned reg     => lqx, rotqby
4687          unaligned reg + unaligned reg   => lqx, a, rotqby (1 scratch)
4688          unaligned reg + aligned const   => lqd, rotqby
4689          unaligned reg + unaligned const -> not allowed by legitimate address
4690        */
4691       p0 = XEXP (addr, 0);
4692       p1 = XEXP (addr, 1);
4693       if (!reg_aligned_for_addr (p0))
4694 	{
4695 	  if (REG_P (p1) && !reg_aligned_for_addr (p1))
4696 	    {
4697 	      rot = gen_reg_rtx (SImode);
4698 	      emit_insn (gen_addsi3 (rot, p0, p1));
4699 	    }
4700 	  else if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
4701 	    {
4702 	      if (INTVAL (p1) > 0
4703 		  && REG_POINTER (p0)
4704 		  && INTVAL (p1) * BITS_PER_UNIT
4705 		     < REGNO_POINTER_ALIGN (REGNO (p0)))
4706 		{
4707 		  rot = gen_reg_rtx (SImode);
4708 		  emit_insn (gen_addsi3 (rot, p0, p1));
4709 		  addr = p0;
4710 		}
4711 	      else
4712 		{
4713 		  rtx x = gen_reg_rtx (SImode);
4714 		  emit_move_insn (x, p1);
4715 		  if (!spu_arith_operand (p1, SImode))
4716 		    p1 = x;
4717 		  rot = gen_reg_rtx (SImode);
4718 		  emit_insn (gen_addsi3 (rot, p0, p1));
4719 		  addr = gen_rtx_PLUS (Pmode, p0, x);
4720 		}
4721 	    }
4722 	  else
4723 	    rot = p0;
4724 	}
4725       else
4726 	{
4727 	  if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
4728 	    {
4729 	      rot_amt = INTVAL (p1) & 15;
4730 	      if (INTVAL (p1) & -16)
4731 		{
4732 		  p1 = GEN_INT (INTVAL (p1) & -16);
4733 		  addr = gen_rtx_PLUS (SImode, p0, p1);
4734 		}
4735 	      else
4736 		addr = p0;
4737 	    }
4738 	  else if (REG_P (p1) && !reg_aligned_for_addr (p1))
4739 	    rot = p1;
4740 	}
4741     }
4742   else if (REG_P (addr))
4743     {
4744       if (!reg_aligned_for_addr (addr))
4745 	rot = addr;
4746     }
4747   else if (GET_CODE (addr) == CONST)
4748     {
4749       if (GET_CODE (XEXP (addr, 0)) == PLUS
4750 	  && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
4751 	  && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
4752 	{
4753 	  rot_amt = INTVAL (XEXP (XEXP (addr, 0), 1));
4754 	  if (rot_amt & -16)
4755 	    addr = gen_rtx_CONST (Pmode,
4756 				  gen_rtx_PLUS (Pmode,
4757 						XEXP (XEXP (addr, 0), 0),
4758 						GEN_INT (rot_amt & -16)));
4759 	  else
4760 	    addr = XEXP (XEXP (addr, 0), 0);
4761 	}
4762       else
4763 	{
4764 	  rot = gen_reg_rtx (Pmode);
4765 	  emit_move_insn (rot, addr);
4766 	}
4767     }
4768   else if (GET_CODE (addr) == CONST_INT)
4769     {
4770       rot_amt = INTVAL (addr);
4771       addr = GEN_INT (rot_amt & -16);
4772     }
4773   else if (!ALIGNED_SYMBOL_REF_P (addr))
4774     {
4775       rot = gen_reg_rtx (Pmode);
4776       emit_move_insn (rot, addr);
4777     }
4778 
4779   rot_amt += extra_rotby;
4780 
4781   rot_amt &= 15;
4782 
4783   if (rot && rot_amt)
4784     {
4785       rtx x = gen_reg_rtx (SImode);
4786       emit_insn (gen_addsi3 (x, rot, GEN_INT (rot_amt)));
4787       rot = x;
4788       rot_amt = 0;
4789     }
4790   if (!rot && rot_amt)
4791     rot = GEN_INT (rot_amt);
4792 
4793   addr0 = copy_rtx (addr);
4794   addr0 = gen_rtx_AND (SImode, copy_rtx (addr), GEN_INT (-16));
4795   emit_insn (gen__movti (dst0, change_address (src, TImode, addr0)));
4796 
4797   if (dst1)
4798     {
4799       addr1 = plus_constant (copy_rtx (addr), 16);
4800       addr1 = gen_rtx_AND (SImode, addr1, GEN_INT (-16));
4801       emit_insn (gen__movti (dst1, change_address (src, TImode, addr1)));
4802     }
4803 
4804   return rot;
4805 }
4806 
4807 int
4808 spu_split_load (rtx * ops)
4809 {
4810   enum machine_mode mode = GET_MODE (ops[0]);
4811   rtx addr, load, rot;
4812   int rot_amt;
4813 
4814   if (GET_MODE_SIZE (mode) >= 16)
4815     return 0;
4816 
4817   addr = XEXP (ops[1], 0);
4818   gcc_assert (GET_CODE (addr) != AND);
4819 
4820   if (!address_needs_split (ops[1]))
4821     {
4822       ops[1] = change_address (ops[1], TImode, addr);
4823       load = gen_reg_rtx (TImode);
4824       emit_insn (gen__movti (load, ops[1]));
4825       spu_convert_move (ops[0], load);
4826       return 1;
4827     }
4828 
4829   rot_amt = GET_MODE_SIZE (mode) < 4 ? GET_MODE_SIZE (mode) - 4 : 0;
4830 
4831   load = gen_reg_rtx (TImode);
4832   rot = spu_expand_load (load, 0, ops[1], rot_amt);
4833 
4834   if (rot)
4835     emit_insn (gen_rotqby_ti (load, load, rot));
4836 
4837   spu_convert_move (ops[0], load);
4838   return 1;
4839 }
4840 
4841 int
4842 spu_split_store (rtx * ops)
4843 {
4844   enum machine_mode mode = GET_MODE (ops[0]);
4845   rtx reg;
4846   rtx addr, p0, p1, p1_lo, smem;
4847   int aform;
4848   int scalar;
4849 
4850   if (GET_MODE_SIZE (mode) >= 16)
4851     return 0;
4852 
4853   addr = XEXP (ops[0], 0);
4854   gcc_assert (GET_CODE (addr) != AND);
4855 
4856   if (!address_needs_split (ops[0]))
4857     {
4858       reg = gen_reg_rtx (TImode);
4859       emit_insn (gen_spu_convert (reg, ops[1]));
4860       ops[0] = change_address (ops[0], TImode, addr);
4861       emit_move_insn (ops[0], reg);
4862       return 1;
4863     }
4864 
4865   if (GET_CODE (addr) == PLUS)
4866     {
4867       /* 8 cases:
4868          aligned reg   + aligned reg     => lqx, c?x, shuf, stqx
4869          aligned reg   + unaligned reg   => lqx, c?x, shuf, stqx
4870          aligned reg   + aligned const   => lqd, c?d, shuf, stqx
4871          aligned reg   + unaligned const => lqd, c?d, shuf, stqx
4872          unaligned reg + aligned reg     => lqx, c?x, shuf, stqx
4873          unaligned reg + unaligned reg   => lqx, c?x, shuf, stqx
4874          unaligned reg + aligned const   => lqd, c?d, shuf, stqx
4875          unaligned reg + unaligned const -> lqx, c?d, shuf, stqx
4876        */
4877       aform = 0;
4878       p0 = XEXP (addr, 0);
4879       p1 = p1_lo = XEXP (addr, 1);
4880       if (REG_P (p0) && GET_CODE (p1) == CONST_INT)
4881 	{
4882 	  p1_lo = GEN_INT (INTVAL (p1) & 15);
4883 	  if (reg_aligned_for_addr (p0))
4884 	    {
4885 	      p1 = GEN_INT (INTVAL (p1) & -16);
4886 	      if (p1 == const0_rtx)
4887 		addr = p0;
4888 	      else
4889 		addr = gen_rtx_PLUS (SImode, p0, p1);
4890 	    }
4891 	  else
4892 	    {
4893 	      rtx x = gen_reg_rtx (SImode);
4894 	      emit_move_insn (x, p1);
4895 	      addr = gen_rtx_PLUS (SImode, p0, x);
4896 	    }
4897 	}
4898     }
4899   else if (REG_P (addr))
4900     {
4901       aform = 0;
4902       p0 = addr;
4903       p1 = p1_lo = const0_rtx;
4904     }
4905   else
4906     {
4907       aform = 1;
4908       p0 = gen_rtx_REG (SImode, STACK_POINTER_REGNUM);
4909       p1 = 0;			/* aform doesn't use p1 */
4910       p1_lo = addr;
4911       if (ALIGNED_SYMBOL_REF_P (addr))
4912 	p1_lo = const0_rtx;
4913       else if (GET_CODE (addr) == CONST
4914 	       && GET_CODE (XEXP (addr, 0)) == PLUS
4915 	       && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
4916 	       && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
4917 	{
4918 	  HOST_WIDE_INT v = INTVAL (XEXP (XEXP (addr, 0), 1));
4919 	  if ((v & -16) != 0)
4920 	    addr = gen_rtx_CONST (Pmode,
4921 				  gen_rtx_PLUS (Pmode,
4922 						XEXP (XEXP (addr, 0), 0),
4923 						GEN_INT (v & -16)));
4924 	  else
4925 	    addr = XEXP (XEXP (addr, 0), 0);
4926 	  p1_lo = GEN_INT (v & 15);
4927 	}
4928       else if (GET_CODE (addr) == CONST_INT)
4929 	{
4930 	  p1_lo = GEN_INT (INTVAL (addr) & 15);
4931 	  addr = GEN_INT (INTVAL (addr) & -16);
4932 	}
4933       else
4934 	{
4935 	  p1_lo = gen_reg_rtx (SImode);
4936 	  emit_move_insn (p1_lo, addr);
4937 	}
4938     }
4939 
4940   reg = gen_reg_rtx (TImode);
4941 
4942   scalar = store_with_one_insn_p (ops[0]);
4943   if (!scalar)
4944     {
4945       /* We could copy the flags from the ops[0] MEM to lmem here.
4946          We don't because we want this load to be optimized away if
4947          possible, and copying the flags will prevent that in certain
4948          cases, e.g. consider the volatile flag. */
4949 
4950       rtx pat = gen_reg_rtx (TImode);
4951       rtx lmem = change_address (ops[0], TImode, copy_rtx (addr));
4952       set_mem_alias_set (lmem, 0);
4953       emit_insn (gen_movti (reg, lmem));
4954 
4955       if (!p0 || reg_aligned_for_addr (p0))
4956 	p0 = stack_pointer_rtx;
4957       if (!p1_lo)
4958 	p1_lo = const0_rtx;
4959 
4960       emit_insn (gen_cpat (pat, p0, p1_lo, GEN_INT (GET_MODE_SIZE (mode))));
4961       emit_insn (gen_shufb (reg, ops[1], reg, pat));
4962     }
4963   else
4964     {
4965       if (GET_CODE (ops[1]) == REG)
4966 	emit_insn (gen_spu_convert (reg, ops[1]));
4967       else if (GET_CODE (ops[1]) == SUBREG)
4968 	emit_insn (gen_spu_convert (reg, SUBREG_REG (ops[1])));
4969       else
4970 	abort ();
4971     }
4972 
4973   if (GET_MODE_SIZE (mode) < 4 && scalar)
4974     emit_insn (gen_ashlti3
4975 	       (reg, reg, GEN_INT (32 - GET_MODE_BITSIZE (mode))));
4976 
4977   smem = change_address (ops[0], TImode, copy_rtx (addr));
4978   /* We can't use the previous alias set because the memory has changed
4979      size and can potentially overlap objects of other types.  */
4980   set_mem_alias_set (smem, 0);
4981 
4982   emit_insn (gen_movti (smem, reg));
4983   return 1;
4984 }
4985 
4986 /* Return TRUE if X is MEM which is a struct member reference
4987    and the member can safely be loaded and stored with a single
4988    instruction because it is padded. */
4989 static int
4990 mem_is_padded_component_ref (rtx x)
4991 {
4992   tree t = MEM_EXPR (x);
4993   tree r;
4994   if (!t || TREE_CODE (t) != COMPONENT_REF)
4995     return 0;
4996   t = TREE_OPERAND (t, 1);
4997   if (!t || TREE_CODE (t) != FIELD_DECL
4998       || DECL_ALIGN (t) < 128 || AGGREGATE_TYPE_P (TREE_TYPE (t)))
4999     return 0;
5000   /* Only do this for RECORD_TYPEs, not UNION_TYPEs. */
5001   r = DECL_FIELD_CONTEXT (t);
5002   if (!r || TREE_CODE (r) != RECORD_TYPE)
5003     return 0;
5004   /* Make sure they are the same mode */
5005   if (GET_MODE (x) != TYPE_MODE (TREE_TYPE (t)))
5006     return 0;
5007   /* If there are no following fields then the field alignment assures
5008      the structure is padded to the alignment which means this field is
5009      padded too.  */
5010   if (TREE_CHAIN (t) == 0)
5011     return 1;
5012   /* If the following field is also aligned then this field will be
5013      padded. */
5014   t = TREE_CHAIN (t);
5015   if (TREE_CODE (t) == FIELD_DECL && DECL_ALIGN (t) >= 128)
5016     return 1;
5017   return 0;
5018 }
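
/* Illustrative example (not part of this back end) of a struct layout for
   which mem_is_padded_component_ref allows a single quadword store: the
   16-byte alignment of the following field guarantees the member itself
   is padded out to a 16-byte boundary.  The type name is hypothetical.  */
#if 0
struct example_padded
{
  int a __attribute__ ((aligned (16)));	/* padded to 16 bytes because... */
  int b __attribute__ ((aligned (16)));	/* ...the next field is aligned too */
};
#endif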
5019 
5020 /* Parse the -mfixed-range= option string.  */
5021 static void
5022 fix_range (const char *const_str)
5023 {
5024   int i, first, last;
5025   char *str, *dash, *comma;
5026 
5027   /* str must be of the form REG1'-'REG2{,REG1'-'REG2} where REG1 and
5028      REG2 are either register names or register numbers.  The effect
5029      of this option is to mark the registers in the range from REG1 to
5030      REG2 as ``fixed'' so they won't be used by the compiler.  */
5031 
5032   i = strlen (const_str);
5033   str = (char *) alloca (i + 1);
5034   memcpy (str, const_str, i + 1);
5035 
5036   while (1)
5037     {
5038       dash = strchr (str, '-');
5039       if (!dash)
5040 	{
5041 	  warning (0, "value of -mfixed-range must have form REG1-REG2");
5042 	  return;
5043 	}
5044       *dash = '\0';
5045       comma = strchr (dash + 1, ',');
5046       if (comma)
5047 	*comma = '\0';
5048 
5049       first = decode_reg_name (str);
5050       if (first < 0)
5051 	{
5052 	  warning (0, "unknown register name: %s", str);
5053 	  return;
5054 	}
5055 
5056       last = decode_reg_name (dash + 1);
5057       if (last < 0)
5058 	{
5059 	  warning (0, "unknown register name: %s", dash + 1);
5060 	  return;
5061 	}
5062 
5063       *dash = '-';
5064 
5065       if (first > last)
5066 	{
5067 	  warning (0, "%s-%s is an empty range", str, dash + 1);
5068 	  return;
5069 	}
5070 
5071       for (i = first; i <= last; ++i)
5072 	fixed_regs[i] = call_used_regs[i] = 1;
5073 
5074       if (!comma)
5075 	break;
5076 
5077       *comma = ',';
5078       str = comma + 1;
5079     }
5080 }
5081 
5082 /* Return TRUE if x is a CONST_INT, CONST_DOUBLE or CONST_VECTOR that
5083    can be generated using the fsmbi instruction. */
5084 int
5085 fsmbi_const_p (rtx x)
5086 {
5087   if (CONSTANT_P (x))
5088     {
5089       /* We can always choose TImode for CONST_INT because the high bits
5090          of an SImode will always be all 1s, i.e., valid for fsmbi. */
5091       enum immediate_class c = classify_immediate (x, TImode);
5092       return c == IC_FSMBI || (!epilogue_completed && c == IC_FSMBI2);
5093     }
5094   return 0;
5095 }
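
/* Illustrative, standalone sketch (not part of this back end) of what
   fsmbi itself computes, which is why IC_FSMBI covers any constant whose
   bytes are only 0x00 and 0xff: each bit of the 16-bit immediate selects
   0xff or 0x00 for the corresponding byte.  The example_* name is
   hypothetical, and the assumed bit ordering is that bit 0 of the
   immediate controls the most-significant byte.  */
#if 0
static void
example_fsmbi (unsigned int imm16, unsigned char out[16])
{
  int i;
  for (i = 0; i < 16; i++)
    out[i] = (imm16 & (0x8000u >> i)) ? 0xff : 0x00;
}
#endif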
5096 
5097 /* Return TRUE if x is a CONST_INT, CONST_DOUBLE or CONST_VECTOR that
5098    can be generated using the cbd, chd, cwd or cdd instruction. */
5099 int
5100 cpat_const_p (rtx x, enum machine_mode mode)
5101 {
5102   if (CONSTANT_P (x))
5103     {
5104       enum immediate_class c = classify_immediate (x, mode);
5105       return c == IC_CPAT;
5106     }
5107   return 0;
5108 }
5109 
5110 rtx
5111 gen_cpat_const (rtx * ops)
5112 {
5113   unsigned char dst[16];
5114   int i, offset, shift, isize;
5115   if (GET_CODE (ops[3]) != CONST_INT
5116       || GET_CODE (ops[2]) != CONST_INT
5117       || (GET_CODE (ops[1]) != CONST_INT
5118 	  && GET_CODE (ops[1]) != REG))
5119     return 0;
5120   if (GET_CODE (ops[1]) == REG
5121       && (!REG_POINTER (ops[1])
5122 	  || REGNO_POINTER_ALIGN (ORIGINAL_REGNO (ops[1])) < 128))
5123     return 0;
5124 
5125   for (i = 0; i < 16; i++)
5126     dst[i] = i + 16;
5127   isize = INTVAL (ops[3]);
5128   if (isize == 1)
5129     shift = 3;
5130   else if (isize == 2)
5131     shift = 2;
5132   else
5133     shift = 0;
5134   offset = (INTVAL (ops[2]) +
5135 	    (GET_CODE (ops[1]) ==
5136 	     CONST_INT ? INTVAL (ops[1]) : 0)) & 15;
5137   for (i = 0; i < isize; i++)
5138     dst[offset + i] = i + shift;
5139   return array_to_constant (TImode, dst);
5140 }
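
/* Illustrative, standalone sketch (not part of this back end) of the
   shuffle pattern built above for the generate-controls-for-insertion
   instructions (cbd/chd/cwd/cdd): bytes 16..31 keep the old quadword,
   and the ISIZE bytes at OFFSET pick the new element out of its
   preferred slot.  The example_* name is hypothetical, and OFFSET is
   assumed to be a multiple of ISIZE so the writes stay inside the
   pattern.  */
#if 0
static void
example_cpat (int offset, int isize, unsigned char dst[16])
{
  int i, shift = isize == 1 ? 3 : isize == 2 ? 2 : 0;
  for (i = 0; i < 16; i++)
    dst[i] = i + 16;			/* keep byte i of the old quadword */
  for (i = 0; i < isize; i++)
    dst[offset + i] = i + shift;	/* insert the new element's bytes */
}
#endif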
5141 
5142 /* Convert a CONST_INT, CONST_DOUBLE, or CONST_VECTOR into a 16 byte
5143    array.  Use MODE for CONST_INT's.  When the constant's mode is smaller
5144    than 16 bytes, the value is repeated across the rest of the array. */
5145 void
5146 constant_to_array (enum machine_mode mode, rtx x, unsigned char arr[16])
5147 {
5148   HOST_WIDE_INT val;
5149   int i, j, first;
5150 
5151   memset (arr, 0, 16);
5152   mode = GET_MODE (x) != VOIDmode ? GET_MODE (x) : mode;
5153   if (GET_CODE (x) == CONST_INT
5154       || (GET_CODE (x) == CONST_DOUBLE
5155 	  && (mode == SFmode || mode == DFmode)))
5156     {
5157       gcc_assert (mode != VOIDmode && mode != BLKmode);
5158 
5159       if (GET_CODE (x) == CONST_DOUBLE)
5160 	val = const_double_to_hwint (x);
5161       else
5162 	val = INTVAL (x);
5163       first = GET_MODE_SIZE (mode) - 1;
5164       for (i = first; i >= 0; i--)
5165 	{
5166 	  arr[i] = val & 0xff;
5167 	  val >>= 8;
5168 	}
5169       /* Splat the constant across the whole array. */
5170       for (j = 0, i = first + 1; i < 16; i++)
5171 	{
5172 	  arr[i] = arr[j];
5173 	  j = (j == first) ? 0 : j + 1;
5174 	}
5175     }
5176   else if (GET_CODE (x) == CONST_DOUBLE)
5177     {
5178       val = CONST_DOUBLE_LOW (x);
5179       for (i = 15; i >= 8; i--)
5180 	{
5181 	  arr[i] = val & 0xff;
5182 	  val >>= 8;
5183 	}
5184       val = CONST_DOUBLE_HIGH (x);
5185       for (i = 7; i >= 0; i--)
5186 	{
5187 	  arr[i] = val & 0xff;
5188 	  val >>= 8;
5189 	}
5190     }
5191   else if (GET_CODE (x) == CONST_VECTOR)
5192     {
5193       int units;
5194       rtx elt;
5195       mode = GET_MODE_INNER (mode);
5196       units = CONST_VECTOR_NUNITS (x);
5197       for (i = 0; i < units; i++)
5198 	{
5199 	  elt = CONST_VECTOR_ELT (x, i);
5200 	  if (GET_CODE (elt) == CONST_INT || GET_CODE (elt) == CONST_DOUBLE)
5201 	    {
5202 	      if (GET_CODE (elt) == CONST_DOUBLE)
5203 		val = const_double_to_hwint (elt);
5204 	      else
5205 		val = INTVAL (elt);
5206 	      first = GET_MODE_SIZE (mode) - 1;
5207 	      if (first + i * GET_MODE_SIZE (mode) > 16)
5208 		abort ();
5209 	      for (j = first; j >= 0; j--)
5210 		{
5211 		  arr[j + i * GET_MODE_SIZE (mode)] = val & 0xff;
5212 		  val >>= 8;
5213 		}
5214 	    }
5215 	}
5216     }
5217   else
5218     gcc_unreachable();
5219 }
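
/* For example, (const_int 0x1234) with MODE == HImode first fills bytes 0-1
   with 12 34 (hex) and then splats them across the array:

     arr = { 12 34 12 34 12 34 12 34 12 34 12 34 12 34 12 34 }

   which is also the byte image of the corresponding V8HImode splat.  */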
5220 
5221 /* Convert a 16 byte array to a constant of mode MODE.  When MODE is
5222    smaller than 16 bytes, use the bytes that would represent that value
5223    in a register, e.g., for QImode return the value of arr[3].  */
5224 rtx
5225 array_to_constant (enum machine_mode mode, const unsigned char arr[16])
5226 {
5227   enum machine_mode inner_mode;
5228   rtvec v;
5229   int units, size, i, j, k;
5230   HOST_WIDE_INT val;
5231 
5232   if (GET_MODE_CLASS (mode) == MODE_INT
5233       && GET_MODE_BITSIZE (mode) <= HOST_BITS_PER_WIDE_INT)
5234     {
5235       j = GET_MODE_SIZE (mode);
5236       i = j < 4 ? 4 - j : 0;
5237       for (val = 0; i < j; i++)
5238 	val = (val << 8) | arr[i];
5239       val = trunc_int_for_mode (val, mode);
5240       return GEN_INT (val);
5241     }
5242 
5243   if (mode == TImode)
5244     {
5245       HOST_WIDE_INT high;
5246       for (i = high = 0; i < 8; i++)
5247 	high = (high << 8) | arr[i];
5248       for (i = 8, val = 0; i < 16; i++)
5249 	val = (val << 8) | arr[i];
5250       return immed_double_const (val, high, TImode);
5251     }
5252   if (mode == SFmode)
5253     {
5254       val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
5255       val = trunc_int_for_mode (val, SImode);
5256       return hwint_to_const_double (SFmode, val);
5257     }
5258   if (mode == DFmode)
5259     {
5260       for (i = 0, val = 0; i < 8; i++)
5261 	val = (val << 8) | arr[i];
5262       return hwint_to_const_double (DFmode, val);
5263     }
5264 
5265   if (!VECTOR_MODE_P (mode))
5266     abort ();
5267 
5268   units = GET_MODE_NUNITS (mode);
5269   size = GET_MODE_UNIT_SIZE (mode);
5270   inner_mode = GET_MODE_INNER (mode);
5271   v = rtvec_alloc (units);
5272 
5273   for (k = i = 0; i < units; ++i)
5274     {
5275       val = 0;
5276       for (j = 0; j < size; j++, k++)
5277 	val = (val << 8) | arr[k];
5278 
5279       if (GET_MODE_CLASS (inner_mode) == MODE_FLOAT)
5280 	RTVEC_ELT (v, i) = hwint_to_const_double (inner_mode, val);
5281       else
5282 	RTVEC_ELT (v, i) = GEN_INT (trunc_int_for_mode (val, inner_mode));
5283     }
5284   if (k > 16)
5285     abort ();
5286 
5287   return gen_rtx_CONST_VECTOR (mode, v);
5288 }
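
/* This is the byte-level inverse of constant_to_array; e.g. an array of
   repeating 12 34 bytes converts back to (const_int 0x1234) when MODE is
   HImode (only bytes 2 and 3, the tail of the first word slot, are read),
   or to a V8HImode vector with every element 0x1234 when MODE is
   V8HImode.  */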
5289 
5290 static void
5291 reloc_diagnostic (rtx x)
5292 {
5293   tree decl = 0;
5294   if (!flag_pic || !(TARGET_WARN_RELOC || TARGET_ERROR_RELOC))
5295     return;
5296 
5297   if (GET_CODE (x) == SYMBOL_REF)
5298     decl = SYMBOL_REF_DECL (x);
5299   else if (GET_CODE (x) == CONST
5300 	   && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5301     decl = SYMBOL_REF_DECL (XEXP (XEXP (x, 0), 0));
5302 
5303   /* SYMBOL_REF_DECL is not necessarily a DECL. */
5304   if (decl && !DECL_P (decl))
5305     decl = 0;
5306 
5307   /* The decl could be a string constant.  */
5308   if (decl && DECL_P (decl))
5309     {
5310       location_t loc;
5311       /* We use last_assemble_variable_decl to get line information.  It's
5312 	 not always going to be right and might not even be close, but will
5313 	 be right for the more common cases. */
5314       if (!last_assemble_variable_decl || in_section == ctors_section)
5315 	loc = DECL_SOURCE_LOCATION (decl);
5316       else
5317 	loc = DECL_SOURCE_LOCATION (last_assemble_variable_decl);
5318 
5319       if (TARGET_WARN_RELOC)
5320 	warning_at (loc, 0,
5321 		    "creating run-time relocation for %qD", decl);
5322       else
5323 	error_at (loc,
5324 		  "creating run-time relocation for %qD", decl);
5325     }
5326   else
5327     {
5328       if (TARGET_WARN_RELOC)
5329 	warning_at (input_location, 0, "creating run-time relocation");
5330       else
5331 	error_at (input_location, "creating run-time relocation");
5332     }
5333 }
5334 
5335 /* Hook into assemble_integer so we can generate an error for run-time
5336    relocations.  The SPU ABI disallows them. */
5337 static bool
5338 spu_assemble_integer (rtx x, unsigned int size, int aligned_p)
5339 {
5340   /* By default run-time relocations aren't supported, but we allow them
5341      in case users support them in their own run-time loader.  And we provide
5342      a warning for those users who don't.  */
5343   if ((GET_CODE (x) == SYMBOL_REF)
5344       || GET_CODE (x) == LABEL_REF || GET_CODE (x) == CONST)
5345     reloc_diagnostic (x);
5346 
5347   return default_assemble_integer (x, size, aligned_p);
5348 }
5349 
5350 static void
5351 spu_asm_globalize_label (FILE * file, const char *name)
5352 {
5353   fputs ("\t.global\t", file);
5354   assemble_name (file, name);
5355   fputs ("\n", file);
5356 }
5357 
5358 static bool
5359 spu_rtx_costs (rtx x, int code, int outer_code ATTRIBUTE_UNUSED, int *total,
5360 	       bool speed ATTRIBUTE_UNUSED)
5361 {
5362   enum machine_mode mode = GET_MODE (x);
5363   int cost = COSTS_N_INSNS (2);
5364 
5365   /* Folding to a CONST_VECTOR will use extra space but there might
5366      be only a small savings in cycles.  We'd like to use a CONST_VECTOR
5367      only if it allows us to fold away multiple insns.  Changing the cost
5368      of a CONST_VECTOR here (or in CONST_COSTS) doesn't help though
5369      because this cost will only be compared against a single insn.
5370      if (code == CONST_VECTOR)
5371        return (LEGITIMATE_CONSTANT_P(x)) ? cost : COSTS_N_INSNS(6);
5372    */
5373 
5374   /* Use defaults for float operations.  Not accurate but good enough. */
5375   if (mode == DFmode)
5376     {
5377       *total = COSTS_N_INSNS (13);
5378       return true;
5379     }
5380   if (mode == SFmode)
5381     {
5382       *total = COSTS_N_INSNS (6);
5383       return true;
5384     }
5385   switch (code)
5386     {
5387     case CONST_INT:
5388       if (satisfies_constraint_K (x))
5389 	*total = 0;
5390       else if (INTVAL (x) >= -0x80000000ll && INTVAL (x) <= 0xffffffffll)
5391 	*total = COSTS_N_INSNS (1);
5392       else
5393 	*total = COSTS_N_INSNS (3);
5394       return true;
5395 
5396     case CONST:
5397       *total = COSTS_N_INSNS (3);
5398       return true;
5399 
5400     case LABEL_REF:
5401     case SYMBOL_REF:
5402       *total = COSTS_N_INSNS (0);
5403       return true;
5404 
5405     case CONST_DOUBLE:
5406       *total = COSTS_N_INSNS (5);
5407       return true;
5408 
5409     case FLOAT_EXTEND:
5410     case FLOAT_TRUNCATE:
5411     case FLOAT:
5412     case UNSIGNED_FLOAT:
5413     case FIX:
5414     case UNSIGNED_FIX:
5415       *total = COSTS_N_INSNS (7);
5416       return true;
5417 
5418     case PLUS:
5419       if (mode == TImode)
5420 	{
5421 	  *total = COSTS_N_INSNS (9);
5422 	  return true;
5423 	}
5424       break;
5425 
5426     case MULT:
5427       cost =
5428 	GET_CODE (XEXP (x, 0)) ==
5429 	REG ? COSTS_N_INSNS (12) : COSTS_N_INSNS (7);
5430       if (mode == SImode && GET_CODE (XEXP (x, 0)) == REG)
5431 	{
5432 	  if (GET_CODE (XEXP (x, 1)) == CONST_INT)
5433 	    {
5434 	      HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
5435 	      cost = COSTS_N_INSNS (14);
5436 	      if ((val & 0xffff) == 0)
5437 		cost = COSTS_N_INSNS (9);
5438 	      else if (val > 0 && val < 0x10000)
5439 		cost = COSTS_N_INSNS (11);
5440 	    }
5441 	}
5442       *total = cost;
5443       return true;
5444     case DIV:
5445     case UDIV:
5446     case MOD:
5447     case UMOD:
5448       *total = COSTS_N_INSNS (20);
5449       return true;
5450     case ROTATE:
5451     case ROTATERT:
5452     case ASHIFT:
5453     case ASHIFTRT:
5454     case LSHIFTRT:
5455       *total = COSTS_N_INSNS (4);
5456       return true;
5457     case UNSPEC:
5458       if (XINT (x, 1) == UNSPEC_CONVERT)
5459 	*total = COSTS_N_INSNS (0);
5460       else
5461 	*total = COSTS_N_INSNS (4);
5462       return true;
5463     }
5464   /* Scale cost by mode size, except when initializing (cfun->decl == 0).  */
5465   if (GET_MODE_CLASS (mode) == MODE_INT
5466       && GET_MODE_SIZE (mode) > GET_MODE_SIZE (SImode) && cfun && cfun->decl)
5467     cost = cost * (GET_MODE_SIZE (mode) / GET_MODE_SIZE (SImode))
5468       * (GET_MODE_SIZE (mode) / GET_MODE_SIZE (SImode));
5469   *total = cost;
5470   return true;
5471 }
5472 
5473 static enum machine_mode
5474 spu_unwind_word_mode (void)
5475 {
5476   return SImode;
5477 }
5478 
5479 /* Decide whether we can make a sibling call to a function.  DECL is the
5480    declaration of the function being targeted by the call and EXP is the
5481    CALL_EXPR representing the call.  */
5482 static bool
5483 spu_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
5484 {
5485   return decl && !TARGET_LARGE_MEM;
5486 }
5487 
5488 /* We need to correctly update the back chain pointer and the Available
5489    Stack Size (which is in the second slot of the sp register).  */
5490 void
5491 spu_allocate_stack (rtx op0, rtx op1)
5492 {
5493   HOST_WIDE_INT v;
5494   rtx chain = gen_reg_rtx (V4SImode);
5495   rtx stack_bot = gen_frame_mem (V4SImode, stack_pointer_rtx);
5496   rtx sp = gen_reg_rtx (V4SImode);
5497   rtx splatted = gen_reg_rtx (V4SImode);
5498   rtx pat = gen_reg_rtx (TImode);
5499 
5500   /* copy the back chain so we can save it back again. */
5501   /* Copy the back chain so we can store it back at the new stack bottom. */
5502 
5503   op1 = force_reg (SImode, op1);
5504 
5505   v = 0x1020300010203ll;
5506   emit_move_insn (pat, immed_double_const (v, v, TImode));
5507   emit_insn (gen_shufb (splatted, op1, op1, pat));
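  /* The pattern 00 01 02 03 repeated four times copies the SImode value in
     op1's preferred slot into every word slot, so splatted now holds four
     copies of the allocation size and the single subv4si below adjusts both
     the stack pointer (word 0) and the Available Stack Size (word 1).  */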
5508 
5509   emit_insn (gen_spu_convert (sp, stack_pointer_rtx));
5510   emit_insn (gen_subv4si3 (sp, sp, splatted));
5511 
5512   if (flag_stack_check)
5513     {
5514       rtx avail = gen_reg_rtx(SImode);
5515       rtx result = gen_reg_rtx(SImode);
5516       emit_insn (gen_vec_extractv4si (avail, sp, GEN_INT (1)));
5517       emit_insn (gen_cgt_si(result, avail, GEN_INT (-1)));
5518       emit_insn (gen_spu_heq (result, GEN_INT(0) ));
5519     }
5520 
5521   emit_insn (gen_spu_convert (stack_pointer_rtx, sp));
5522 
5523   emit_move_insn (stack_bot, chain);
5524 
5525   emit_move_insn (op0, virtual_stack_dynamic_rtx);
5526 }
5527 
5528 void
5529 spu_restore_stack_nonlocal (rtx op0 ATTRIBUTE_UNUSED, rtx op1)
5530 {
5531   static unsigned char arr[16] =
5532     { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 };
5533   rtx temp = gen_reg_rtx (SImode);
5534   rtx temp2 = gen_reg_rtx (SImode);
5535   rtx temp3 = gen_reg_rtx (V4SImode);
5536   rtx temp4 = gen_reg_rtx (V4SImode);
5537   rtx pat = gen_reg_rtx (TImode);
5538   rtx sp = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM);
5539 
5540   /* Restore the backchain from the first word, sp from the second.  */
5541   emit_move_insn (temp2, adjust_address_nv (op1, SImode, 0));
5542   emit_move_insn (temp, adjust_address_nv (op1, SImode, 4));
5543 
5544   emit_move_insn (pat, array_to_constant (TImode, arr));
5545 
5546   /* Compute Available Stack Size for sp */
5547   emit_insn (gen_subsi3 (temp, temp, stack_pointer_rtx));
5548   emit_insn (gen_shufb (temp3, temp, temp, pat));
5549 
5550   /* Compute Available Stack Size for back chain */
5551   emit_insn (gen_subsi3 (temp2, temp2, stack_pointer_rtx));
5552   emit_insn (gen_shufb (temp4, temp2, temp2, pat));
5553   emit_insn (gen_addv4si3 (temp4, sp, temp4));
5554 
5555   emit_insn (gen_addv4si3 (sp, sp, temp3));
5556   emit_move_insn (gen_frame_mem (V4SImode, stack_pointer_rtx), temp4);
5557 }
5558 
5559 static void
5560 spu_init_libfuncs (void)
5561 {
5562   set_optab_libfunc (smul_optab, DImode, "__muldi3");
5563   set_optab_libfunc (sdiv_optab, DImode, "__divdi3");
5564   set_optab_libfunc (smod_optab, DImode, "__moddi3");
5565   set_optab_libfunc (udiv_optab, DImode, "__udivdi3");
5566   set_optab_libfunc (umod_optab, DImode, "__umoddi3");
5567   set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
5568   set_optab_libfunc (ffs_optab, DImode, "__ffsdi2");
5569   set_optab_libfunc (clz_optab, DImode, "__clzdi2");
5570   set_optab_libfunc (ctz_optab, DImode, "__ctzdi2");
5571   set_optab_libfunc (popcount_optab, DImode, "__popcountdi2");
5572   set_optab_libfunc (parity_optab, DImode, "__paritydi2");
5573 
5574   set_conv_libfunc (ufloat_optab, DFmode, SImode, "__float_unssidf");
5575   set_conv_libfunc (ufloat_optab, DFmode, DImode, "__float_unsdidf");
5576 
5577   set_optab_libfunc (smul_optab, TImode, "__multi3");
5578   set_optab_libfunc (sdiv_optab, TImode, "__divti3");
5579   set_optab_libfunc (smod_optab, TImode, "__modti3");
5580   set_optab_libfunc (udiv_optab, TImode, "__udivti3");
5581   set_optab_libfunc (umod_optab, TImode, "__umodti3");
5582   set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
5583 }
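
/* With these entries in place, 64-bit and 128-bit integer operations that
   have no SPU instruction are emitted as libcalls; for example

     long long f (long long a, long long b) { return a / b; }

   compiles to a call to __divdi3 in libgcc rather than to inline code.  */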
5584 
5585 /* Make a subreg, stripping any existing subreg.  We could possibly just
5586    call simplify_subreg, but in this case we know what we want. */
5587 rtx
5588 spu_gen_subreg (enum machine_mode mode, rtx x)
5589 {
5590   if (GET_CODE (x) == SUBREG)
5591     x = SUBREG_REG (x);
5592   if (GET_MODE (x) == mode)
5593     return x;
5594   return gen_rtx_SUBREG (mode, x, 0);
5595 }
5596 
5597 static bool
5598 spu_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
5599 {
5600   return (TYPE_MODE (type) == BLKmode
5601 	  && ((type) == 0
5602 	      || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST
5603 	      || int_size_in_bytes (type) >
5604 	      (MAX_REGISTER_RETURN * UNITS_PER_WORD)));
5605 }
5606 
5607 /* Create the built-in types and functions */
5608 
5609 enum spu_function_code
5610 {
5611 #define DEF_BUILTIN(fcode, icode, name, type, params) fcode,
5612 #include "spu-builtins.def"
5613 #undef DEF_BUILTIN
5614    NUM_SPU_BUILTINS
5615 };
5616 
5617 extern GTY(()) struct spu_builtin_description spu_builtins[NUM_SPU_BUILTINS];
5618 
5619 struct spu_builtin_description spu_builtins[] = {
5620 #define DEF_BUILTIN(fcode, icode, name, type, params) \
5621   {fcode, icode, name, type, params, NULL_TREE},
5622 #include "spu-builtins.def"
5623 #undef DEF_BUILTIN
5624 };
5625 
5626 /* Returns the SPU builtin decl for CODE.  */
5627 
5628 static tree
5629 spu_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
5630 {
5631   if (code >= NUM_SPU_BUILTINS)
5632     return error_mark_node;
5633 
5634   return spu_builtins[code].fndecl;
5635 }
5636 
5637 
5638 static void
5639 spu_init_builtins (void)
5640 {
5641   struct spu_builtin_description *d;
5642   unsigned int i;
5643 
5644   V16QI_type_node = build_vector_type (intQI_type_node, 16);
5645   V8HI_type_node = build_vector_type (intHI_type_node, 8);
5646   V4SI_type_node = build_vector_type (intSI_type_node, 4);
5647   V2DI_type_node = build_vector_type (intDI_type_node, 2);
5648   V4SF_type_node = build_vector_type (float_type_node, 4);
5649   V2DF_type_node = build_vector_type (double_type_node, 2);
5650 
5651   unsigned_V16QI_type_node = build_vector_type (unsigned_intQI_type_node, 16);
5652   unsigned_V8HI_type_node = build_vector_type (unsigned_intHI_type_node, 8);
5653   unsigned_V4SI_type_node = build_vector_type (unsigned_intSI_type_node, 4);
5654   unsigned_V2DI_type_node = build_vector_type (unsigned_intDI_type_node, 2);
5655 
5656   spu_builtin_types[SPU_BTI_QUADWORD] = V16QI_type_node;
5657 
5658   spu_builtin_types[SPU_BTI_7] = global_trees[TI_INTSI_TYPE];
5659   spu_builtin_types[SPU_BTI_S7] = global_trees[TI_INTSI_TYPE];
5660   spu_builtin_types[SPU_BTI_U7] = global_trees[TI_INTSI_TYPE];
5661   spu_builtin_types[SPU_BTI_S10] = global_trees[TI_INTSI_TYPE];
5662   spu_builtin_types[SPU_BTI_S10_4] = global_trees[TI_INTSI_TYPE];
5663   spu_builtin_types[SPU_BTI_U14] = global_trees[TI_INTSI_TYPE];
5664   spu_builtin_types[SPU_BTI_16] = global_trees[TI_INTSI_TYPE];
5665   spu_builtin_types[SPU_BTI_S16] = global_trees[TI_INTSI_TYPE];
5666   spu_builtin_types[SPU_BTI_S16_2] = global_trees[TI_INTSI_TYPE];
5667   spu_builtin_types[SPU_BTI_U16] = global_trees[TI_INTSI_TYPE];
5668   spu_builtin_types[SPU_BTI_U16_2] = global_trees[TI_INTSI_TYPE];
5669   spu_builtin_types[SPU_BTI_U18] = global_trees[TI_INTSI_TYPE];
5670 
5671   spu_builtin_types[SPU_BTI_INTQI] = global_trees[TI_INTQI_TYPE];
5672   spu_builtin_types[SPU_BTI_INTHI] = global_trees[TI_INTHI_TYPE];
5673   spu_builtin_types[SPU_BTI_INTSI] = global_trees[TI_INTSI_TYPE];
5674   spu_builtin_types[SPU_BTI_INTDI] = global_trees[TI_INTDI_TYPE];
5675   spu_builtin_types[SPU_BTI_UINTQI] = global_trees[TI_UINTQI_TYPE];
5676   spu_builtin_types[SPU_BTI_UINTHI] = global_trees[TI_UINTHI_TYPE];
5677   spu_builtin_types[SPU_BTI_UINTSI] = global_trees[TI_UINTSI_TYPE];
5678   spu_builtin_types[SPU_BTI_UINTDI] = global_trees[TI_UINTDI_TYPE];
5679 
5680   spu_builtin_types[SPU_BTI_FLOAT] = global_trees[TI_FLOAT_TYPE];
5681   spu_builtin_types[SPU_BTI_DOUBLE] = global_trees[TI_DOUBLE_TYPE];
5682 
5683   spu_builtin_types[SPU_BTI_VOID] = global_trees[TI_VOID_TYPE];
5684 
5685   spu_builtin_types[SPU_BTI_PTR] =
5686     build_pointer_type (build_qualified_type
5687 			(void_type_node,
5688 			 TYPE_QUAL_CONST | TYPE_QUAL_VOLATILE));
5689 
5690   /* For each builtin we build a new prototype.  The tree code will make
5691      sure nodes are shared. */
5692   for (i = 0, d = spu_builtins; i < NUM_SPU_BUILTINS; i++, d++)
5693     {
5694       tree p;
5695       char name[64];		/* add_builtin_function will make a copy.  */
5696       int parm;
5697 
5698       if (d->name == 0)
5699 	continue;
5700 
5701       /* Find last parm.  */
5702       for (parm = 1; d->parm[parm] != SPU_BTI_END_OF_PARAMS; parm++)
5703 	;
5704 
5705       p = void_list_node;
5706       while (parm > 1)
5707 	p = tree_cons (NULL_TREE, spu_builtin_types[d->parm[--parm]], p);
5708 
5709       p = build_function_type (spu_builtin_types[d->parm[0]], p);
5710 
5711       sprintf (name, "__builtin_%s", d->name);
5712       d->fndecl =
5713 	add_builtin_function (name, p, END_BUILTINS + i, BUILT_IN_MD,
5714 			      NULL, NULL_TREE);
5715       if (d->fcode == SPU_MASK_FOR_LOAD)
5716 	TREE_READONLY (d->fndecl) = 1;
5717 
5718       /* These builtins don't throw.  */
5719       TREE_NOTHROW (d->fndecl) = 1;
5720     }
5721 }
5722 
5723 void
5724 spu_restore_stack_block (rtx op0 ATTRIBUTE_UNUSED, rtx op1)
5725 {
5726   static unsigned char arr[16] =
5727     { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 };
5728 
5729   rtx temp = gen_reg_rtx (Pmode);
5730   rtx temp2 = gen_reg_rtx (V4SImode);
5731   rtx temp3 = gen_reg_rtx (V4SImode);
5732   rtx pat = gen_reg_rtx (TImode);
5733   rtx sp = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM);
5734 
5735   emit_move_insn (pat, array_to_constant (TImode, arr));
5736 
5737   /* Restore the sp.  */
5738   emit_move_insn (temp, op1);
5739   emit_move_insn (temp2, gen_frame_mem (V4SImode, stack_pointer_rtx));
5740 
5741   /* Compute available stack size for sp.  */
5742   emit_insn (gen_subsi3 (temp, temp, stack_pointer_rtx));
5743   emit_insn (gen_shufb (temp3, temp, temp, pat));
5744 
5745   emit_insn (gen_addv4si3 (sp, sp, temp3));
5746   emit_move_insn (gen_frame_mem (V4SImode, stack_pointer_rtx), temp2);
5747 }
5748 
5749 int
5750 spu_safe_dma (HOST_WIDE_INT channel)
5751 {
5752   return TARGET_SAFE_DMA && channel >= 21 && channel <= 27;
5753 }
5754 
5755 void
5756 spu_builtin_splats (rtx ops[])
5757 {
5758   enum machine_mode mode = GET_MODE (ops[0]);
5759   if (GET_CODE (ops[1]) == CONST_INT || GET_CODE (ops[1]) == CONST_DOUBLE)
5760     {
5761       unsigned char arr[16];
5762       constant_to_array (GET_MODE_INNER (mode), ops[1], arr);
5763       emit_move_insn (ops[0], array_to_constant (mode, arr));
5764     }
5765   else
5766     {
5767       rtx reg = gen_reg_rtx (TImode);
5768       rtx shuf;
5769       if (GET_CODE (ops[1]) != REG
5770 	  && GET_CODE (ops[1]) != SUBREG)
5771 	ops[1] = force_reg (GET_MODE_INNER (mode), ops[1]);
5772       switch (mode)
5773 	{
5774 	case V2DImode:
5775 	case V2DFmode:
5776 	  shuf =
5777 	    immed_double_const (0x0001020304050607ll, 0x1011121314151617ll,
5778 				TImode);
5779 	  break;
5780 	case V4SImode:
5781 	case V4SFmode:
5782 	  shuf =
5783 	    immed_double_const (0x0001020300010203ll, 0x0001020300010203ll,
5784 				TImode);
5785 	  break;
5786 	case V8HImode:
5787 	  shuf =
5788 	    immed_double_const (0x0203020302030203ll, 0x0203020302030203ll,
5789 				TImode);
5790 	  break;
5791 	case V16QImode:
5792 	  shuf =
5793 	    immed_double_const (0x0303030303030303ll, 0x0303030303030303ll,
5794 				TImode);
5795 	  break;
5796 	default:
5797 	  abort ();
5798 	}
5799       emit_move_insn (reg, shuf);
5800       emit_insn (gen_shufb (ops[0], ops[1], ops[1], reg));
5801     }
5802 }
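
/* The shuffle patterns above all re-select the element sitting in the
   preferred slot of ops[1]: e.g. the V4SI pattern 00 01 02 03 repeated four
   times copies word 0 into every word slot, and the V16QI pattern of all 03
   bytes copies byte 3, the preferred byte slot, into every byte.  */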
5803 
5804 void
5805 spu_builtin_extract (rtx ops[])
5806 {
5807   enum machine_mode mode;
5808   rtx rot, from, tmp;
5809 
5810   mode = GET_MODE (ops[1]);
5811 
5812   if (GET_CODE (ops[2]) == CONST_INT)
5813     {
5814       switch (mode)
5815 	{
5816 	case V16QImode:
5817 	  emit_insn (gen_vec_extractv16qi (ops[0], ops[1], ops[2]));
5818 	  break;
5819 	case V8HImode:
5820 	  emit_insn (gen_vec_extractv8hi (ops[0], ops[1], ops[2]));
5821 	  break;
5822 	case V4SFmode:
5823 	  emit_insn (gen_vec_extractv4sf (ops[0], ops[1], ops[2]));
5824 	  break;
5825 	case V4SImode:
5826 	  emit_insn (gen_vec_extractv4si (ops[0], ops[1], ops[2]));
5827 	  break;
5828 	case V2DImode:
5829 	  emit_insn (gen_vec_extractv2di (ops[0], ops[1], ops[2]));
5830 	  break;
5831 	case V2DFmode:
5832 	  emit_insn (gen_vec_extractv2df (ops[0], ops[1], ops[2]));
5833 	  break;
5834 	default:
5835 	  abort ();
5836 	}
5837       return;
5838     }
5839 
5840   from = spu_gen_subreg (TImode, ops[1]);
5841   rot = gen_reg_rtx (TImode);
5842   tmp = gen_reg_rtx (SImode);
5843 
5844   switch (mode)
5845     {
5846     case V16QImode:
5847       emit_insn (gen_addsi3 (tmp, ops[2], GEN_INT (-3)));
5848       break;
5849     case V8HImode:
5850       emit_insn (gen_addsi3 (tmp, ops[2], ops[2]));
5851       emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (-2)));
5852       break;
5853     case V4SFmode:
5854     case V4SImode:
5855       emit_insn (gen_ashlsi3 (tmp, ops[2], GEN_INT (2)));
5856       break;
5857     case V2DImode:
5858     case V2DFmode:
5859       emit_insn (gen_ashlsi3 (tmp, ops[2], GEN_INT (3)));
5860       break;
5861     default:
5862       abort ();
5863     }
5864   emit_insn (gen_rotqby_ti (rot, from, tmp));
5865 
5866   emit_insn (gen_spu_convert (ops[0], rot));
5867 }
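
/* When the element index is not a constant, the code above turns it into a
   byte rotate count for rotqby: e.g. for V4SImode the count is index * 4,
   which rotates the selected word into bytes 0-3 (the preferred slot) where
   spu_convert reads it; for V16QImode the count is index - 3 so the selected
   byte lands in byte 3, the preferred byte slot.  */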
5868 
5869 void
5870 spu_builtin_insert (rtx ops[])
5871 {
5872   enum machine_mode mode = GET_MODE (ops[0]);
5873   enum machine_mode imode = GET_MODE_INNER (mode);
5874   rtx mask = gen_reg_rtx (TImode);
5875   rtx offset;
5876 
5877   if (GET_CODE (ops[3]) == CONST_INT)
5878     offset = GEN_INT (INTVAL (ops[3]) * GET_MODE_SIZE (imode));
5879   else
5880     {
5881       offset = gen_reg_rtx (SImode);
5882       emit_insn (gen_mulsi3
5883 		 (offset, ops[3], GEN_INT (GET_MODE_SIZE (imode))));
5884     }
5885   emit_insn (gen_cpat
5886 	     (mask, stack_pointer_rtx, offset,
5887 	      GEN_INT (GET_MODE_SIZE (imode))));
5888   emit_insn (gen_shufb (ops[0], ops[1], ops[2], mask));
5889 }
5890 
5891 void
5892 spu_builtin_promote (rtx ops[])
5893 {
5894   enum machine_mode mode, imode;
5895   rtx rot, from, offset;
5896   HOST_WIDE_INT pos;
5897 
5898   mode = GET_MODE (ops[0]);
5899   imode = GET_MODE_INNER (mode);
5900 
5901   from = gen_reg_rtx (TImode);
5902   rot = spu_gen_subreg (TImode, ops[0]);
5903 
5904   emit_insn (gen_spu_convert (from, ops[1]));
5905 
5906   if (GET_CODE (ops[2]) == CONST_INT)
5907     {
5908       pos = -GET_MODE_SIZE (imode) * INTVAL (ops[2]);
5909       if (GET_MODE_SIZE (imode) < 4)
5910 	pos += 4 - GET_MODE_SIZE (imode);
5911       offset = GEN_INT (pos & 15);
5912     }
5913   else
5914     {
5915       offset = gen_reg_rtx (SImode);
5916       switch (mode)
5917 	{
5918 	case V16QImode:
5919 	  emit_insn (gen_subsi3 (offset, GEN_INT (3), ops[2]));
5920 	  break;
5921 	case V8HImode:
5922 	  emit_insn (gen_subsi3 (offset, GEN_INT (1), ops[2]));
5923 	  emit_insn (gen_addsi3 (offset, offset, offset));
5924 	  break;
5925 	case V4SFmode:
5926 	case V4SImode:
5927 	  emit_insn (gen_subsi3 (offset, GEN_INT (0), ops[2]));
5928 	  emit_insn (gen_ashlsi3 (offset, offset, GEN_INT (2)));
5929 	  break;
5930 	case V2DImode:
5931 	case V2DFmode:
5932 	  emit_insn (gen_ashlsi3 (offset, ops[2], GEN_INT (3)));
5933 	  break;
5934 	default:
5935 	  abort ();
5936 	}
5937     }
5938   emit_insn (gen_rotqby_ti (rot, from, offset));
5939 }
5940 
5941 static void
5942 spu_trampoline_init (rtx m_tramp, tree fndecl, rtx cxt)
5943 {
5944   rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
5945   rtx shuf = gen_reg_rtx (V4SImode);
5946   rtx insn = gen_reg_rtx (V4SImode);
5947   rtx shufc;
5948   rtx insnc;
5949   rtx mem;
5950 
5951   fnaddr = force_reg (SImode, fnaddr);
5952   cxt = force_reg (SImode, cxt);
5953 
5954   if (TARGET_LARGE_MEM)
5955     {
5956       rtx rotl = gen_reg_rtx (V4SImode);
5957       rtx mask = gen_reg_rtx (V4SImode);
5958       rtx bi = gen_reg_rtx (SImode);
5959       static unsigned char const shufa[16] = {
5960 	2, 3, 0, 1, 18, 19, 16, 17,
5961 	0, 1, 2, 3, 16, 17, 18, 19
5962       };
5963       static unsigned char const insna[16] = {
5964 	0x41, 0, 0, 79,
5965 	0x41, 0, 0, STATIC_CHAIN_REGNUM,
5966 	0x60, 0x80, 0, 79,
5967 	0x60, 0x80, 0, STATIC_CHAIN_REGNUM
5968       };
5969 
5970       shufc = force_reg (TImode, array_to_constant (TImode, shufa));
5971       insnc = force_reg (V4SImode, array_to_constant (V4SImode, insna));
5972 
5973       emit_insn (gen_shufb (shuf, fnaddr, cxt, shufc));
5974       emit_insn (gen_vrotlv4si3 (rotl, shuf, spu_const (V4SImode, 7)));
5975       emit_insn (gen_movv4si (mask, spu_const (V4SImode, 0xffff << 7)));
5976       emit_insn (gen_selb (insn, insnc, rotl, mask));
5977 
5978       mem = adjust_address (m_tramp, V4SImode, 0);
5979       emit_move_insn (mem, insn);
5980 
5981       emit_move_insn (bi, GEN_INT (0x35000000 + (79 << 7)));
5982       mem = adjust_address (m_tramp, Pmode, 16);
5983       emit_move_insn (mem, bi);
5984     }
5985   else
5986     {
5987       rtx scxt = gen_reg_rtx (SImode);
5988       rtx sfnaddr = gen_reg_rtx (SImode);
5989       static unsigned char const insna[16] = {
5990 	0x42, 0, 0, STATIC_CHAIN_REGNUM,
5991 	0x30, 0, 0, 0,
5992 	0, 0, 0, 0,
5993 	0, 0, 0, 0
5994       };
5995 
5996       shufc = gen_reg_rtx (TImode);
5997       insnc = force_reg (V4SImode, array_to_constant (V4SImode, insna));
5998 
5999       /* By or'ing all of cxt with the ila opcode we assume that cxt
6000 	 fits in 18 bits and that its last 4 bits are zero.  This will be
6001 	 true if the stack pointer is initialized to 0x3fff0 at program
6002 	 start; otherwise the ila instruction will be garbage. */
6003 
6004       emit_insn (gen_ashlsi3 (scxt, cxt, GEN_INT (7)));
6005       emit_insn (gen_ashlsi3 (sfnaddr, fnaddr, GEN_INT (5)));
6006       emit_insn (gen_cpat
6007 		 (shufc, stack_pointer_rtx, GEN_INT (4), GEN_INT (4)));
6008       emit_insn (gen_shufb (shuf, sfnaddr, scxt, shufc));
6009       emit_insn (gen_iorv4si3 (insn, insnc, shuf));
6010 
6011       mem = adjust_address (m_tramp, V4SImode, 0);
6012       emit_move_insn (mem, insn);
6013     }
6014   emit_insn (gen_sync ());
6015 }
6016 
6017 void
6018 spu_expand_sign_extend (rtx ops[])
6019 {
6020   unsigned char arr[16];
6021   rtx pat = gen_reg_rtx (TImode);
6022   rtx sign, c;
6023   int i, last;
6024   last = GET_MODE (ops[0]) == DImode ? 7 : 15;
6025   if (GET_MODE (ops[1]) == QImode)
6026     {
6027       sign = gen_reg_rtx (HImode);
6028       emit_insn (gen_extendqihi2 (sign, ops[1]));
6029       for (i = 0; i < 16; i++)
6030 	arr[i] = 0x12;
6031       arr[last] = 0x13;
6032     }
6033   else
6034     {
6035       for (i = 0; i < 16; i++)
6036 	arr[i] = 0x10;
6037       switch (GET_MODE (ops[1]))
6038 	{
6039 	case HImode:
6040 	  sign = gen_reg_rtx (SImode);
6041 	  emit_insn (gen_extendhisi2 (sign, ops[1]));
6042 	  arr[last] = 0x03;
6043 	  arr[last - 1] = 0x02;
6044 	  break;
6045 	case SImode:
6046 	  sign = gen_reg_rtx (SImode);
6047 	  emit_insn (gen_ashrsi3 (sign, ops[1], GEN_INT (31)));
6048 	  for (i = 0; i < 4; i++)
6049 	    arr[last - i] = 3 - i;
6050 	  break;
6051 	case DImode:
6052 	  sign = gen_reg_rtx (SImode);
6053 	  c = gen_reg_rtx (SImode);
6054 	  emit_insn (gen_spu_convert (c, ops[1]));
6055 	  emit_insn (gen_ashrsi3 (sign, c, GEN_INT (31)));
6056 	  for (i = 0; i < 8; i++)
6057 	    arr[last - i] = 7 - i;
6058 	  break;
6059 	default:
6060 	  abort ();
6061 	}
6062     }
6063   emit_move_insn (pat, array_to_constant (TImode, arr));
6064   emit_insn (gen_shufb (ops[0], ops[1], sign, pat));
6065 }
6066 
6067 /* Expand vector initialization.  If there are any constant parts,
6068    load the constant parts first.  Then load any non-constant parts.  */
6069 void
6070 spu_expand_vector_init (rtx target, rtx vals)
6071 {
6072   enum machine_mode mode = GET_MODE (target);
6073   int n_elts = GET_MODE_NUNITS (mode);
6074   int n_var = 0;
6075   bool all_same = true;
6076   rtx first, x = NULL_RTX, first_constant = NULL_RTX;
6077   int i;
6078 
6079   first = XVECEXP (vals, 0, 0);
6080   for (i = 0; i < n_elts; ++i)
6081     {
6082       x = XVECEXP (vals, 0, i);
6083       if (!(CONST_INT_P (x)
6084 	    || GET_CODE (x) == CONST_DOUBLE
6085 	    || GET_CODE (x) == CONST_FIXED))
6086 	++n_var;
6087       else
6088 	{
6089 	  if (first_constant == NULL_RTX)
6090 	    first_constant = x;
6091 	}
6092       if (i > 0 && !rtx_equal_p (x, first))
6093 	all_same = false;
6094     }
6095 
6096   /* If all elements are the same, use splats to repeat the element.  */
6097   if (all_same)
6098     {
6099       if (!CONSTANT_P (first)
6100 	  && !register_operand (first, GET_MODE (x)))
6101 	first = force_reg (GET_MODE (first), first);
6102       emit_insn (gen_spu_splats (target, first));
6103       return;
6104     }
6105 
6106   /* Load the constant parts.  */
6107   if (n_var != n_elts)
6108     {
6109       if (n_var == 0)
6110 	{
6111 	  emit_move_insn (target,
6112 			  gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
6113 	}
6114       else
6115 	{
6116 	  rtx constant_parts_rtx = copy_rtx (vals);
6117 
6118 	  gcc_assert (first_constant != NULL_RTX);
6119 	  /* Fill empty slots with the first constant; this increases
6120 	     our chance of using splats in the recursive call below.  */
6121 	  for (i = 0; i < n_elts; ++i)
6122 	    {
6123 	      x = XVECEXP (constant_parts_rtx, 0, i);
6124 	      if (!(CONST_INT_P (x)
6125 		    || GET_CODE (x) == CONST_DOUBLE
6126 		    || GET_CODE (x) == CONST_FIXED))
6127 		XVECEXP (constant_parts_rtx, 0, i) = first_constant;
6128 	    }
6129 
6130 	  spu_expand_vector_init (target, constant_parts_rtx);
6131 	}
6132     }
6133 
6134   /* Load the variable parts.  */
6135   if (n_var != 0)
6136     {
6137       rtx insert_operands[4];
6138 
6139       insert_operands[0] = target;
6140       insert_operands[2] = target;
6141       for (i = 0; i < n_elts; ++i)
6142 	{
6143 	  x = XVECEXP (vals, 0, i);
6144 	  if (!(CONST_INT_P (x)
6145 		|| GET_CODE (x) == CONST_DOUBLE
6146 		|| GET_CODE (x) == CONST_FIXED))
6147 	    {
6148 	      if (!register_operand (x, GET_MODE (x)))
6149 		x = force_reg (GET_MODE (x), x);
6150 	      insert_operands[1] = x;
6151 	      insert_operands[3] = GEN_INT (i);
6152 	      spu_builtin_insert (insert_operands);
6153 	    }
6154 	}
6155     }
6156 }
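
/* For example, initializing a V4SImode vector with { x, 1, 2, 3 } where x
   is not a constant first loads the constant vector { 1, 1, 2, 3 } (the
   variable slot is filled with the first constant so the recursive call can
   still use a splat in simpler cases) and then uses spu_builtin_insert to
   place x into element 0.  */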
6157 
6158 /* Return the insn index of the vector compare instruction for the given
6159    CODE, DEST_MODE and OP_MODE.  Return -1 if no valid insn is available.  */
6160 
6161 static int
6162 get_vec_cmp_insn (enum rtx_code code,
6163                   enum machine_mode dest_mode,
6164                   enum machine_mode op_mode)
6165 
6166 {
6167   switch (code)
6168     {
6169     case EQ:
6170       if (dest_mode == V16QImode && op_mode == V16QImode)
6171         return CODE_FOR_ceq_v16qi;
6172       if (dest_mode == V8HImode && op_mode == V8HImode)
6173         return CODE_FOR_ceq_v8hi;
6174       if (dest_mode == V4SImode && op_mode == V4SImode)
6175         return CODE_FOR_ceq_v4si;
6176       if (dest_mode == V4SImode && op_mode == V4SFmode)
6177         return CODE_FOR_ceq_v4sf;
6178       if (dest_mode == V2DImode && op_mode == V2DFmode)
6179         return CODE_FOR_ceq_v2df;
6180       break;
6181     case GT:
6182       if (dest_mode == V16QImode && op_mode == V16QImode)
6183         return CODE_FOR_cgt_v16qi;
6184       if (dest_mode == V8HImode && op_mode == V8HImode)
6185         return CODE_FOR_cgt_v8hi;
6186       if (dest_mode == V4SImode && op_mode == V4SImode)
6187         return CODE_FOR_cgt_v4si;
6188       if (dest_mode == V4SImode && op_mode == V4SFmode)
6189         return CODE_FOR_cgt_v4sf;
6190       if (dest_mode == V2DImode && op_mode == V2DFmode)
6191         return CODE_FOR_cgt_v2df;
6192       break;
6193     case GTU:
6194       if (dest_mode == V16QImode && op_mode == V16QImode)
6195         return CODE_FOR_clgt_v16qi;
6196       if (dest_mode == V8HImode && op_mode == V8HImode)
6197         return CODE_FOR_clgt_v8hi;
6198       if (dest_mode == V4SImode && op_mode == V4SImode)
6199         return CODE_FOR_clgt_v4si;
6200       break;
6201     default:
6202       break;
6203     }
6204   return -1;
6205 }
6206 
6207 /* Emit vector compare for operands OP0 and OP1 using code RCODE.
6208    DMODE is expected destination mode. This is a recursive function.  */
6209 
6210 static rtx
6211 spu_emit_vector_compare (enum rtx_code rcode,
6212                          rtx op0, rtx op1,
6213                          enum machine_mode dmode)
6214 {
6215   int vec_cmp_insn;
6216   rtx mask;
6217   enum machine_mode dest_mode;
6218   enum machine_mode op_mode = GET_MODE (op1);
6219 
6220   gcc_assert (GET_MODE (op0) == GET_MODE (op1));
6221 
6222   /* Single-precision floating point vector compares use a V4SImode destination.
6223      Double-precision floating point vector compares use a V2DImode destination.
6224      Move the destination to the appropriate mode later.  */
6225   if (dmode == V4SFmode)
6226     dest_mode = V4SImode;
6227   else if (dmode == V2DFmode)
6228     dest_mode = V2DImode;
6229   else
6230     dest_mode = dmode;
6231 
6232   mask = gen_reg_rtx (dest_mode);
6233   vec_cmp_insn = get_vec_cmp_insn (rcode, dest_mode, op_mode);
6234 
6235   if (vec_cmp_insn == -1)
6236     {
6237       bool swap_operands = false;
6238       bool try_again = false;
6239       switch (rcode)
6240         {
6241         case LT:
6242           rcode = GT;
6243           swap_operands = true;
6244           try_again = true;
6245           break;
6246         case LTU:
6247           rcode = GTU;
6248           swap_operands = true;
6249           try_again = true;
6250           break;
6251         case NE:
6252           /* Treat A != B as ~(A==B).  */
6253           {
6254             enum insn_code nor_code;
6255             rtx eq_rtx = spu_emit_vector_compare (EQ, op0, op1, dest_mode);
6256             nor_code = optab_handler (one_cmpl_optab, (int)dest_mode)->insn_code;
6257             gcc_assert (nor_code != CODE_FOR_nothing);
6258             emit_insn (GEN_FCN (nor_code) (mask, eq_rtx));
6259             if (dmode != dest_mode)
6260               {
6261                 rtx temp = gen_reg_rtx (dest_mode);
6262                 convert_move (temp, mask, 0);
6263                 return temp;
6264               }
6265             return mask;
6266           }
6267           break;
6268         case GE:
6269         case GEU:
6270         case LE:
6271         case LEU:
6272           /* Try GT/GTU/LT/LTU OR EQ */
6273           {
6274             rtx c_rtx, eq_rtx;
6275             enum insn_code ior_code;
6276             enum rtx_code new_code;
6277 
6278             switch (rcode)
6279               {
6280               case GE:  new_code = GT;  break;
6281               case GEU: new_code = GTU; break;
6282               case LE:  new_code = LT;  break;
6283               case LEU: new_code = LTU; break;
6284               default:
6285                 gcc_unreachable ();
6286               }
6287 
6288             c_rtx = spu_emit_vector_compare (new_code, op0, op1, dest_mode);
6289             eq_rtx = spu_emit_vector_compare (EQ, op0, op1, dest_mode);
6290 
6291             ior_code = optab_handler (ior_optab, (int)dest_mode)->insn_code;
6292             gcc_assert (ior_code != CODE_FOR_nothing);
6293             emit_insn (GEN_FCN (ior_code) (mask, c_rtx, eq_rtx));
6294             if (dmode != dest_mode)
6295               {
6296                 rtx temp = gen_reg_rtx (dest_mode);
6297                 convert_move (temp, mask, 0);
6298                 return temp;
6299               }
6300             return mask;
6301           }
6302           break;
6303         default:
6304           gcc_unreachable ();
6305         }
6306 
6307       /* You only get two chances.  */
6308       if (try_again)
6309           vec_cmp_insn = get_vec_cmp_insn (rcode, dest_mode, op_mode);
6310 
6311       gcc_assert (vec_cmp_insn != -1);
6312 
6313       if (swap_operands)
6314         {
6315           rtx tmp;
6316           tmp = op0;
6317           op0 = op1;
6318           op1 = tmp;
6319         }
6320     }
6321 
6322   emit_insn (GEN_FCN (vec_cmp_insn) (mask, op0, op1));
6323   if (dmode != dest_mode)
6324     {
6325       rtx temp = gen_reg_rtx (dest_mode);
6326       convert_move (temp, mask, 0);
6327       return temp;
6328     }
6329   return mask;
6330 }
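
/* For example, a V4SImode A >= B has no direct instruction in the table
   above, so it is synthesized as (A > B) | (A == B) using cgt and ceq
   followed by a vector ior, while A < B is handled by swapping the operands
   and retrying as GT.  */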
6331 
6332 
6333 /* Emit vector conditional expression.
6334    DEST is destination. OP1 and OP2 are two VEC_COND_EXPR operands.
6335    CC_OP0 and CC_OP1 are the two operands for the relation operation COND.  */
6336 
6337 int
6338 spu_emit_vector_cond_expr (rtx dest, rtx op1, rtx op2,
6339                            rtx cond, rtx cc_op0, rtx cc_op1)
6340 {
6341   enum machine_mode dest_mode = GET_MODE (dest);
6342   enum rtx_code rcode = GET_CODE (cond);
6343   rtx mask;
6344 
6345   /* Get the vector mask for the given relational operations.  */
6346   mask = spu_emit_vector_compare (rcode, cc_op0, cc_op1, dest_mode);
6347 
6348   emit_insn(gen_selb (dest, op2, op1, mask));
6349 
6350   return 1;
6351 }
6352 
6353 static rtx
6354 spu_force_reg (enum machine_mode mode, rtx op)
6355 {
6356   rtx x, r;
6357   if (GET_MODE (op) == VOIDmode || GET_MODE (op) == BLKmode)
6358     {
6359       if ((SCALAR_INT_MODE_P (mode) && GET_CODE (op) == CONST_INT)
6360 	  || GET_MODE (op) == BLKmode)
6361 	return force_reg (mode, convert_to_mode (mode, op, 0));
6362       abort ();
6363     }
6364 
6365   r = force_reg (GET_MODE (op), op);
6366   if (GET_MODE_SIZE (GET_MODE (op)) == GET_MODE_SIZE (mode))
6367     {
6368       x = simplify_gen_subreg (mode, r, GET_MODE (op), 0);
6369       if (x)
6370 	return x;
6371     }
6372 
6373   x = gen_reg_rtx (mode);
6374   emit_insn (gen_spu_convert (x, r));
6375   return x;
6376 }
6377 
6378 static void
6379 spu_check_builtin_parm (struct spu_builtin_description *d, rtx op, int p)
6380 {
6381   HOST_WIDE_INT v = 0;
6382   int lsbits;
6383   /* Check the range of immediate operands. */
6384   if (p >= SPU_BTI_7 && p <= SPU_BTI_U18)
6385     {
6386       int range = p - SPU_BTI_7;
6387 
6388       if (!CONSTANT_P (op))
6389 	error ("%s expects an integer literal in the range [%d, %d].",
6390 	       d->name,
6391 	       spu_builtin_range[range].low, spu_builtin_range[range].high);
6392 
6393       if (GET_CODE (op) == CONST
6394 	  && (GET_CODE (XEXP (op, 0)) == PLUS
6395 	      || GET_CODE (XEXP (op, 0)) == MINUS))
6396 	{
6397 	  v = INTVAL (XEXP (XEXP (op, 0), 1));
6398 	  op = XEXP (XEXP (op, 0), 0);
6399 	}
6400       else if (GET_CODE (op) == CONST_INT)
6401 	v = INTVAL (op);
6402       else if (GET_CODE (op) == CONST_VECTOR
6403 	       && GET_CODE (CONST_VECTOR_ELT (op, 0)) == CONST_INT)
6404 	v = INTVAL (CONST_VECTOR_ELT (op, 0));
6405 
6406       /* The default for v is 0 which is valid in every range. */
6407       if (v < spu_builtin_range[range].low
6408 	  || v > spu_builtin_range[range].high)
6409 	error ("%s expects an integer literal in the range [%d, %d]. ("
6410 	       HOST_WIDE_INT_PRINT_DEC ")",
6411 	       d->name,
6412 	       spu_builtin_range[range].low, spu_builtin_range[range].high,
6413 	       v);
6414 
6415       switch (p)
6416 	{
6417 	case SPU_BTI_S10_4:
6418 	  lsbits = 4;
6419 	  break;
6420 	case SPU_BTI_U16_2:
6421 	  /* This is only used in lqa and stqa.  Even though the insns
6422 	     encode 16 bits of the address (all but the 2 least
6423 	     significant), only 14 bits are used because the address is
6424 	     masked to be 16-byte aligned. */
6425 	  lsbits = 4;
6426 	  break;
6427 	case SPU_BTI_S16_2:
6428 	  /* This is used for lqr and stqr. */
6429 	  lsbits = 2;
6430 	  break;
6431 	default:
6432 	  lsbits = 0;
6433 	}
6434 
6435       if (GET_CODE (op) == LABEL_REF
6436 	  || (GET_CODE (op) == SYMBOL_REF
6437 	      && SYMBOL_REF_FUNCTION_P (op))
6438 	  || (v & ((1 << lsbits) - 1)) != 0)
6439 	warning (0, "%d least significant bits of %s are ignored.", lsbits,
6440 		 d->name);
6441     }
6442 }
6443 
6444 
6445 static int
6446 expand_builtin_args (struct spu_builtin_description *d, tree exp,
6447 		     rtx target, rtx ops[])
6448 {
6449   enum insn_code icode = (enum insn_code) d->icode;
6450   int i = 0, a;
6451 
6452   /* Expand the arguments into rtl. */
6453 
6454   if (d->parm[0] != SPU_BTI_VOID)
6455     ops[i++] = target;
6456 
6457   for (a = 0; d->parm[a+1] != SPU_BTI_END_OF_PARAMS; i++, a++)
6458     {
6459       tree arg = CALL_EXPR_ARG (exp, a);
6460       if (arg == 0)
6461 	abort ();
6462       ops[i] = expand_expr (arg, NULL_RTX, VOIDmode, EXPAND_NORMAL);
6463     }
6464 
6465   /* The insn pattern may have additional operands (SCRATCH).
6466      Return the number of actual non-SCRATCH operands.  */
6467   gcc_assert (i <= insn_data[icode].n_operands);
6468   return i;
6469 }
6470 
6471 static rtx
6472 spu_expand_builtin_1 (struct spu_builtin_description *d,
6473 		      tree exp, rtx target)
6474 {
6475   rtx pat;
6476   rtx ops[8];
6477   enum insn_code icode = (enum insn_code) d->icode;
6478   enum machine_mode mode, tmode;
6479   int i, p;
6480   int n_operands;
6481   tree return_type;
6482 
6483   /* Set up ops[] with values from arglist. */
6484   n_operands = expand_builtin_args (d, exp, target, ops);
6485 
6486   /* Handle the target operand which must be operand 0. */
6487   i = 0;
6488   if (d->parm[0] != SPU_BTI_VOID)
6489     {
6490 
6491       /* We prefer the mode specified for the match_operand; otherwise
6492          we use the mode from the builtin function prototype. */
6493       tmode = insn_data[d->icode].operand[0].mode;
6494       if (tmode == VOIDmode)
6495 	tmode = TYPE_MODE (spu_builtin_types[d->parm[0]]);
6496 
6497       /* Try to use target, because not using it can lead to extra copies,
6498          and when we are using all of the registers extra copies lead
6499          to extra spills.  */
6500       if (target && GET_CODE (target) == REG && GET_MODE (target) == tmode)
6501 	ops[0] = target;
6502       else
6503 	target = ops[0] = gen_reg_rtx (tmode);
6504 
6505       if (!(*insn_data[icode].operand[0].predicate) (ops[0], tmode))
6506 	abort ();
6507 
6508       i++;
6509     }
6510 
6511   if (d->fcode == SPU_MASK_FOR_LOAD)
6512     {
6513       enum machine_mode mode = insn_data[icode].operand[1].mode;
6514       tree arg;
6515       rtx addr, op, pat;
6516 
6517       /* get addr */
6518       arg = CALL_EXPR_ARG (exp, 0);
6519       gcc_assert (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE);
6520       op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL);
6521       addr = memory_address (mode, op);
6522 
6523       /* negate addr */
6524       op = gen_reg_rtx (GET_MODE (addr));
6525       emit_insn (gen_rtx_SET (VOIDmode, op,
6526                  gen_rtx_NEG (GET_MODE (addr), addr)));
6527       op = gen_rtx_MEM (mode, op);
6528 
6529       pat = GEN_FCN (icode) (target, op);
6530       if (!pat)
6531         return 0;
6532       emit_insn (pat);
6533       return target;
6534     }
6535 
6536   /* Ignore align_hint, but still expand its args in case they have
6537      side effects. */
6538   if (icode == CODE_FOR_spu_align_hint)
6539     return 0;
6540 
6541   /* Handle the rest of the operands. */
6542   for (p = 1; i < n_operands; i++, p++)
6543     {
6544       if (insn_data[d->icode].operand[i].mode != VOIDmode)
6545 	mode = insn_data[d->icode].operand[i].mode;
6546       else
6547 	mode = TYPE_MODE (spu_builtin_types[d->parm[i]]);
6548 
6549       /* mode can be VOIDmode here for labels */
6550 
6551       /* For specific intrinsics with an immediate operand, e.g.,
6552          si_ai(), we sometimes need to convert the scalar argument to a
6553          vector argument by splatting the scalar. */
6554       if (VECTOR_MODE_P (mode)
6555 	  && (GET_CODE (ops[i]) == CONST_INT
6556 	      || GET_MODE_CLASS (GET_MODE (ops[i])) == MODE_INT
6557 	      || GET_MODE_CLASS (GET_MODE (ops[i])) == MODE_FLOAT))
6558 	{
6559 	  if (GET_CODE (ops[i]) == CONST_INT)
6560 	    ops[i] = spu_const (mode, INTVAL (ops[i]));
6561 	  else
6562 	    {
6563 	      rtx reg = gen_reg_rtx (mode);
6564 	      enum machine_mode imode = GET_MODE_INNER (mode);
6565 	      if (!spu_nonmem_operand (ops[i], GET_MODE (ops[i])))
6566 		ops[i] = force_reg (GET_MODE (ops[i]), ops[i]);
6567 	      if (imode != GET_MODE (ops[i]))
6568 		ops[i] = convert_to_mode (imode, ops[i],
6569 					  TYPE_UNSIGNED (spu_builtin_types
6570 							 [d->parm[i]]));
6571 	      emit_insn (gen_spu_splats (reg, ops[i]));
6572 	      ops[i] = reg;
6573 	    }
6574 	}
6575 
6576       spu_check_builtin_parm (d, ops[i], d->parm[p]);
6577 
6578       if (!(*insn_data[icode].operand[i].predicate) (ops[i], mode))
6579 	ops[i] = spu_force_reg (mode, ops[i]);
6580     }
6581 
6582   switch (n_operands)
6583     {
6584     case 0:
6585       pat = GEN_FCN (icode) (0);
6586       break;
6587     case 1:
6588       pat = GEN_FCN (icode) (ops[0]);
6589       break;
6590     case 2:
6591       pat = GEN_FCN (icode) (ops[0], ops[1]);
6592       break;
6593     case 3:
6594       pat = GEN_FCN (icode) (ops[0], ops[1], ops[2]);
6595       break;
6596     case 4:
6597       pat = GEN_FCN (icode) (ops[0], ops[1], ops[2], ops[3]);
6598       break;
6599     case 5:
6600       pat = GEN_FCN (icode) (ops[0], ops[1], ops[2], ops[3], ops[4]);
6601       break;
6602     case 6:
6603       pat = GEN_FCN (icode) (ops[0], ops[1], ops[2], ops[3], ops[4], ops[5]);
6604       break;
6605     default:
6606       abort ();
6607     }
6608 
6609   if (!pat)
6610     abort ();
6611 
6612   if (d->type == B_CALL || d->type == B_BISLED)
6613     emit_call_insn (pat);
6614   else if (d->type == B_JUMP)
6615     {
6616       emit_jump_insn (pat);
6617       emit_barrier ();
6618     }
6619   else
6620     emit_insn (pat);
6621 
6622   return_type = spu_builtin_types[d->parm[0]];
6623   if (d->parm[0] != SPU_BTI_VOID
6624       && GET_MODE (target) != TYPE_MODE (return_type))
6625     {
6626       /* target is the return value.  It should always have the mode of
6627          the builtin function prototype. */
6628       target = spu_force_reg (TYPE_MODE (return_type), target);
6629     }
6630 
6631   return target;
6632 }
6633 
6634 rtx
6635 spu_expand_builtin (tree exp,
6636 		    rtx target,
6637 		    rtx subtarget ATTRIBUTE_UNUSED,
6638 		    enum machine_mode mode ATTRIBUTE_UNUSED,
6639 		    int ignore ATTRIBUTE_UNUSED)
6640 {
6641   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
6642   unsigned int fcode = DECL_FUNCTION_CODE (fndecl) - END_BUILTINS;
6643   struct spu_builtin_description *d;
6644 
6645   if (fcode < NUM_SPU_BUILTINS)
6646     {
6647       d = &spu_builtins[fcode];
6648 
6649       return spu_expand_builtin_1 (d, exp, target);
6650     }
6651   abort ();
6652 }
6653 
6654 /* Implement targetm.vectorize.builtin_mul_widen_even.  */
6655 static tree
6656 spu_builtin_mul_widen_even (tree type)
6657 {
6658   switch (TYPE_MODE (type))
6659     {
6660     case V8HImode:
6661       if (TYPE_UNSIGNED (type))
6662 	return spu_builtins[SPU_MULE_0].fndecl;
6663       else
6664 	return spu_builtins[SPU_MULE_1].fndecl;
6665       break;
6666     default:
6667       return NULL_TREE;
6668     }
6669 }
6670 
6671 /* Implement targetm.vectorize.builtin_mul_widen_odd.  */
6672 static tree
6673 spu_builtin_mul_widen_odd (tree type)
6674 {
6675   switch (TYPE_MODE (type))
6676     {
6677     case V8HImode:
6678       if (TYPE_UNSIGNED (type))
6679 	return spu_builtins[SPU_MULO_1].fndecl;
6680       else
6681 	return spu_builtins[SPU_MULO_0].fndecl;
6682       break;
6683     default:
6684       return NULL_TREE;
6685     }
6686 }
6687 
6688 /* Implement targetm.vectorize.builtin_mask_for_load.  */
6689 static tree
6690 spu_builtin_mask_for_load (void)
6691 {
6692   struct spu_builtin_description *d = &spu_builtins[SPU_MASK_FOR_LOAD];
6693   gcc_assert (d);
6694   return d->fndecl;
6695 }
6696 
6697 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
6698 static int
6699 spu_builtin_vectorization_cost (bool runtime_test)
6700 {
6701   /* If the branch of the runtime test is taken, i.e. the vectorized
6702      version is skipped, this incurs a misprediction cost (because the
6703      vectorized version is expected to be the fall-through).  So we subtract
6704      the latency of a mispredicted branch from the costs that are incurred
6705      when the vectorized version is executed.  */
6706   if (runtime_test)
6707     return -19;
6708   else
6709     return 0;
6710 }
6711 
6712 /* Return true iff a data reference of TYPE can reach vector alignment (16)
6713    after applying N iterations.  This routine does not determine how many
6714    iterations are required to reach the desired alignment.  */
6715 
6716 static bool
6717 spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed)
6718 {
6719   if (is_packed)
6720     return false;
6721 
6722   /* All other types are naturally aligned.  */
6723   return true;
6724 }
6725 
6726 /* Implement targetm.vectorize.builtin_vec_perm.  */
6727 tree
6728 spu_builtin_vec_perm (tree type, tree *mask_element_type)
6729 {
6730   struct spu_builtin_description *d;
6731 
6732   *mask_element_type = unsigned_char_type_node;
6733 
6734   switch (TYPE_MODE (type))
6735     {
6736     case V16QImode:
6737       if (TYPE_UNSIGNED (type))
6738         d = &spu_builtins[SPU_SHUFFLE_0];
6739       else
6740         d = &spu_builtins[SPU_SHUFFLE_1];
6741       break;
6742 
6743     case V8HImode:
6744       if (TYPE_UNSIGNED (type))
6745         d = &spu_builtins[SPU_SHUFFLE_2];
6746       else
6747         d = &spu_builtins[SPU_SHUFFLE_3];
6748       break;
6749 
6750     case V4SImode:
6751       if (TYPE_UNSIGNED (type))
6752         d = &spu_builtins[SPU_SHUFFLE_4];
6753       else
6754         d = &spu_builtins[SPU_SHUFFLE_5];
6755       break;
6756 
6757     case V2DImode:
6758       if (TYPE_UNSIGNED (type))
6759         d = &spu_builtins[SPU_SHUFFLE_6];
6760       else
6761         d = &spu_builtins[SPU_SHUFFLE_7];
6762       break;
6763 
6764     case V4SFmode:
6765       d = &spu_builtins[SPU_SHUFFLE_8];
6766       break;
6767 
6768     case V2DFmode:
6769       d = &spu_builtins[SPU_SHUFFLE_9];
6770       break;
6771 
6772     default:
6773       return NULL_TREE;
6774     }
6775 
6776   gcc_assert (d);
6777   return d->fndecl;
6778 }
6779 
6780 /* Return the appropriate mode for a named address pointer.  */
6781 static enum machine_mode
6782 spu_addr_space_pointer_mode (addr_space_t addrspace)
6783 {
6784   switch (addrspace)
6785     {
6786     case ADDR_SPACE_GENERIC:
6787       return ptr_mode;
6788     case ADDR_SPACE_EA:
6789       return EAmode;
6790     default:
6791       gcc_unreachable ();
6792     }
6793 }
6794 
6795 /* Return the appropriate mode for a named address address.  */
6796 static enum machine_mode
6797 spu_addr_space_address_mode (addr_space_t addrspace)
6798 {
6799   switch (addrspace)
6800     {
6801     case ADDR_SPACE_GENERIC:
6802       return Pmode;
6803     case ADDR_SPACE_EA:
6804       return EAmode;
6805     default:
6806       gcc_unreachable ();
6807     }
6808 }
6809 
6810 /* Determine if one named address space is a subset of another.  */
6811 
6812 static bool
6813 spu_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
6814 {
6815   gcc_assert (subset == ADDR_SPACE_GENERIC || subset == ADDR_SPACE_EA);
6816   gcc_assert (superset == ADDR_SPACE_GENERIC || superset == ADDR_SPACE_EA);
6817 
6818   if (subset == superset)
6819     return true;
6820 
6821   /* If we have -mno-address-space-conversion, treat __ea and generic as not
6822      being subsets but instead as disjoint address spaces.  */
6823   else if (!TARGET_ADDRESS_SPACE_CONVERSION)
6824     return false;
6825 
6826   else
6827     return (subset == ADDR_SPACE_GENERIC && superset == ADDR_SPACE_EA);
6828 }
6829 
6830 /* Convert from one address space to another.  */
6831 static rtx
6832 spu_addr_space_convert (rtx op, tree from_type, tree to_type)
6833 {
6834   addr_space_t from_as = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
6835   addr_space_t to_as = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
6836 
6837   gcc_assert (from_as == ADDR_SPACE_GENERIC || from_as == ADDR_SPACE_EA);
6838   gcc_assert (to_as == ADDR_SPACE_GENERIC || to_as == ADDR_SPACE_EA);
6839 
6840   if (to_as == ADDR_SPACE_GENERIC && from_as == ADDR_SPACE_EA)
6841     {
6842       rtx result, ls;
6843 
6844       ls = gen_const_mem (DImode,
6845 			  gen_rtx_SYMBOL_REF (Pmode, "__ea_local_store"));
6846       set_mem_align (ls, 128);
6847 
6848       result = gen_reg_rtx (Pmode);
6849       ls = force_reg (Pmode, convert_modes (Pmode, DImode, ls, 1));
6850       op = force_reg (Pmode, convert_modes (Pmode, EAmode, op, 1));
6851       ls = emit_conditional_move (ls, NE, op, const0_rtx, Pmode,
6852 					  ls, const0_rtx, Pmode, 1);
6853 
6854       emit_insn (gen_subsi3 (result, op, ls));
6855 
6856       return result;
6857     }
6858 
6859   else if (to_as == ADDR_SPACE_EA && from_as == ADDR_SPACE_GENERIC)
6860     {
6861       rtx result, ls;
6862 
6863       ls = gen_const_mem (DImode,
6864 			  gen_rtx_SYMBOL_REF (Pmode, "__ea_local_store"));
6865       set_mem_align (ls, 128);
6866 
6867       result = gen_reg_rtx (EAmode);
6868       ls = force_reg (EAmode, convert_modes (EAmode, DImode, ls, 1));
6869       op = force_reg (Pmode, op);
6870       ls = emit_conditional_move (ls, NE, op, const0_rtx, Pmode,
6871 					  ls, const0_rtx, EAmode, 1);
6872       op = force_reg (EAmode, convert_modes (EAmode, Pmode, op, 1));
6873 
6874       if (EAmode == SImode)
6875 	emit_insn (gen_addsi3 (result, op, ls));
6876       else
6877 	emit_insn (gen_adddi3 (result, op, ls));
6878 
6879       return result;
6880     }
6881 
6882   else
6883     gcc_unreachable ();
6884 }
6885 
6886 
6887 /* Count the total number of instructions in each pipe and return the
6888    maximum, which is used as the Minimum Iteration Interval (MII)
6889    in the modulo scheduler.  get_pipe() will return -2, -1, 0, or 1;
6890    -2 means an instruction that can go in either pipe0 or pipe1.  */
6891 static int
6892 spu_sms_res_mii (struct ddg *g)
6893 {
6894   int i;
6895   unsigned t[4] = {0, 0, 0, 0};
6896 
6897   for (i = 0; i < g->num_nodes; i++)
6898     {
6899       rtx insn = g->nodes[i].insn;
6900       int p = get_pipe (insn) + 2;
6901 
6902       assert (p >= 0);
6903       assert (p < 4);
6904 
6905       t[p]++;
6906       if (dump_file && INSN_P (insn))
6907             fprintf (dump_file, "i%d %s %d %d\n",
6908                      INSN_UID (insn),
6909                      insn_data[INSN_CODE(insn)].name,
6910                      p, t[p]);
6911     }
6912   if (dump_file)
6913     fprintf (dump_file, "%d %d %d %d\n", t[0], t[1], t[2], t[3]);
6914 
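  /* t[2] and t[3] count insns restricted to pipe0 and pipe1, and t[0]
     counts insns that may issue in either pipe (get_pipe () == -2).
     The MII is at least the larger dedicated-pipe count, and at least
     the number of cycles needed when the dual-pipe insns are spread
     evenly over both pipes, rounded up.  t[1] (get_pipe () == -1) does
     not enter the bound.  */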
6915   return MAX ((t[0] + t[2] + t[3] + 1) / 2, MAX (t[2], t[3]));
6916 }
6917 
6918 
6919 void
6920 spu_init_expanders (void)
6921 {
6922   if (cfun)
6923     {
6924       rtx r0, r1;
6925       /* The hard frame pointer (HARD_FRAME_POINTER_REGNUM) is only
6926          128-bit aligned when frame_pointer_needed is true.  We don't
6927          know that until we're expanding the prologue.  */
6928       REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = 8;
6929 
6930       /* A number of passes use LAST_VIRTUAL_REGISTER+1 and
6931 	 LAST_VIRTUAL_REGISTER+2 to test the back-end.  We want them
6932 	 to be treated as aligned, so generate them here. */
6933       r0 = gen_reg_rtx (SImode);
6934       r1 = gen_reg_rtx (SImode);
6935       mark_reg_pointer (r0, 128);
6936       mark_reg_pointer (r1, 128);
6937       gcc_assert (REGNO (r0) == LAST_VIRTUAL_REGISTER + 1
6938 		  && REGNO (r1) == LAST_VIRTUAL_REGISTER + 2);
6939     }
6940 }
6941 
6942 static enum machine_mode
6943 spu_libgcc_cmp_return_mode (void)
6944 {
6945 
6946   /* On the SPU, word_mode is TImode, so it is better to use SImode
6947      for compare returns.  */
6948   return SImode;
6949 }
6950 
6951 static enum machine_mode
6952 spu_libgcc_shift_count_mode (void)
6953 {
6954   /* On the SPU, word_mode is TImode, so it is better to use SImode
6955      for shift counts.  */
6956   return SImode;
6957 }
6958 
6959 /* An early place to adjust some flags after GCC has finished
6960    processing them.  */
6961 static void
6962 asm_file_start (void)
6963 {
6964   /* Variable tracking should be run after all optimizations which
6965      change order of insns.  It also needs a valid CFG. */
6966   spu_flag_var_tracking = flag_var_tracking;
6967   flag_var_tracking = 0;
6968 
6969   default_file_start ();
6970 }
6971 
6972 /* Implement targetm.section_type_flags.  */
6973 static unsigned int
6974 spu_section_type_flags (tree decl, const char *name, int reloc)
6975 {
6976   /* .toe needs to have type @nobits.  */
6977   if (strcmp (name, ".toe") == 0)
6978     return SECTION_BSS;
6979   /* Don't load _ea into the current address space.  */
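  /* SECTION_DEBUG means the section is emitted without the SHF_ALLOC
     ("a") flag, so its contents are not loaded into local store.  */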
6980   if (strcmp (name, "._ea") == 0)
6981     return SECTION_WRITE | SECTION_DEBUG;
6982   return default_section_type_flags (decl, name, reloc);
6983 }
6984 
6985 /* Implement targetm.select_section.  */
6986 static section *
6987 spu_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align)
6988 {
6989   /* Variables and constants defined in the __ea address space
6990      go into a special section named "._ea".  */
6991   if (TREE_TYPE (decl) != error_mark_node
6992       && TYPE_ADDR_SPACE (TREE_TYPE (decl)) == ADDR_SPACE_EA)
6993     {
6994       /* We might get called with string constants, but get_named_section
6995 	 doesn't like them as they are not DECLs.  Also, we need to set
6996 	 flags in that case.  */
6997       if (!DECL_P (decl))
6998 	return get_section ("._ea", SECTION_WRITE | SECTION_DEBUG, NULL);
6999 
7000       return get_named_section (decl, "._ea", reloc);
7001     }
7002 
7003   return default_elf_select_section (decl, reloc, align);
7004 }
7005 
7006 /* Implement targetm.unique_section.  */
7007 static void
7008 spu_unique_section (tree decl, int reloc)
7009 {
7010   /* We don't support unique section names in the __ea address
7011      space for now.  */
7012   if (TREE_TYPE (decl) != error_mark_node
7013       && TYPE_ADDR_SPACE (TREE_TYPE (decl)) != 0)
7014     return;
7015 
7016   default_unique_section (decl, reloc);
7017 }
7018 
7019 /* Generate a constant or register which contains 2^SCALE.  We assume
7020    the result is valid for MODE.  Currently, MODE must be V4SFmode and
7021    SCALE must be SImode or a CONST_INT.  */
7022 rtx
7023 spu_gen_exp2 (enum machine_mode mode, rtx scale)
7024 {
7025   gcc_assert (mode == V4SFmode);
7026   gcc_assert (GET_MODE (scale) == SImode || GET_CODE (scale) == CONST_INT);
7027   if (GET_CODE (scale) != CONST_INT)
7028     {
7029       /* unsigned int exp = (127 + scale) << 23;
7030 	__vector float m = (__vector float) spu_splats (exp); */
7031       rtx reg = force_reg (SImode, scale);
7032       rtx exp = gen_reg_rtx (SImode);
7033       rtx mul = gen_reg_rtx (mode);
7034       emit_insn (gen_addsi3 (exp, reg, GEN_INT (127)));
7035       emit_insn (gen_ashlsi3 (exp, exp, GEN_INT (23)));
7036       emit_insn (gen_spu_splats (mul, gen_rtx_SUBREG (GET_MODE_INNER (mode), exp, 0)));
7037       return mul;
7038     }
7039   else
7040     {
7041       HOST_WIDE_INT exp = 127 + INTVAL (scale);
7042       unsigned char arr[16];
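      /* Build the 16 bytes of a V4SF constant whose four lanes each hold
         the float with sign 0, exponent field EXP and a zero mantissa,
         i.e. the value 2^scale: byte 0 of each word gets the top seven
         exponent bits, byte 1 gets the low exponent bit in its most
         significant position, and the remaining two bytes are zero.  */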
7043       arr[0] = arr[4] = arr[8] = arr[12] = exp >> 1;
7044       arr[1] = arr[5] = arr[9] = arr[13] = exp << 7;
7045       arr[2] = arr[6] = arr[10] = arr[14] = 0;
7046       arr[3] = arr[7] = arr[11] = arr[15] = 0;
7047       return array_to_constant (mode, arr);
7048     }
7049 }
7050 
7051 /* After reload, just change the convert into a move instruction
7052    or a dead instruction. */
7053 void
7054 spu_split_convert (rtx ops[])
7055 {
7056   if (REGNO (ops[0]) == REGNO (ops[1]))
7057     emit_note (NOTE_INSN_DELETED);
7058   else
7059     {
7060       /* Use TImode always as this might help hard reg copyprop.  */
7061       rtx op0 = gen_rtx_REG (TImode, REGNO (ops[0]));
7062       rtx op1 = gen_rtx_REG (TImode, REGNO (ops[1]));
7063       emit_insn (gen_move_insn (op0, op1));
7064     }
7065 }
7066 
7067 void
7068 spu_function_profiler (FILE * file, int labelno ATTRIBUTE_UNUSED)
7069 {
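  /* brsl places the return address in $75 rather than the normal link
     register $0, presumably so that _mcount can locate its caller
     without clobbering $0.  */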
7070   fprintf (file, "# profile\n");
7071   fprintf (file, "brsl $75,  _mcount\n");
7072 }
7073 
7074 #include "gt-spu.h"
7075