xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/i386/i386-expand.c (revision 4ac76180e904e771b9d522c7e57296d371f06499)
1 /* Copyright (C) 1988-2020 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9 
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3.  If not see
17 <http://www.gnu.org/licenses/>.  */
18 
19 #define IN_TARGET_CODE 1
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95 
96 /* Split one or more double-mode RTL references into pairs of half-mode
97    references.  The RTL can be REG, offsettable MEM, integer constant, or
98    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
99    split and "num" is its length.  lo_half and hi_half are output arrays
100    that parallel "operands".  */
101 
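/* For illustration (a sketch, not part of the original sources): on ia32,
   splitting the DImode operands of
     (set (reg:DI 0) (const_int 0x100000002))
   with this routine yields roughly
     lo_half[0] = (subreg:SI (reg:DI 0) 0)    hi_half[0] = (subreg:SI (reg:DI 0) 4)
     lo_half[1] = (const_int 2)               hi_half[1] = (const_int 1)
   while MEM operands are split with adjust_address at offsets 0 and 4,
   since simplify_subreg cannot be used on volatile memory.  */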
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 		   int num, rtx lo_half[], rtx hi_half[])
105 {
106   machine_mode half_mode;
107   unsigned int byte;
108   rtx mem_op = NULL_RTX;
109   int mem_num = 0;
110 
111   switch (mode)
112     {
113     case E_TImode:
114       half_mode = DImode;
115       break;
116     case E_DImode:
117       half_mode = SImode;
118       break;
119     case E_P2HImode:
120       half_mode = HImode;
121       break;
122     case E_P2QImode:
123       half_mode = QImode;
124       break;
125     default:
126       gcc_unreachable ();
127     }
128 
129   byte = GET_MODE_SIZE (half_mode);
130 
131   while (num--)
132     {
133       rtx op = operands[num];
134 
135       /* simplify_subreg refuses to split volatile memory addresses,
136          but we still have to handle them.  */
137       if (MEM_P (op))
138 	{
139 	  if (mem_op && rtx_equal_p (op, mem_op))
140 	    {
141 	      lo_half[num] = lo_half[mem_num];
142 	      hi_half[num] = hi_half[mem_num];
143 	    }
144 	  else
145 	    {
146 	      mem_op = op;
147 	      mem_num = num;
148 	      lo_half[num] = adjust_address (op, half_mode, 0);
149 	      hi_half[num] = adjust_address (op, half_mode, byte);
150 	    }
151 	}
152       else
153 	{
154 	  lo_half[num] = simplify_gen_subreg (half_mode, op,
155 					      GET_MODE (op) == VOIDmode
156 					      ? mode : GET_MODE (op), 0);
157 	  hi_half[num] = simplify_gen_subreg (half_mode, op,
158 					      GET_MODE (op) == VOIDmode
159 					      ? mode : GET_MODE (op), byte);
160 	}
161     }
162 }
163 
164 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
165    for the target.  */
166 
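/* Illustrative sketch (not from the original sources): for an SImode
   destination living in %eax this expands to either
     xorl %eax, %eax     # usual form, note the flags clobber added below
   or, when TARGET_USE_MOV0 is set and we are not optimizing for size,
     movl $0, %eax       # leaves the flags register untouched
   QImode/HImode destinations are widened to SImode first to avoid the
   operand-size prefix byte.  */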
167 void
168 ix86_expand_clear (rtx dest)
169 {
170   rtx tmp;
171 
172   /* We play register width games, which are only valid after reload.  */
173   gcc_assert (reload_completed);
174 
175   /* Avoid HImode and its attendant prefix byte.  */
176   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
177     dest = gen_rtx_REG (SImode, REGNO (dest));
178   tmp = gen_rtx_SET (dest, const0_rtx);
179 
180   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
181     {
182       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
183       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
184     }
185 
186   emit_insn (tmp);
187 }
188 
189 void
190 ix86_expand_move (machine_mode mode, rtx operands[])
191 {
192   rtx op0, op1;
193   rtx tmp, addend = NULL_RTX;
194   enum tls_model model;
195 
196   op0 = operands[0];
197   op1 = operands[1];
198 
199   switch (GET_CODE (op1))
200     {
201     case CONST:
202       tmp = XEXP (op1, 0);
203 
204       if (GET_CODE (tmp) != PLUS
205 	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
206 	break;
207 
208       op1 = XEXP (tmp, 0);
209       addend = XEXP (tmp, 1);
210       /* FALLTHRU */
211 
212     case SYMBOL_REF:
213       model = SYMBOL_REF_TLS_MODEL (op1);
214 
215       if (model)
216 	op1 = legitimize_tls_address (op1, model, true);
217       else if (ix86_force_load_from_GOT_p (op1))
218 	{
219 	  /* Load the external function address via GOT slot to avoid PLT.  */
220 	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
221 				(TARGET_64BIT
222 				 ? UNSPEC_GOTPCREL
223 				 : UNSPEC_GOT));
224 	  op1 = gen_rtx_CONST (Pmode, op1);
225 	  op1 = gen_const_mem (Pmode, op1);
226 	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
227 	}
228       else
229 	{
230 	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
231 	  if (tmp)
232 	    {
233 	      op1 = tmp;
234 	      if (!addend)
235 		break;
236 	    }
237 	  else
238 	    {
239 	      op1 = operands[1];
240 	      break;
241 	    }
242 	}
243 
244       if (addend)
245 	{
246 	  op1 = force_operand (op1, NULL_RTX);
247 	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
248 				     op0, 1, OPTAB_DIRECT);
249 	}
250       else
251 	op1 = force_operand (op1, op0);
252 
253       if (op1 == op0)
254 	return;
255 
256       op1 = convert_to_mode (mode, op1, 1);
257 
258     default:
259       break;
260     }
261 
262   if ((flag_pic || MACHOPIC_INDIRECT)
263       && symbolic_operand (op1, mode))
264     {
265       if (TARGET_MACHO && !TARGET_64BIT)
266 	{
267 #if TARGET_MACHO
268 	  /* dynamic-no-pic */
269 	  if (MACHOPIC_INDIRECT)
270 	    {
271 	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
272 			 ? op0 : gen_reg_rtx (Pmode);
273 	      op1 = machopic_indirect_data_reference (op1, temp);
274 	      if (MACHOPIC_PURE)
275 		op1 = machopic_legitimize_pic_address (op1, mode,
276 						       temp == op1 ? 0 : temp);
277 	    }
278 	  if (op0 != op1 && GET_CODE (op0) != MEM)
279 	    {
280 	      rtx insn = gen_rtx_SET (op0, op1);
281 	      emit_insn (insn);
282 	      return;
283 	    }
284 	  if (GET_CODE (op0) == MEM)
285 	    op1 = force_reg (Pmode, op1);
286 	  else
287 	    {
288 	      rtx temp = op0;
289 	      if (GET_CODE (temp) != REG)
290 		temp = gen_reg_rtx (Pmode);
291 	      temp = legitimize_pic_address (op1, temp);
292 	      if (temp == op0)
293 	    return;
294 	      op1 = temp;
295 	    }
296       /* dynamic-no-pic */
297 #endif
298 	}
299       else
300 	{
301 	  if (MEM_P (op0))
302 	    op1 = force_reg (mode, op1);
303 	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
304 	    {
305 	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
306 	      op1 = legitimize_pic_address (op1, reg);
307 	      if (op0 == op1)
308 		return;
309 	      op1 = convert_to_mode (mode, op1, 1);
310 	    }
311 	}
312     }
313   else
314     {
315       if (MEM_P (op0)
316 	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
317 	      || !push_operand (op0, mode))
318 	  && MEM_P (op1))
319 	op1 = force_reg (mode, op1);
320 
321       if (push_operand (op0, mode)
322 	  && ! general_no_elim_operand (op1, mode))
323 	op1 = copy_to_mode_reg (mode, op1);
324 
325       /* Force large constants in 64bit compilation into register
326 	 to get them CSEed.  */
327       if (can_create_pseudo_p ()
328 	  && (mode == DImode) && TARGET_64BIT
329 	  && immediate_operand (op1, mode)
330 	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
331 	  && !register_operand (op0, mode)
332 	  && optimize)
333 	op1 = copy_to_mode_reg (mode, op1);
334 
335       if (can_create_pseudo_p ()
336 	  && CONST_DOUBLE_P (op1))
337 	{
338 	  /* If we are loading a floating point constant to a register,
339 	     force the value to memory now, since we'll get better code
340 	     out of the back end.  */
341 
342 	  op1 = validize_mem (force_const_mem (mode, op1));
343 	  if (!register_operand (op0, mode))
344 	    {
345 	      rtx temp = gen_reg_rtx (mode);
346 	      emit_insn (gen_rtx_SET (temp, op1));
347 	      emit_move_insn (op0, temp);
348 	      return;
349 	    }
350 	}
351     }
352 
353   emit_insn (gen_rtx_SET (op0, op1));
354 }
355 
356 void
357 ix86_expand_vector_move (machine_mode mode, rtx operands[])
358 {
359   rtx op0 = operands[0], op1 = operands[1];
360   /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
361      psABI, since its biggest alignment is only 4 bytes.  */
362   unsigned int align = (TARGET_IAMCU
363 			? GET_MODE_BITSIZE (mode)
364 			: GET_MODE_ALIGNMENT (mode));
365 
366   if (push_operand (op0, VOIDmode))
367     op0 = emit_move_resolve_push (mode, op0);
368 
369   /* Force constants other than zero into memory.  We do not know how
370      the instructions used to build constants modify the upper 64 bits
371      of the register; once we have that information we may be able
372      to handle some of them more efficiently.  */
373   if (can_create_pseudo_p ()
374       && (CONSTANT_P (op1)
375 	  || (SUBREG_P (op1)
376 	      && CONSTANT_P (SUBREG_REG (op1))))
377       && ((register_operand (op0, mode)
378 	   && !standard_sse_constant_p (op1, mode))
379 	  /* ix86_expand_vector_move_misalign() does not like constants.  */
380 	  || (SSE_REG_MODE_P (mode)
381 	      && MEM_P (op0)
382 	      && MEM_ALIGN (op0) < align)))
383     {
384       if (SUBREG_P (op1))
385 	{
386 	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
387 	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
388 	  if (r)
389 	    r = validize_mem (r);
390 	  else
391 	    r = force_reg (imode, SUBREG_REG (op1));
392 	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
393 	}
394       else
395 	op1 = validize_mem (force_const_mem (mode, op1));
396     }
397 
398   /* We need to check memory alignment for SSE mode since attributes
399      can make operands unaligned.  */
400   if (can_create_pseudo_p ()
401       && SSE_REG_MODE_P (mode)
402       && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
403 	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
404     {
405       rtx tmp[2];
406 
407       /* ix86_expand_vector_move_misalign() does not like both
408 	 arguments in memory.  */
409       if (!register_operand (op0, mode)
410 	  && !register_operand (op1, mode))
411 	op1 = force_reg (mode, op1);
412 
413       tmp[0] = op0; tmp[1] = op1;
414       ix86_expand_vector_move_misalign (mode, tmp);
415       return;
416     }
417 
418   /* If neither operand is a register, load operand1 into a register.  */
419   if (can_create_pseudo_p ()
420       && !register_operand (op0, mode)
421       && !register_operand (op1, mode))
422     {
423       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
424       return;
425     }
426 
427   emit_insn (gen_rtx_SET (op0, op1));
428 }
429 
430 /* Split 32-byte AVX unaligned load and store if needed.  */
431 
432 static void
433 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
434 {
435   rtx m;
436   rtx (*extract) (rtx, rtx, rtx);
437   machine_mode mode;
438 
439   if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
440       || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
441     {
442       emit_insn (gen_rtx_SET (op0, op1));
443       return;
444     }
445 
446   rtx orig_op0 = NULL_RTX;
447   mode = GET_MODE (op0);
448   switch (GET_MODE_CLASS (mode))
449     {
450     case MODE_VECTOR_INT:
451     case MODE_INT:
452       if (mode != V32QImode)
453 	{
454 	  if (!MEM_P (op0))
455 	    {
456 	      orig_op0 = op0;
457 	      op0 = gen_reg_rtx (V32QImode);
458 	    }
459 	  else
460 	    op0 = gen_lowpart (V32QImode, op0);
461 	  op1 = gen_lowpart (V32QImode, op1);
462 	  mode = V32QImode;
463 	}
464       break;
465     case MODE_VECTOR_FLOAT:
466       break;
467     default:
468       gcc_unreachable ();
469     }
470 
471   switch (mode)
472     {
473     default:
474       gcc_unreachable ();
475     case E_V32QImode:
476       extract = gen_avx_vextractf128v32qi;
477       mode = V16QImode;
478       break;
479     case E_V8SFmode:
480       extract = gen_avx_vextractf128v8sf;
481       mode = V4SFmode;
482       break;
483     case E_V4DFmode:
484       extract = gen_avx_vextractf128v4df;
485       mode = V2DFmode;
486       break;
487     }
488 
489   if (MEM_P (op1))
490     {
491       rtx r = gen_reg_rtx (mode);
492       m = adjust_address (op1, mode, 0);
493       emit_move_insn (r, m);
494       m = adjust_address (op1, mode, 16);
495       r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
496       emit_move_insn (op0, r);
497     }
498   else if (MEM_P (op0))
499     {
500       m = adjust_address (op0, mode, 0);
501       emit_insn (extract (m, op1, const0_rtx));
502       m = adjust_address (op0, mode, 16);
503       emit_insn (extract (m, copy_rtx (op1), const1_rtx));
504     }
505   else
506     gcc_unreachable ();
507 
508   if (orig_op0)
509     emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
510 }
511 
512 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
513    straight to ix86_expand_vector_move.  */
514 /* Code generation for scalar reg-reg moves of single and double precision data:
515      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
516        movaps reg, reg
517      else
518        movss reg, reg
519      if (x86_sse_partial_reg_dependency == true)
520        movapd reg, reg
521      else
522        movsd reg, reg
523 
524    Code generation for scalar loads of double precision data:
525      if (x86_sse_split_regs == true)
526        movlpd mem, reg      (gas syntax)
527      else
528        movsd mem, reg
529 
530    Code generation for unaligned packed loads of single precision data
531    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
532      if (x86_sse_unaligned_move_optimal)
533        movups mem, reg
534 
535      if (x86_sse_partial_reg_dependency == true)
536        {
537          xorps  reg, reg
538          movlps mem, reg
539          movhps mem+8, reg
540        }
541      else
542        {
543          movlps mem, reg
544          movhps mem+8, reg
545        }
546 
547    Code generation for unaligned packed loads of double precision data
548    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
549      if (x86_sse_unaligned_move_optimal)
550        movupd mem, reg
551 
552      if (x86_sse_split_regs == true)
553        {
554          movlpd mem, reg
555          movhpd mem+8, reg
556        }
557      else
558        {
559          movsd  mem, reg
560          movhpd mem+8, reg
561        }
562  */
563 
564 void
565 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
566 {
567   rtx op0, op1, m;
568 
569   op0 = operands[0];
570   op1 = operands[1];
571 
572   /* Use unaligned load/store for AVX512 or when optimizing for size.  */
573   if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
574     {
575       emit_insn (gen_rtx_SET (op0, op1));
576       return;
577     }
578 
579   if (TARGET_AVX)
580     {
581       if (GET_MODE_SIZE (mode) == 32)
582 	ix86_avx256_split_vector_move_misalign (op0, op1);
583       else
584 	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
585 	emit_insn (gen_rtx_SET (op0, op1));
586       return;
587     }
588 
589   if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
590       || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
591     {
592       emit_insn (gen_rtx_SET (op0, op1));
593       return;
594     }
595 
596   /* ??? If we have typed data, then it would appear that using
597      movdqu is the only way to get unaligned data loaded with
598      integer type.  */
599   if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
600     {
601       emit_insn (gen_rtx_SET (op0, op1));
602       return;
603     }
604 
605   if (MEM_P (op1))
606     {
607       if (TARGET_SSE2 && mode == V2DFmode)
608         {
609           rtx zero;
610 
611 	  /* When SSE registers are split into halves, we can avoid
612 	     writing to the top half twice.  */
613 	  if (TARGET_SSE_SPLIT_REGS)
614 	    {
615 	      emit_clobber (op0);
616 	      zero = op0;
617 	    }
618 	  else
619 	    {
620 	      /* ??? Not sure about the best option for the Intel chips.
621 		 The following would seem to satisfy; the register is
622 		 entirely cleared, breaking the dependency chain.  We
623 		 then store to the upper half, with a dependency depth
624 		 of one.  A rumor has it that Intel recommends two movsd
625 		 followed by an unpacklpd, but this is unconfirmed.  And
626 		 given that the dependency depth of the unpacklpd would
627 		 still be one, I'm not sure why this would be better.  */
628 	      zero = CONST0_RTX (V2DFmode);
629 	    }
630 
631 	  m = adjust_address (op1, DFmode, 0);
632 	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
633 	  m = adjust_address (op1, DFmode, 8);
634 	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
635 	}
636       else
637         {
638 	  rtx t;
639 
640 	  if (mode != V4SFmode)
641 	    t = gen_reg_rtx (V4SFmode);
642 	  else
643 	    t = op0;
644 
645 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
646 	    emit_move_insn (t, CONST0_RTX (V4SFmode));
647 	  else
648 	    emit_clobber (t);
649 
650 	  m = adjust_address (op1, V2SFmode, 0);
651 	  emit_insn (gen_sse_loadlps (t, t, m));
652 	  m = adjust_address (op1, V2SFmode, 8);
653 	  emit_insn (gen_sse_loadhps (t, t, m));
654 	  if (mode != V4SFmode)
655 	    emit_move_insn (op0, gen_lowpart (mode, t));
656 	}
657     }
658   else if (MEM_P (op0))
659     {
660       if (TARGET_SSE2 && mode == V2DFmode)
661 	{
662 	  m = adjust_address (op0, DFmode, 0);
663 	  emit_insn (gen_sse2_storelpd (m, op1));
664 	  m = adjust_address (op0, DFmode, 8);
665 	  emit_insn (gen_sse2_storehpd (m, op1));
666 	}
667       else
668 	{
669 	  if (mode != V4SFmode)
670 	    op1 = gen_lowpart (V4SFmode, op1);
671 
672 	  m = adjust_address (op0, V2SFmode, 0);
673 	  emit_insn (gen_sse_storelps (m, op1));
674 	  m = adjust_address (op0, V2SFmode, 8);
675 	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
676 	}
677     }
678   else
679     gcc_unreachable ();
680 }
681 
682 /* Move bits 64:95 to bits 32:63.  */
683 
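/* Illustrative note (not from the original sources): the vec_select below
   applies the permutation { 0, 2, 0, 0 } to the V4SImode view of OP, which
   corresponds to something like "pshufd $0x08, %xmm0, %xmm0" and gathers
   the interesting doublewords into the low 64 bits.  */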
684 void
685 ix86_move_vector_high_sse_to_mmx (rtx op)
686 {
687   rtx mask = gen_rtx_PARALLEL (VOIDmode,
688 			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
689 					  GEN_INT (0), GEN_INT (0)));
690   rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
691   op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
692   rtx insn = gen_rtx_SET (dest, op);
693   emit_insn (insn);
694 }
695 
696 /* Split MMX pack with signed/unsigned saturation using SSE/SSE2.  */
697 
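/* Illustrative sketch (not from the original sources): for an MMX
   "packsswb %mm1, %mm0" (V4HImode -> V8QImode with signed saturation)
   the split performs the pack on 128-bit registers, e.g.
   "packsswb %xmm1, %xmm0", and then uses ix86_move_vector_high_sse_to_mmx
   to move the four bytes produced from the second source (bits 64:95)
   down to bits 32:63, so the whole 64-bit MMX result ends up in the low
   quadword.  */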
698 void
699 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
700 {
701   rtx op0 = operands[0];
702   rtx op1 = operands[1];
703   rtx op2 = operands[2];
704 
705   machine_mode dmode = GET_MODE (op0);
706   machine_mode smode = GET_MODE (op1);
707   machine_mode inner_dmode = GET_MODE_INNER (dmode);
708   machine_mode inner_smode = GET_MODE_INNER (smode);
709 
710   /* Get the corresponding SSE mode for destination.  */
711   int nunits = 16 / GET_MODE_SIZE (inner_dmode);
712   machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
713 					    nunits).require ();
714   machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
715 						 nunits / 2).require ();
716 
717   /* Get the corresponding SSE mode for source.  */
718   nunits = 16 / GET_MODE_SIZE (inner_smode);
719   machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
720 					    nunits).require ();
721 
722   /* Generate SSE pack with signed/unsigned saturation.  */
723   rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
724   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
725   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
726 
727   op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
728   op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
729   rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
730 						    op1, op2));
731   emit_insn (insn);
732 
733   ix86_move_vector_high_sse_to_mmx (op0);
734 }
735 
736 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
737 
738 void
739 ix86_split_mmx_punpck (rtx operands[], bool high_p)
740 {
741   rtx op0 = operands[0];
742   rtx op1 = operands[1];
743   rtx op2 = operands[2];
744   machine_mode mode = GET_MODE (op0);
745   rtx mask;
746   /* The corresponding SSE mode.  */
747   machine_mode sse_mode, double_sse_mode;
748 
749   switch (mode)
750     {
751     case E_V8QImode:
752       sse_mode = V16QImode;
753       double_sse_mode = V32QImode;
754       mask = gen_rtx_PARALLEL (VOIDmode,
755 			       gen_rtvec (16,
756 					  GEN_INT (0), GEN_INT (16),
757 					  GEN_INT (1), GEN_INT (17),
758 					  GEN_INT (2), GEN_INT (18),
759 					  GEN_INT (3), GEN_INT (19),
760 					  GEN_INT (4), GEN_INT (20),
761 					  GEN_INT (5), GEN_INT (21),
762 					  GEN_INT (6), GEN_INT (22),
763 					  GEN_INT (7), GEN_INT (23)));
764       break;
765 
766     case E_V4HImode:
767       sse_mode = V8HImode;
768       double_sse_mode = V16HImode;
769       mask = gen_rtx_PARALLEL (VOIDmode,
770 			       gen_rtvec (8,
771 					  GEN_INT (0), GEN_INT (8),
772 					  GEN_INT (1), GEN_INT (9),
773 					  GEN_INT (2), GEN_INT (10),
774 					  GEN_INT (3), GEN_INT (11)));
775       break;
776 
777     case E_V2SImode:
778       sse_mode = V4SImode;
779       double_sse_mode = V8SImode;
780       mask = gen_rtx_PARALLEL (VOIDmode,
781 			       gen_rtvec (4,
782 					  GEN_INT (0), GEN_INT (4),
783 					  GEN_INT (1), GEN_INT (5)));
784       break;
785 
786     default:
787       gcc_unreachable ();
788     }
789 
790   /* Generate SSE punpcklXX.  */
791   rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
792   op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
793   op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
794 
795   op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
796   op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
797   rtx insn = gen_rtx_SET (dest, op2);
798   emit_insn (insn);
799 
800   if (high_p)
801     {
802       /* Move bits 64:127 to bits 0:63.  */
803       mask = gen_rtx_PARALLEL (VOIDmode,
804 			       gen_rtvec (4, GEN_INT (2), GEN_INT (3),
805 					  GEN_INT (0), GEN_INT (0)));
806       dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
807       op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
808       insn = gen_rtx_SET (dest, op1);
809       emit_insn (insn);
810     }
811 }
812 
813 /* Helper function of ix86_fixup_binary_operands to canonicalize
814    operand order.  Returns true if the operands should be swapped.  */
815 
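/* Illustrative example (not from the original sources): for a commutative
   expansion "dst = src1 + src2" where dst is the same register as src2,
   e.g. adding (mem:SI A) and (reg:SI 1) into (reg:SI 1), this returns
   true so that the register becomes src1 and can match the destination,
   leaving the memory reference as the second input.  */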
816 static bool
817 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
818 			     rtx operands[])
819 {
820   rtx dst = operands[0];
821   rtx src1 = operands[1];
822   rtx src2 = operands[2];
823 
824   /* If the operation is not commutative, we can't do anything.  */
825   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
826       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
827     return false;
828 
829   /* Highest priority is that src1 should match dst.  */
830   if (rtx_equal_p (dst, src1))
831     return false;
832   if (rtx_equal_p (dst, src2))
833     return true;
834 
835   /* Next highest priority is that immediate constants come second.  */
836   if (immediate_operand (src2, mode))
837     return false;
838   if (immediate_operand (src1, mode))
839     return true;
840 
841   /* Lowest priority is that memory references should come second.  */
842   if (MEM_P (src2))
843     return false;
844   if (MEM_P (src1))
845     return true;
846 
847   return false;
848 }
849 
850 
851 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
852    destination to use for the operation.  If different from the true
853    destination in operands[0], a copy operation will be required.  */
854 
855 rtx
856 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
857 			    rtx operands[])
858 {
859   rtx dst = operands[0];
860   rtx src1 = operands[1];
861   rtx src2 = operands[2];
862 
863   /* Canonicalize operand order.  */
864   if (ix86_swap_binary_operands_p (code, mode, operands))
865     {
866       /* It is invalid to swap operands of different modes.  */
867       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
868 
869       std::swap (src1, src2);
870     }
871 
872   /* Both source operands cannot be in memory.  */
873   if (MEM_P (src1) && MEM_P (src2))
874     {
875       /* Optimization: Only read from memory once.  */
876       if (rtx_equal_p (src1, src2))
877 	{
878 	  src2 = force_reg (mode, src2);
879 	  src1 = src2;
880 	}
881       else if (rtx_equal_p (dst, src1))
882 	src2 = force_reg (mode, src2);
883       else
884 	src1 = force_reg (mode, src1);
885     }
886 
887   /* If the destination is memory, and we do not have matching source
888      operands, do things in registers.  */
889   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
890     dst = gen_reg_rtx (mode);
891 
892   /* Source 1 cannot be a constant.  */
893   if (CONSTANT_P (src1))
894     src1 = force_reg (mode, src1);
895 
896   /* Source 1 cannot be a non-matching memory.  */
897   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
898     src1 = force_reg (mode, src1);
899 
900   /* Improve address combine.  */
901   if (code == PLUS
902       && GET_MODE_CLASS (mode) == MODE_INT
903       && MEM_P (src2))
904     src2 = force_reg (mode, src2);
905 
906   operands[1] = src1;
907   operands[2] = src2;
908   return dst;
909 }
910 
911 /* Similarly, but assume that the destination has already been
912    set up properly.  */
913 
914 void
915 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
916 				    machine_mode mode, rtx operands[])
917 {
918   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
919   gcc_assert (dst == operands[0]);
920 }
921 
922 /* Attempt to expand a binary operator.  Make the expansion closer to the
923    actual machine than just general_operand, which would allow 3 separate
924    memory references (one output, two input) in a single insn.  */
925 
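/* Illustrative sketch (not from the original sources): "x = y + z" in
   SImode before reload is typically emitted as
     (parallel [(set (reg:SI x) (plus:SI (reg:SI y) (reg:SI z)))
                (clobber (reg:CC flags))])
   while the post-reload PLUS case with a non-matching destination is
   emitted without the clobber so that it can become a single lea.  */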
926 void
927 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
928 			     rtx operands[])
929 {
930   rtx src1, src2, dst, op, clob;
931 
932   dst = ix86_fixup_binary_operands (code, mode, operands);
933   src1 = operands[1];
934   src2 = operands[2];
935 
936  /* Emit the instruction.  */
937 
938   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
939 
940   if (reload_completed
941       && code == PLUS
942       && !rtx_equal_p (dst, src1))
943     {
944       /* This is going to be an LEA; avoid splitting it later.  */
945       emit_insn (op);
946     }
947   else
948     {
949       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
950       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
951     }
952 
953   /* Fix up the destination if needed.  */
954   if (dst != operands[0])
955     emit_move_insn (operands[0], dst);
956 }
957 
958 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
959    the given OPERANDS.  */
960 
961 void
962 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
963 				     rtx operands[])
964 {
965   rtx op1 = NULL_RTX, op2 = NULL_RTX;
966   if (SUBREG_P (operands[1]))
967     {
968       op1 = operands[1];
969       op2 = operands[2];
970     }
971   else if (SUBREG_P (operands[2]))
972     {
973       op1 = operands[2];
974       op2 = operands[1];
975     }
976   /* Optimize (__m128i) d | (__m128i) e and similar code
977      when d and e are float vectors into float vector logical
978      insn.  In C/C++ without using intrinsics there is no other way
979      to express vector logical operation on float vectors than
980      to cast them temporarily to integer vectors.  */
981   if (op1
982       && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
983       && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
984       && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
985       && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
986       && SUBREG_BYTE (op1) == 0
987       && (GET_CODE (op2) == CONST_VECTOR
988 	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
989 	      && SUBREG_BYTE (op2) == 0))
990       && can_create_pseudo_p ())
991     {
992       rtx dst;
993       switch (GET_MODE (SUBREG_REG (op1)))
994 	{
995 	case E_V4SFmode:
996 	case E_V8SFmode:
997 	case E_V16SFmode:
998 	case E_V2DFmode:
999 	case E_V4DFmode:
1000 	case E_V8DFmode:
1001 	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1002 	  if (GET_CODE (op2) == CONST_VECTOR)
1003 	    {
1004 	      op2 = gen_lowpart (GET_MODE (dst), op2);
1005 	      op2 = force_reg (GET_MODE (dst), op2);
1006 	    }
1007 	  else
1008 	    {
1009 	      op1 = operands[1];
1010 	      op2 = SUBREG_REG (operands[2]);
1011 	      if (!vector_operand (op2, GET_MODE (dst)))
1012 		op2 = force_reg (GET_MODE (dst), op2);
1013 	    }
1014 	  op1 = SUBREG_REG (op1);
1015 	  if (!vector_operand (op1, GET_MODE (dst)))
1016 	    op1 = force_reg (GET_MODE (dst), op1);
1017 	  emit_insn (gen_rtx_SET (dst,
1018 				  gen_rtx_fmt_ee (code, GET_MODE (dst),
1019 						  op1, op2)));
1020 	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
1021 	  return;
1022 	default:
1023 	  break;
1024 	}
1025     }
1026   if (!vector_operand (operands[1], mode))
1027     operands[1] = force_reg (mode, operands[1]);
1028   if (!vector_operand (operands[2], mode))
1029     operands[2] = force_reg (mode, operands[2]);
1030   ix86_fixup_binary_operands_no_copy (code, mode, operands);
1031   emit_insn (gen_rtx_SET (operands[0],
1032 			  gen_rtx_fmt_ee (code, mode, operands[1],
1033 					  operands[2])));
1034 }
1035 
1036 /* Return TRUE or FALSE depending on whether the binary operator meets the
1037    appropriate constraints.  */
1038 
1039 bool
1040 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1041 			 rtx operands[3])
1042 {
1043   rtx dst = operands[0];
1044   rtx src1 = operands[1];
1045   rtx src2 = operands[2];
1046 
1047   /* Both source operands cannot be in memory.  */
1048   if (MEM_P (src1) && MEM_P (src2))
1049     return false;
1050 
1051   /* Canonicalize operand order for commutative operators.  */
1052   if (ix86_swap_binary_operands_p (code, mode, operands))
1053     std::swap (src1, src2);
1054 
1055   /* If the destination is memory, we must have a matching source operand.  */
1056   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1057     return false;
1058 
1059   /* Source 1 cannot be a constant.  */
1060   if (CONSTANT_P (src1))
1061     return false;
1062 
1063   /* Source 1 cannot be a non-matching memory.  */
1064   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1065     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
1066     return (code == AND
1067 	    && (mode == HImode
1068 		|| mode == SImode
1069 		|| (TARGET_64BIT && mode == DImode))
1070 	    && satisfies_constraint_L (src2));
1071 
1072   return true;
1073 }
1074 
1075 /* Attempt to expand a unary operator.  Make the expansion closer to the
1076    actual machine than just general_operand, which would allow 2 separate
1077    memory references (one output, one input) in a single insn.  */
1078 
1079 void
1080 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1081 			    rtx operands[])
1082 {
1083   bool matching_memory = false;
1084   rtx src, dst, op, clob;
1085 
1086   dst = operands[0];
1087   src = operands[1];
1088 
1089   /* If the destination is memory, and we do not have matching source
1090      operands, do things in registers.  */
1091   if (MEM_P (dst))
1092     {
1093       if (rtx_equal_p (dst, src))
1094 	matching_memory = true;
1095       else
1096 	dst = gen_reg_rtx (mode);
1097     }
1098 
1099   /* When source operand is memory, destination must match.  */
1100   if (MEM_P (src) && !matching_memory)
1101     src = force_reg (mode, src);
1102 
1103   /* Emit the instruction.  */
1104 
1105   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1106 
1107   if (code == NOT)
1108     emit_insn (op);
1109   else
1110     {
1111       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1112       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1113     }
1114 
1115   /* Fix up the destination if needed.  */
1116   if (dst != operands[0])
1117     emit_move_insn (operands[0], dst);
1118 }
1119 
1120 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
1121 
1122 static void
1123 predict_jump (int prob)
1124 {
1125   rtx_insn *insn = get_last_insn ();
1126   gcc_assert (JUMP_P (insn));
1127   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1128 }
1129 
1130 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1131    divisor are within the range [0-255].  */
1132 
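/* Illustrative sketch (not from the original sources; register and label
   names are placeholders): for an SImode unsigned div/mod pair the
   generated code looks roughly like
       movl    %esi, %ecx
       orl     %edi, %ecx
       testl   $0xffffff00, %ecx     # both operands within [0, 255]?
       je      .Lqimode
       ...full 32-bit divl sequence...
       jmp     .Ldone
     .Lqimode:
       ...8-bit divb via udivmodhiqi3; quotient in %al, remainder in %ah...
     .Ldone:
   */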
1133 void
1134 ix86_split_idivmod (machine_mode mode, rtx operands[],
1135 		    bool unsigned_p)
1136 {
1137   rtx_code_label *end_label, *qimode_label;
1138   rtx div, mod;
1139   rtx_insn *insn;
1140   rtx scratch, tmp0, tmp1, tmp2;
1141   rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1142 
1143   operands[2] = force_reg (mode, operands[2]);
1144   operands[3] = force_reg (mode, operands[3]);
1145 
1146   switch (mode)
1147     {
1148     case E_SImode:
1149       if (GET_MODE (operands[0]) == SImode)
1150 	{
1151 	  if (GET_MODE (operands[1]) == SImode)
1152 	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1153 	  else
1154 	    gen_divmod4_1
1155 	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1156 	}
1157       else
1158 	gen_divmod4_1
1159 	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1160       break;
1161 
1162     case E_DImode:
1163       gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1164       break;
1165 
1166     default:
1167       gcc_unreachable ();
1168     }
1169 
1170   end_label = gen_label_rtx ();
1171   qimode_label = gen_label_rtx ();
1172 
1173   scratch = gen_reg_rtx (mode);
1174 
1175   /* Use 8bit unsigned divmod if dividend and divisor are within
1176      the range [0-255].  */
1177   emit_move_insn (scratch, operands[2]);
1178   scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1179 				 scratch, 1, OPTAB_DIRECT);
1180   emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1181   tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1182   tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1183   tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1184 			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1185 			       pc_rtx);
1186   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1187   predict_jump (REG_BR_PROB_BASE * 50 / 100);
1188   JUMP_LABEL (insn) = qimode_label;
1189 
1190   /* Generate the original signed/unsigned divmod.  */
1191   div = gen_divmod4_1 (operands[0], operands[1],
1192 		       operands[2], operands[3]);
1193   emit_insn (div);
1194 
1195   /* Branch to the end.  */
1196   emit_jump_insn (gen_jump (end_label));
1197   emit_barrier ();
1198 
1199   /* Generate 8bit unsigned divide.  */
1200   emit_label (qimode_label);
1201   /* Don't use operands[0] for result of 8bit divide since not all
1202      registers support QImode ZERO_EXTRACT.  */
1203   tmp0 = lowpart_subreg (HImode, scratch, mode);
1204   tmp1 = lowpart_subreg (HImode, operands[2], mode);
1205   tmp2 = lowpart_subreg (QImode, operands[3], mode);
1206   emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1207 
1208   if (unsigned_p)
1209     {
1210       div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1211       mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1212     }
1213   else
1214     {
1215       div = gen_rtx_DIV (mode, operands[2], operands[3]);
1216       mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1217     }
1218   if (mode == SImode)
1219     {
1220       if (GET_MODE (operands[0]) != SImode)
1221 	div = gen_rtx_ZERO_EXTEND (DImode, div);
1222       if (GET_MODE (operands[1]) != SImode)
1223 	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1224     }
1225 
1226   /* Extract remainder from AH.  */
1227   tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
1228 			       tmp0, GEN_INT (8), GEN_INT (8));
1229   if (REG_P (operands[1]))
1230     insn = emit_move_insn (operands[1], tmp1);
1231   else
1232     {
1233       /* Need a new scratch register since the old one has result
1234 	 of 8bit divide.  */
1235       scratch = gen_reg_rtx (GET_MODE (operands[1]));
1236       emit_move_insn (scratch, tmp1);
1237       insn = emit_move_insn (operands[1], scratch);
1238     }
1239   set_unique_reg_note (insn, REG_EQUAL, mod);
1240 
1241   /* Zero extend quotient from AL.  */
1242   tmp1 = gen_lowpart (QImode, tmp0);
1243   insn = emit_insn (gen_extend_insn
1244 		    (operands[0], tmp1,
1245 		     GET_MODE (operands[0]), QImode, 1));
1246   set_unique_reg_note (insn, REG_EQUAL, div);
1247 
1248   emit_label (end_label);
1249 }
1250 
1251 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1252    matches destination.  RTX includes clobber of FLAGS_REG.  */
1253 
1254 void
1255 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1256 		 rtx dst, rtx src)
1257 {
1258   rtx op, clob;
1259 
1260   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1261   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1262 
1263   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1264 }
1265 
1266 /* Return true if the definition of REGNO1 is nearer to INSN than that of REGNO2.  */
1267 
1268 static bool
1269 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1270 {
1271   rtx_insn *prev = insn;
1272   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1273 
1274   if (insn == start)
1275     return false;
1276   while (prev && prev != start)
1277     {
1278       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1279 	{
1280 	  prev = PREV_INSN (prev);
1281 	  continue;
1282 	}
1283       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1284 	return true;
1285       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1286 	return false;
1287       prev = PREV_INSN (prev);
1288     }
1289 
1290   /* None of the regs is defined in the bb.  */
1291   return false;
1292 }
1293 
1294 /* Split lea instructions into a sequence of instructions
1295    which are executed on the ALU to avoid AGU stalls.
1296    It is assumed that the flags register may be clobbered
1297    at the position of the lea.  */
1298 
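/* Illustrative sketch (not from the original sources): an address such as
   "lea 0x4(%rbx,%rcx,4), %rax" can be rewritten for the ALU as roughly
     mov  %rcx, %rax      # move the index into the destination
     shl  $2, %rax        # scale by a shift
     add  %rbx, %rax      # add the base
     add  $0x4, %rax      # add the displacement
   with each add/shift carrying the usual flags clobber.  */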
1299 void
1300 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1301 {
1302   unsigned int regno0, regno1, regno2;
1303   struct ix86_address parts;
1304   rtx target, tmp;
1305   int ok, adds;
1306 
1307   ok = ix86_decompose_address (operands[1], &parts);
1308   gcc_assert (ok);
1309 
1310   target = gen_lowpart (mode, operands[0]);
1311 
1312   regno0 = true_regnum (target);
1313   regno1 = INVALID_REGNUM;
1314   regno2 = INVALID_REGNUM;
1315 
1316   if (parts.base)
1317     {
1318       parts.base = gen_lowpart (mode, parts.base);
1319       regno1 = true_regnum (parts.base);
1320     }
1321 
1322   if (parts.index)
1323     {
1324       parts.index = gen_lowpart (mode, parts.index);
1325       regno2 = true_regnum (parts.index);
1326     }
1327 
1328   if (parts.disp)
1329     parts.disp = gen_lowpart (mode, parts.disp);
1330 
1331   if (parts.scale > 1)
1332     {
1333       /* Case r1 = r1 + ...  */
1334       if (regno1 == regno0)
1335 	{
1336 	  /* If we have the case r1 = r1 + C * r2 then we
1337 	     would have to use multiplication, which is very
1338 	     expensive.  Assume the cost model is wrong if we
1339 	     see such a case here.  */
1340 	  gcc_assert (regno2 != regno0);
1341 
1342 	  for (adds = parts.scale; adds > 0; adds--)
1343 	    ix86_emit_binop (PLUS, mode, target, parts.index);
1344 	}
1345       else
1346 	{
1347 	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
1348 	  if (regno0 != regno2)
1349 	    emit_insn (gen_rtx_SET (target, parts.index));
1350 
1351 	  /* Use shift for scaling.  */
1352 	  ix86_emit_binop (ASHIFT, mode, target,
1353 			   GEN_INT (exact_log2 (parts.scale)));
1354 
1355 	  if (parts.base)
1356 	    ix86_emit_binop (PLUS, mode, target, parts.base);
1357 
1358 	  if (parts.disp && parts.disp != const0_rtx)
1359 	    ix86_emit_binop (PLUS, mode, target, parts.disp);
1360 	}
1361     }
1362   else if (!parts.base && !parts.index)
1363     {
1364       gcc_assert (parts.disp);
1365       emit_insn (gen_rtx_SET (target, parts.disp));
1366     }
1367   else
1368     {
1369       if (!parts.base)
1370 	{
1371 	  if (regno0 != regno2)
1372 	    emit_insn (gen_rtx_SET (target, parts.index));
1373 	}
1374       else if (!parts.index)
1375 	{
1376 	  if (regno0 != regno1)
1377 	    emit_insn (gen_rtx_SET (target, parts.base));
1378 	}
1379       else
1380 	{
1381 	  if (regno0 == regno1)
1382 	    tmp = parts.index;
1383 	  else if (regno0 == regno2)
1384 	    tmp = parts.base;
1385 	  else
1386 	    {
1387 	      rtx tmp1;
1388 
1389 	      /* Find better operand for SET instruction, depending
1390 		 on which definition is farther from the insn.  */
1391 	      if (find_nearest_reg_def (insn, regno1, regno2))
1392 		tmp = parts.index, tmp1 = parts.base;
1393 	      else
1394 		tmp = parts.base, tmp1 = parts.index;
1395 
1396 	      emit_insn (gen_rtx_SET (target, tmp));
1397 
1398 	      if (parts.disp && parts.disp != const0_rtx)
1399 		ix86_emit_binop (PLUS, mode, target, parts.disp);
1400 
1401 	      ix86_emit_binop (PLUS, mode, target, tmp1);
1402 	      return;
1403 	    }
1404 
1405 	  ix86_emit_binop (PLUS, mode, target, tmp);
1406 	}
1407 
1408       if (parts.disp && parts.disp != const0_rtx)
1409 	ix86_emit_binop (PLUS, mode, target, parts.disp);
1410     }
1411 }
1412 
1413 /* Post-reload splitter for converting an SF or DFmode value in an
1414    SSE register into an unsigned SImode value.  */
1415 
1416 void
1417 ix86_split_convert_uns_si_sse (rtx operands[])
1418 {
1419   machine_mode vecmode;
1420   rtx value, large, zero_or_two31, input, two31, x;
1421 
1422   large = operands[1];
1423   zero_or_two31 = operands[2];
1424   input = operands[3];
1425   two31 = operands[4];
1426   vecmode = GET_MODE (large);
1427   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1428 
1429   /* Load up the value into the low element.  We must ensure that the other
1430      elements are valid floats -- zero is the easiest such value.  */
1431   if (MEM_P (input))
1432     {
1433       if (vecmode == V4SFmode)
1434 	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1435       else
1436 	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1437     }
1438   else
1439     {
1440       input = gen_rtx_REG (vecmode, REGNO (input));
1441       emit_move_insn (value, CONST0_RTX (vecmode));
1442       if (vecmode == V4SFmode)
1443 	emit_insn (gen_sse_movss (value, value, input));
1444       else
1445 	emit_insn (gen_sse2_movsd (value, value, input));
1446     }
1447 
1448   emit_move_insn (large, two31);
1449   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1450 
1451   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1452   emit_insn (gen_rtx_SET (large, x));
1453 
1454   x = gen_rtx_AND (vecmode, zero_or_two31, large);
1455   emit_insn (gen_rtx_SET (zero_or_two31, x));
1456 
1457   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1458   emit_insn (gen_rtx_SET (value, x));
1459 
1460   large = gen_rtx_REG (V4SImode, REGNO (large));
1461   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1462 
1463   x = gen_rtx_REG (V4SImode, REGNO (value));
1464   if (vecmode == V4SFmode)
1465     emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1466   else
1467     emit_insn (gen_sse2_cvttpd2dq (x, value));
1468   value = x;
1469 
1470   emit_insn (gen_xorv4si3 (value, value, large));
1471 }
1472 
1473 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1474 						 machine_mode mode, rtx target,
1475 						 rtx var, int one_var);
1476 
1477 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1478    Expects the 64-bit DImode to be supplied in a pair of integral
1479    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
1480    -mfpmath=sse, !optimize_size only.  */
1481 
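/* A worked example of the bias trick used below (illustrative only): for
   the input 2^33 + 5 the low half is 5 and the high half is 2.  After
   interleaving with the exponent words the two doubles are
   (0x1.0p52 + 5) and (0x1.0p84 + 2 * 2^32); subtracting the biases
   0x1.0p52 and 0x1.0p84 and adding the two results gives
   5 + 2 * 2^32 = 2^33 + 5 exactly.  */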
1482 void
1483 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1484 {
1485   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1486   rtx int_xmm, fp_xmm;
1487   rtx biases, exponents;
1488   rtx x;
1489 
1490   int_xmm = gen_reg_rtx (V4SImode);
1491   if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1492     emit_insn (gen_movdi_to_sse (int_xmm, input));
1493   else if (TARGET_SSE_SPLIT_REGS)
1494     {
1495       emit_clobber (int_xmm);
1496       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1497     }
1498   else
1499     {
1500       x = gen_reg_rtx (V2DImode);
1501       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1502       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1503     }
1504 
1505   x = gen_rtx_CONST_VECTOR (V4SImode,
1506 			    gen_rtvec (4, GEN_INT (0x43300000UL),
1507 				       GEN_INT (0x45300000UL),
1508 				       const0_rtx, const0_rtx));
1509   exponents = validize_mem (force_const_mem (V4SImode, x));
1510 
1511   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1512   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1513 
1514   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1515      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1516      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1517      (0x1.0p84 + double(fp_value_hi_xmm)).
1518      Note these exponents differ by 32.  */
1519 
1520   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1521 
1522   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1523      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
1524   real_ldexp (&bias_lo_rvt, &dconst1, 52);
1525   real_ldexp (&bias_hi_rvt, &dconst1, 84);
1526   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1527   x = const_double_from_real_value (bias_hi_rvt, DFmode);
1528   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1529   biases = validize_mem (force_const_mem (V2DFmode, biases));
1530   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1531 
1532   /* Add the upper and lower DFmode values together.  */
1533   if (TARGET_SSE3)
1534     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1535   else
1536     {
1537       x = copy_to_mode_reg (V2DFmode, fp_xmm);
1538       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1539       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1540     }
1541 
1542   ix86_expand_vector_extract (false, target, fp_xmm, 0);
1543 }
1544 
1545 /* Not used, but eases macroization of patterns.  */
1546 void
1547 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1548 {
1549   gcc_unreachable ();
1550 }
1551 
1552 /* Convert an unsigned SImode value into a DFmode.  Only currently used
1553    for SSE, but applicable anywhere.  */
1554 
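/* A short worked example (illustrative only): the expansion below computes
   (double) (int) (u - 0x80000000) + 0x1.0p31.  For u = 0xffffffff this is
   (double) 0x7fffffff + 2147483648.0 = 4294967295.0, and for u = 0 it is
   (double) INT_MIN + 2147483648.0 = 0.0.  */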
1555 void
1556 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1557 {
1558   REAL_VALUE_TYPE TWO31r;
1559   rtx x, fp;
1560 
1561   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1562 			   NULL, 1, OPTAB_DIRECT);
1563 
1564   fp = gen_reg_rtx (DFmode);
1565   emit_insn (gen_floatsidf2 (fp, x));
1566 
1567   real_ldexp (&TWO31r, &dconst1, 31);
1568   x = const_double_from_real_value (TWO31r, DFmode);
1569 
1570   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1571   if (x != target)
1572     emit_move_insn (target, x);
1573 }
1574 
1575 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
1576    32-bit mode; otherwise we have a direct convert instruction.  */
1577 
1578 void
1579 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1580 {
1581   REAL_VALUE_TYPE TWO32r;
1582   rtx fp_lo, fp_hi, x;
1583 
1584   fp_lo = gen_reg_rtx (DFmode);
1585   fp_hi = gen_reg_rtx (DFmode);
1586 
1587   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1588 
1589   real_ldexp (&TWO32r, &dconst1, 32);
1590   x = const_double_from_real_value (TWO32r, DFmode);
1591   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1592 
1593   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1594 
1595   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1596 			   0, OPTAB_DIRECT);
1597   if (x != target)
1598     emit_move_insn (target, x);
1599 }
1600 
1601 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1602    For x86_32, -mfpmath=sse, !optimize_size only.  */
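/* A short worked example (illustrative only): the input is split as
   u = hi * 2^16 + lo with lo = u & 0xffff and hi = u >> 16.  Both halves
   convert to SFmode exactly, and the result is (float) hi * 65536.0f
   + (float) lo, so the only rounding happens in the final addition.  */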
1603 void
1604 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1605 {
1606   REAL_VALUE_TYPE ONE16r;
1607   rtx fp_hi, fp_lo, int_hi, int_lo, x;
1608 
1609   real_ldexp (&ONE16r, &dconst1, 16);
1610   x = const_double_from_real_value (ONE16r, SFmode);
1611   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1612 				      NULL, 0, OPTAB_DIRECT);
1613   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1614 				      NULL, 0, OPTAB_DIRECT);
1615   fp_hi = gen_reg_rtx (SFmode);
1616   fp_lo = gen_reg_rtx (SFmode);
1617   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1618   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1619   fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1620 			       0, OPTAB_DIRECT);
1621   fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1622 			       0, OPTAB_DIRECT);
1623   if (!rtx_equal_p (target, fp_hi))
1624     emit_move_insn (target, fp_hi);
1625 }
1626 
1627 /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
1628    a vector of unsigned ints VAL to vector of floats TARGET.  */
1629 
1630 void
1631 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1632 {
1633   rtx tmp[8];
1634   REAL_VALUE_TYPE TWO16r;
1635   machine_mode intmode = GET_MODE (val);
1636   machine_mode fltmode = GET_MODE (target);
1637   rtx (*cvt) (rtx, rtx);
1638 
1639   if (intmode == V4SImode)
1640     cvt = gen_floatv4siv4sf2;
1641   else
1642     cvt = gen_floatv8siv8sf2;
1643   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1644   tmp[0] = force_reg (intmode, tmp[0]);
1645   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1646 				OPTAB_DIRECT);
1647   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1648 				NULL_RTX, 1, OPTAB_DIRECT);
1649   tmp[3] = gen_reg_rtx (fltmode);
1650   emit_insn (cvt (tmp[3], tmp[1]));
1651   tmp[4] = gen_reg_rtx (fltmode);
1652   emit_insn (cvt (tmp[4], tmp[2]));
1653   real_ldexp (&TWO16r, &dconst1, 16);
1654   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1655   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1656   tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1657 				OPTAB_DIRECT);
1658   tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1659 				OPTAB_DIRECT);
1660   if (tmp[7] != target)
1661     emit_move_insn (target, tmp[7]);
1662 }
1663 
1664 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1665    pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1666    This is done by doing just signed conversion if < 0x1p31, and otherwise by
1667    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
1668 
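/* A short worked example (illustrative only): a lane holding 3.5e9
   (>= 0x1p31) has 0x1p31 subtracted before the signed conversion, giving
   1352516352, and gets 0x80000000 recorded in *XORP for that lane; the
   xor applied afterwards restores 3500000000.  Lanes below 0x1p31
   subtract zero and get a zero xor mask, so they convert directly.  */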
1669 rtx
1670 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1671 {
1672   REAL_VALUE_TYPE TWO31r;
1673   rtx two31r, tmp[4];
1674   machine_mode mode = GET_MODE (val);
1675   machine_mode scalarmode = GET_MODE_INNER (mode);
1676   machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1677   rtx (*cmp) (rtx, rtx, rtx, rtx);
1678   int i;
1679 
1680   for (i = 0; i < 3; i++)
1681     tmp[i] = gen_reg_rtx (mode);
1682   real_ldexp (&TWO31r, &dconst1, 31);
1683   two31r = const_double_from_real_value (TWO31r, scalarmode);
1684   two31r = ix86_build_const_vector (mode, 1, two31r);
1685   two31r = force_reg (mode, two31r);
1686   switch (mode)
1687     {
1688     case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1689     case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1690     case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1691     case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1692     default: gcc_unreachable ();
1693     }
1694   tmp[3] = gen_rtx_LE (mode, two31r, val);
1695   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1696   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1697 				0, OPTAB_DIRECT);
1698   if (intmode == V4SImode || TARGET_AVX2)
1699     *xorp = expand_simple_binop (intmode, ASHIFT,
1700 				 gen_lowpart (intmode, tmp[0]),
1701 				 GEN_INT (31), NULL_RTX, 0,
1702 				 OPTAB_DIRECT);
1703   else
1704     {
1705       rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1706       two31 = ix86_build_const_vector (intmode, 1, two31);
1707       *xorp = expand_simple_binop (intmode, AND,
1708 				   gen_lowpart (intmode, tmp[0]),
1709 				   two31, NULL_RTX, 0,
1710 				   OPTAB_DIRECT);
1711     }
1712   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1713 			      0, OPTAB_DIRECT);
1714 }
1715 
1716 /* Generate code for floating point ABS or NEG.  */
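/* With SSE this is done with a bitwise mask on the sign bit: ABS clears it
   (AND with the complemented sign-bit mask), NEG flips it (XOR with the
   sign-bit mask); the mask is attached here via a USE and the logic insn is
   produced by the matching pattern.  Scalar (non-vector) forms also carry a
   flags clobber so the insn can later be implemented with integer
   operations (see ix86_split_fp_absneg_operator below).  */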
1717 
1718 void
1719 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1720 				rtx operands[])
1721 {
1722   rtx set, dst, src;
1723   bool use_sse = false;
1724   bool vector_mode = VECTOR_MODE_P (mode);
1725   machine_mode vmode = mode;
1726   rtvec par;
1727 
1728   if (vector_mode)
1729     use_sse = true;
1730   else if (mode == TFmode)
1731     use_sse = true;
1732   else if (TARGET_SSE_MATH)
1733     {
1734       use_sse = SSE_FLOAT_MODE_P (mode);
1735       if (mode == SFmode)
1736 	vmode = V4SFmode;
1737       else if (mode == DFmode)
1738 	vmode = V2DFmode;
1739     }
1740 
1741   dst = operands[0];
1742   src = operands[1];
1743 
1744   set = gen_rtx_fmt_e (code, mode, src);
1745   set = gen_rtx_SET (dst, set);
1746 
1747   if (use_sse)
1748     {
1749       rtx mask, use, clob;
1750 
1751       /* NEG and ABS performed with SSE use bitwise mask operations.
1752 	 Create the appropriate mask now.  */
1753       mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1754       use = gen_rtx_USE (VOIDmode, mask);
1755       if (vector_mode)
1756 	par = gen_rtvec (2, set, use);
1757       else
1758 	{
1759           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1760 	  par = gen_rtvec (3, set, use, clob);
1761         }
1762     }
1763   else
1764     {
1765       rtx clob;
1766 
1767       /* The sign of an FP value can also be changed using the integer unit.  */
1768       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1769       par = gen_rtvec (2, set, clob);
1770     }
1771 
1772   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1773 }
1774 
1775 /* Deconstruct a floating point ABS or NEG operation
1776    with integer registers into integer operations.  */
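/* SFmode operates on the SImode view of the value (AND 0x7fffffff for ABS,
   XOR 0x80000000 for NEG).  DFmode either touches bit 63 via a ZERO_EXTRACT
   on 64-bit targets or just the high SImode word on 32-bit targets, and
   XFmode masks the word that holds the sign and exponent.  */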
1777 
1778 void
1779 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1780 			       rtx operands[])
1781 {
1782   enum rtx_code absneg_op;
1783   rtx dst, set;
1784 
1785   gcc_assert (operands_match_p (operands[0], operands[1]));
1786 
1787   switch (mode)
1788     {
1789     case E_SFmode:
1790       dst = gen_lowpart (SImode, operands[0]);
1791 
1792       if (code == ABS)
1793 	{
1794 	  set = gen_int_mode (0x7fffffff, SImode);
1795 	  absneg_op = AND;
1796 	}
1797       else
1798 	{
1799 	  set = gen_int_mode (0x80000000, SImode);
1800 	  absneg_op = XOR;
1801 	}
1802       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1803       break;
1804 
1805     case E_DFmode:
1806       if (TARGET_64BIT)
1807 	{
1808 	  dst = gen_lowpart (DImode, operands[0]);
1809 	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1810 
1811 	  if (code == ABS)
1812 	    set = const0_rtx;
1813 	  else
1814 	    set = gen_rtx_NOT (DImode, dst);
1815 	}
1816       else
1817 	{
1818 	  dst = gen_highpart (SImode, operands[0]);
1819 
1820 	  if (code == ABS)
1821 	    {
1822 	      set = gen_int_mode (0x7fffffff, SImode);
1823 	      absneg_op = AND;
1824 	    }
1825 	  else
1826 	    {
1827 	      set = gen_int_mode (0x80000000, SImode);
1828 	      absneg_op = XOR;
1829 	    }
1830 	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1831 	}
1832       break;
1833 
1834     case E_XFmode:
1835       dst = gen_rtx_REG (SImode,
1836 			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1837       if (code == ABS)
1838 	{
1839 	  set = GEN_INT (0x7fff);
1840 	  absneg_op = AND;
1841 	}
1842       else
1843 	{
1844 	  set = GEN_INT (0x8000);
1845 	  absneg_op = XOR;
1846 	}
1847       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1848       break;
1849 
1850     default:
1851       gcc_unreachable ();
1852     }
1853 
1854   set = gen_rtx_SET (dst, set);
1855 
1856   rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1857   rtvec par = gen_rtvec (2, set, clob);
1858 
1859   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1860 }
1861 
1862 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
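/* copysign (x, y) is computed as (x & ~signmask) | (y & signmask).  When x
   is a constant it is replaced by its absolute value up front, so only
   extracting the sign of y and ORing it in remains to be done.  */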
1863 
1864 void
1865 ix86_expand_copysign (rtx operands[])
1866 {
1867   machine_mode mode, vmode;
1868   rtx dest, op0, op1, mask;
1869 
1870   dest = operands[0];
1871   op0 = operands[1];
1872   op1 = operands[2];
1873 
1874   mode = GET_MODE (dest);
1875 
1876   if (mode == SFmode)
1877     vmode = V4SFmode;
1878   else if (mode == DFmode)
1879     vmode = V2DFmode;
1880   else if (mode == TFmode)
1881     vmode = mode;
1882   else
1883     gcc_unreachable ();
1884 
1885   mask = ix86_build_signbit_mask (vmode, 0, 0);
1886 
1887   if (CONST_DOUBLE_P (op0))
1888     {
1889       if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1890 	op0 = simplify_unary_operation (ABS, mode, op0, mode);
1891 
1892       if (mode == SFmode || mode == DFmode)
1893 	{
1894 	  if (op0 == CONST0_RTX (mode))
1895 	    op0 = CONST0_RTX (vmode);
1896 	  else
1897 	    {
1898 	      rtx v = ix86_build_const_vector (vmode, false, op0);
1899 
1900 	      op0 = force_reg (vmode, v);
1901 	    }
1902 	}
1903       else if (op0 != CONST0_RTX (mode))
1904 	op0 = force_reg (mode, op0);
1905 
1906       emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1907     }
1908   else
1909     {
1910       rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1911 
1912       emit_insn (gen_copysign3_var
1913 		 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1914     }
1915 }
1916 
1917 /* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
1918    be a constant, and so has already been expanded into a vector constant.  */
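/* The AND below keeps only the sign bit(s) of the value already in DEST,
   and the IOR (skipped when the constant is zero) then merges in the
   absolute-value constant OP0.  */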
1919 
1920 void
1921 ix86_split_copysign_const (rtx operands[])
1922 {
1923   machine_mode mode, vmode;
1924   rtx dest, op0, mask, x;
1925 
1926   dest = operands[0];
1927   op0 = operands[1];
1928   mask = operands[3];
1929 
1930   mode = GET_MODE (dest);
1931   vmode = GET_MODE (mask);
1932 
1933   dest = lowpart_subreg (vmode, dest, mode);
1934   x = gen_rtx_AND (vmode, dest, mask);
1935   emit_insn (gen_rtx_SET (dest, x));
1936 
1937   if (op0 != CONST0_RTX (vmode))
1938     {
1939       x = gen_rtx_IOR (vmode, dest, op0);
1940       emit_insn (gen_rtx_SET (dest, x));
1941     }
1942 }
1943 
1944 /* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
1945    so we have to do two masks.  */
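/* The end result is (op0 & nmask) | (op1 & mask), where mask keeps only the
   sign bit(s) and nmask is its complement; the alternatives below differ
   only in which operand shares a register with the destination or the
   scratch.  */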
1946 
1947 void
1948 ix86_split_copysign_var (rtx operands[])
1949 {
1950   machine_mode mode, vmode;
1951   rtx dest, scratch, op0, op1, mask, nmask, x;
1952 
1953   dest = operands[0];
1954   scratch = operands[1];
1955   op0 = operands[2];
1956   op1 = operands[3];
1957   nmask = operands[4];
1958   mask = operands[5];
1959 
1960   mode = GET_MODE (dest);
1961   vmode = GET_MODE (mask);
1962 
1963   if (rtx_equal_p (op0, op1))
1964     {
1965       /* Shouldn't happen often (it's useless, obviously), but when it does
1966 	 we'd generate incorrect code if we continue below.  */
1967       emit_move_insn (dest, op0);
1968       return;
1969     }
1970 
1971   if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
1972     {
1973       gcc_assert (REGNO (op1) == REGNO (scratch));
1974 
1975       x = gen_rtx_AND (vmode, scratch, mask);
1976       emit_insn (gen_rtx_SET (scratch, x));
1977 
1978       dest = mask;
1979       op0 = lowpart_subreg (vmode, op0, mode);
1980       x = gen_rtx_NOT (vmode, dest);
1981       x = gen_rtx_AND (vmode, x, op0);
1982       emit_insn (gen_rtx_SET (dest, x));
1983     }
1984   else
1985     {
1986       if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
1987 	{
1988 	  x = gen_rtx_AND (vmode, scratch, mask);
1989 	}
1990       else						/* alternative 2,4 */
1991 	{
1992           gcc_assert (REGNO (mask) == REGNO (scratch));
1993           op1 = lowpart_subreg (vmode, op1, mode);
1994 	  x = gen_rtx_AND (vmode, scratch, op1);
1995 	}
1996       emit_insn (gen_rtx_SET (scratch, x));
1997 
1998       if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
1999 	{
2000 	  dest = lowpart_subreg (vmode, op0, mode);
2001 	  x = gen_rtx_AND (vmode, dest, nmask);
2002 	}
2003       else						/* alternative 3,4 */
2004 	{
2005           gcc_assert (REGNO (nmask) == REGNO (dest));
2006 	  dest = nmask;
2007 	  op0 = lowpart_subreg (vmode, op0, mode);
2008 	  x = gen_rtx_AND (vmode, dest, op0);
2009 	}
2010       emit_insn (gen_rtx_SET (dest, x));
2011     }
2012 
2013   x = gen_rtx_IOR (vmode, dest, scratch);
2014   emit_insn (gen_rtx_SET (dest, x));
2015 }
2016 
2017 /* Expand an xorsign operation.  */
2018 
2019 void
2020 ix86_expand_xorsign (rtx operands[])
2021 {
2022   machine_mode mode, vmode;
2023   rtx dest, op0, op1, mask;
2024 
2025   dest = operands[0];
2026   op0 = operands[1];
2027   op1 = operands[2];
2028 
2029   mode = GET_MODE (dest);
2030 
2031   if (mode == SFmode)
2032     vmode = V4SFmode;
2033   else if (mode == DFmode)
2034     vmode = V2DFmode;
2035   else
2036     gcc_unreachable ();
2037 
2038   mask = ix86_build_signbit_mask (vmode, 0, 0);
2039 
2040   emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2041 }
2042 
2043 /* Deconstruct an xorsign operation into bit masks.  */
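/* This computes dest = op0 ^ (value in DEST & signmask): the AND isolates
   the sign bit of the sign source, and the XOR merges it into OP0, flipping
   the sign of OP0 whenever the sign source is negative.  */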
2044 
2045 void
2046 ix86_split_xorsign (rtx operands[])
2047 {
2048   machine_mode mode, vmode;
2049   rtx dest, op0, mask, x;
2050 
2051   dest = operands[0];
2052   op0 = operands[1];
2053   mask = operands[3];
2054 
2055   mode = GET_MODE (dest);
2056   vmode = GET_MODE (mask);
2057 
2058   dest = lowpart_subreg (vmode, dest, mode);
2059   x = gen_rtx_AND (vmode, dest, mask);
2060   emit_insn (gen_rtx_SET (dest, x));
2061 
2062   op0 = lowpart_subreg (vmode, op0, mode);
2063   x = gen_rtx_XOR (vmode, dest, op0);
2064   emit_insn (gen_rtx_SET (dest, x));
2065 }
2066 
2067 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2068 
2069 void
2070 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2071 {
2072   machine_mode mode = GET_MODE (op0);
2073   rtx tmp;
2074 
2075   /* Handle the special case of a vector comparison with a boolean result;
2076      transform it using the ptest instruction.  */
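  /* The whole-vector EQ/NE check reduces to a ptest of op0 ^ op1 against
     itself: ZF is set iff every bit of the xor is zero, i.e. iff the two
     vectors are equal.  */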
2077   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2078     {
2079       rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2080       machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2081 
2082       gcc_assert (code == EQ || code == NE);
2083       /* Generate XOR since we can't check that one operand is zero vector.  */
2084       tmp = gen_reg_rtx (mode);
2085       emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2086       tmp = gen_lowpart (p_mode, tmp);
2087       emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2088 			      gen_rtx_UNSPEC (CCmode,
2089 					      gen_rtvec (2, tmp, tmp),
2090 					      UNSPEC_PTEST)));
2091       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2092       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2093 				  gen_rtx_LABEL_REF (VOIDmode, label),
2094 				  pc_rtx);
2095       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2096       return;
2097     }
2098 
2099   switch (mode)
2100     {
2101     case E_SFmode:
2102     case E_DFmode:
2103     case E_XFmode:
2104     case E_QImode:
2105     case E_HImode:
2106     case E_SImode:
2107       simple:
2108       tmp = ix86_expand_compare (code, op0, op1);
2109       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2110 				  gen_rtx_LABEL_REF (VOIDmode, label),
2111 				  pc_rtx);
2112       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2113       return;
2114 
2115     case E_DImode:
2116       if (TARGET_64BIT)
2117 	goto simple;
2118       /* For 32-bit targets a DImode comparison may be performed in
2119 	 SSE registers.  To allow this we must avoid the split into
2120 	 SImode, which is achieved by doing the xor in DImode and
2121 	 then comparing with zero (a form recognized by the STV
2122 	 pass).  We don't compare using xor when optimizing
2123 	 for size.  */
2124       if (!optimize_insn_for_size_p ()
2125 	  && TARGET_STV
2126 	  && (code == EQ || code == NE))
2127 	{
2128 	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2129 	  op1 = const0_rtx;
2130 	}
2131       /* FALLTHRU */
2132     case E_TImode:
2133       /* Expand DImode branch into multiple compare+branch.  */
2134       {
2135 	rtx lo[2], hi[2];
2136 	rtx_code_label *label2;
2137 	enum rtx_code code1, code2, code3;
2138 	machine_mode submode;
2139 
2140 	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2141 	  {
2142 	    std::swap (op0, op1);
2143 	    code = swap_condition (code);
2144 	  }
2145 
2146 	split_double_mode (mode, &op0, 1, lo+0, hi+0);
2147 	split_double_mode (mode, &op1, 1, lo+1, hi+1);
2148 
2149 	submode = mode == DImode ? SImode : DImode;
2150 
2151 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2152 	   avoid two branches.  This costs one extra insn, so disable when
2153 	   optimizing for size.  */
2154 
2155 	if ((code == EQ || code == NE)
2156 	    && (!optimize_insn_for_size_p ()
2157 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
2158 	  {
2159 	    rtx xor0, xor1;
2160 
2161 	    xor1 = hi[0];
2162 	    if (hi[1] != const0_rtx)
2163 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2164 				   NULL_RTX, 0, OPTAB_WIDEN);
2165 
2166 	    xor0 = lo[0];
2167 	    if (lo[1] != const0_rtx)
2168 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2169 				   NULL_RTX, 0, OPTAB_WIDEN);
2170 
2171 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
2172 				NULL_RTX, 0, OPTAB_WIDEN);
2173 
2174 	    ix86_expand_branch (code, tmp, const0_rtx, label);
2175 	    return;
2176 	  }
2177 
2178 	/* Otherwise, if we are doing less-than or greater-or-equal-than,
2179 	   op1 is a constant and the low word is zero, then we can just
2180 	   examine the high word.  Similarly for low word -1 and
2181 	   less-or-equal-than or greater-than.  */
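	/* E.g. (unsigned) a < 0x100000000 reduces to hi(a) < 1, because the
	   low-word test lo(a) < 0 can never be true; likewise
	   (unsigned) a <= 0x1ffffffff reduces to hi(a) <= 1.  */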
2182 
2183 	if (CONST_INT_P (hi[1]))
2184 	  switch (code)
2185 	    {
2186 	    case LT: case LTU: case GE: case GEU:
2187 	      if (lo[1] == const0_rtx)
2188 		{
2189 		  ix86_expand_branch (code, hi[0], hi[1], label);
2190 		  return;
2191 		}
2192 	      break;
2193 	    case LE: case LEU: case GT: case GTU:
2194 	      if (lo[1] == constm1_rtx)
2195 		{
2196 		  ix86_expand_branch (code, hi[0], hi[1], label);
2197 		  return;
2198 		}
2199 	      break;
2200 	    default:
2201 	      break;
2202 	    }
2203 
2204 	/* Emulate comparisons that do not depend on Zero flag with
2205 	   double-word subtraction.  Note that only Overflow, Sign
2206 	   and Carry flags are valid, so swap arguments and condition
2207 	   of comparisons that would otherwise test Zero flag.  */
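	/* E.g. an unsigned a < b is emitted as a compare of the low words
	   followed by a subtract-with-borrow of the high words into a
	   scratch register; only the resulting carry (or sign/overflow for
	   signed compares) feeds the branch.  */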
2208 
2209 	switch (code)
2210 	  {
2211 	  case LE: case LEU: case GT: case GTU:
2212 	    std::swap (lo[0], lo[1]);
2213 	    std::swap (hi[0], hi[1]);
2214 	    code = swap_condition (code);
2215 	    /* FALLTHRU */
2216 
2217 	  case LT: case LTU: case GE: case GEU:
2218 	    {
2219 	      bool uns = (code == LTU || code == GEU);
2220 	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2221 		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2222 
2223 	      if (!nonimmediate_operand (lo[0], submode))
2224 		lo[0] = force_reg (submode, lo[0]);
2225 	      if (!x86_64_general_operand (lo[1], submode))
2226 		lo[1] = force_reg (submode, lo[1]);
2227 
2228 	      if (!register_operand (hi[0], submode))
2229 		hi[0] = force_reg (submode, hi[0]);
2230 	      if ((uns && !nonimmediate_operand (hi[1], submode))
2231 		  || (!uns && !x86_64_general_operand (hi[1], submode)))
2232 		hi[1] = force_reg (submode, hi[1]);
2233 
2234 	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2235 
2236 	      tmp = gen_rtx_SCRATCH (submode);
2237 	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2238 
2239 	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2240 	      ix86_expand_branch (code, tmp, const0_rtx, label);
2241 	      return;
2242 	    }
2243 
2244 	  default:
2245 	    break;
2246 	  }
2247 
2248 	/* Otherwise, we need two or three jumps.  */
2249 
2250 	label2 = gen_label_rtx ();
2251 
2252 	code1 = code;
2253 	code2 = swap_condition (code);
2254 	code3 = unsigned_condition (code);
2255 
2256 	switch (code)
2257 	  {
2258 	  case LT: case GT: case LTU: case GTU:
2259 	    break;
2260 
2261 	  case LE:   code1 = LT;  code2 = GT;  break;
2262 	  case GE:   code1 = GT;  code2 = LT;  break;
2263 	  case LEU:  code1 = LTU; code2 = GTU; break;
2264 	  case GEU:  code1 = GTU; code2 = LTU; break;
2265 
2266 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
2267 	  case NE:   code2 = UNKNOWN; break;
2268 
2269 	  default:
2270 	    gcc_unreachable ();
2271 	  }
2272 
2273 	/*
2274 	 * a < b =>
2275 	 *    if (hi(a) < hi(b)) goto true;
2276 	 *    if (hi(a) > hi(b)) goto false;
2277 	 *    if (lo(a) < lo(b)) goto true;
2278 	 *  false:
2279 	 */
2280 
2281 	if (code1 != UNKNOWN)
2282 	  ix86_expand_branch (code1, hi[0], hi[1], label);
2283 	if (code2 != UNKNOWN)
2284 	  ix86_expand_branch (code2, hi[0], hi[1], label2);
2285 
2286 	ix86_expand_branch (code3, lo[0], lo[1], label);
2287 
2288 	if (code2 != UNKNOWN)
2289 	  emit_label (label2);
2290 	return;
2291       }
2292 
2293     default:
2294       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2295       goto simple;
2296     }
2297 }
2298 
2299 /* Figure out whether to use unordered fp comparisons.  */
2300 
2301 static bool
2302 ix86_unordered_fp_compare (enum rtx_code code)
2303 {
2304   if (!TARGET_IEEE_FP)
2305     return false;
2306 
2307   switch (code)
2308     {
2309     case LT:
2310     case LE:
2311     case GT:
2312     case GE:
2313     case LTGT:
2314       return false;
2315 
2316     case EQ:
2317     case NE:
2318 
2319     case UNORDERED:
2320     case ORDERED:
2321     case UNLT:
2322     case UNLE:
2323     case UNGT:
2324     case UNGE:
2325     case UNEQ:
2326       return true;
2327 
2328     default:
2329       gcc_unreachable ();
2330     }
2331 }
2332 
2333 /* Return a comparison we can do and that it is equivalent to
2334    swap_condition (code) apart possibly from orderedness.
2335    But, never change orderedness if TARGET_IEEE_FP, returning
2336    UNKNOWN in that case if necessary.  */
2337 
2338 static enum rtx_code
2339 ix86_fp_swap_condition (enum rtx_code code)
2340 {
2341   switch (code)
2342     {
2343     case GT:                   /* GTU - CF=0 & ZF=0 */
2344       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2345     case GE:                   /* GEU - CF=0 */
2346       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2347     case UNLT:                 /* LTU - CF=1 */
2348       return TARGET_IEEE_FP ? UNKNOWN : GT;
2349     case UNLE:                 /* LEU - CF=1 | ZF=1 */
2350       return TARGET_IEEE_FP ? UNKNOWN : GE;
2351     default:
2352       return swap_condition (code);
2353     }
2354 }
2355 
2356 /* Return the cost of comparison CODE using the best strategy for performance.
2357    All of the following functions use the number of instructions as the
2358    cost metric.  In the future this should be tweaked to compute bytes for
2359    optimize_size and take into account instruction performance on various CPUs.  */
2360 
2361 static int
2362 ix86_fp_comparison_cost (enum rtx_code code)
2363 {
2364   int arith_cost;
2365 
2366   /* The cost of code using bit-twiddling on %ah.  */
2367   switch (code)
2368     {
2369     case UNLE:
2370     case UNLT:
2371     case LTGT:
2372     case GT:
2373     case GE:
2374     case UNORDERED:
2375     case ORDERED:
2376     case UNEQ:
2377       arith_cost = 4;
2378       break;
2379     case LT:
2380     case NE:
2381     case EQ:
2382     case UNGE:
2383       arith_cost = TARGET_IEEE_FP ? 5 : 4;
2384       break;
2385     case LE:
2386     case UNGT:
2387       arith_cost = TARGET_IEEE_FP ? 6 : 4;
2388       break;
2389     default:
2390       gcc_unreachable ();
2391     }
2392 
2393   switch (ix86_fp_comparison_strategy (code))
2394     {
2395     case IX86_FPCMP_COMI:
2396       return arith_cost > 4 ? 3 : 2;
2397     case IX86_FPCMP_SAHF:
2398       return arith_cost > 4 ? 4 : 3;
2399     default:
2400       return arith_cost;
2401     }
2402 }
2403 
2404 /* Swap, force into registers, or otherwise massage the two operands
2405    to a fp comparison.  The operands are updated in place; the new
2406    comparison code is returned.  */
2407 
2408 static enum rtx_code
2409 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2410 {
2411   bool unordered_compare = ix86_unordered_fp_compare (code);
2412   rtx op0 = *pop0, op1 = *pop1;
2413   machine_mode op_mode = GET_MODE (op0);
2414   bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2415 
2416   /* All of the unordered compare instructions only work on registers.
2417      The same is true of the fcomi compare instructions.  The XFmode
2418      compare instructions require registers except when comparing
2419      against zero or when converting operand 1 from fixed point to
2420      floating point.  */
2421 
2422   if (!is_sse
2423       && (unordered_compare
2424 	  || (op_mode == XFmode
2425 	      && ! (standard_80387_constant_p (op0) == 1
2426 		    || standard_80387_constant_p (op1) == 1)
2427 	      && GET_CODE (op1) != FLOAT)
2428 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2429     {
2430       op0 = force_reg (op_mode, op0);
2431       op1 = force_reg (op_mode, op1);
2432     }
2433   else
2434     {
2435       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
2436 	 things around if they appear profitable, otherwise force op0
2437 	 into a register.  */
2438 
2439       if (standard_80387_constant_p (op0) == 0
2440 	  || (MEM_P (op0)
2441 	      && ! (standard_80387_constant_p (op1) == 0
2442 		    || MEM_P (op1))))
2443 	{
2444 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
2445 	  if (new_code != UNKNOWN)
2446 	    {
2447 	      std::swap (op0, op1);
2448 	      code = new_code;
2449 	    }
2450 	}
2451 
2452       if (!REG_P (op0))
2453 	op0 = force_reg (op_mode, op0);
2454 
2455       if (CONSTANT_P (op1))
2456 	{
2457 	  int tmp = standard_80387_constant_p (op1);
2458 	  if (tmp == 0)
2459 	    op1 = validize_mem (force_const_mem (op_mode, op1));
2460 	  else if (tmp == 1)
2461 	    {
2462 	      if (TARGET_CMOVE)
2463 		op1 = force_reg (op_mode, op1);
2464 	    }
2465 	  else
2466 	    op1 = force_reg (op_mode, op1);
2467 	}
2468     }
2469 
2470   /* Try to rearrange the comparison to make it cheaper.  */
2471   if (ix86_fp_comparison_cost (code)
2472       > ix86_fp_comparison_cost (swap_condition (code))
2473       && (REG_P (op1) || can_create_pseudo_p ()))
2474     {
2475       std::swap (op0, op1);
2476       code = swap_condition (code);
2477       if (!REG_P (op0))
2478 	op0 = force_reg (op_mode, op0);
2479     }
2480 
2481   *pop0 = op0;
2482   *pop1 = op1;
2483   return code;
2484 }
2485 
2486 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
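/* Depending on ix86_fp_comparison_strategy this uses either a flag-setting
   compare (IX86_FPCMP_COMI), fnstsw followed by sahf (IX86_FPCMP_SAHF), or
   fnstsw followed by bit tests on the %ah copy of the FPU status word
   (IX86_FPCMP_ARITH).  */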
2487 
2488 static rtx
2489 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2490 {
2491   bool unordered_compare = ix86_unordered_fp_compare (code);
2492   machine_mode cmp_mode;
2493   rtx tmp, scratch;
2494 
2495   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2496 
2497   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2498   if (unordered_compare)
2499     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2500 
2501   /* Do fcomi/sahf based test when profitable.  */
2502   switch (ix86_fp_comparison_strategy (code))
2503     {
2504     case IX86_FPCMP_COMI:
2505       cmp_mode = CCFPmode;
2506       emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2507       break;
2508 
2509     case IX86_FPCMP_SAHF:
2510       cmp_mode = CCFPmode;
2511       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2512       scratch = gen_reg_rtx (HImode);
2513       emit_insn (gen_rtx_SET (scratch, tmp));
2514       emit_insn (gen_x86_sahf_1 (scratch));
2515       break;
2516 
2517     case IX86_FPCMP_ARITH:
2518       cmp_mode = CCNOmode;
2519       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2520       scratch = gen_reg_rtx (HImode);
2521       emit_insn (gen_rtx_SET (scratch, tmp));
2522 
2523       /* In the unordered case, we have to check C2 for NaN's, which
2524 	 doesn't happen to work out to anything nice combination-wise.
2525 	 So do some bit twiddling on the value we've got in AH to come
2526 	 up with an appropriate set of condition codes.  */
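      /* After fnstsw, %ah holds the FPU condition codes: C0 is 0x01, C2 is
	 0x04 and C3 is 0x40 there, so 0x45 masks all three.  fcom sets C0
	 for "less than", C3 for "equal", and C0, C2 and C3 for unordered.  */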
2527 
2528       switch (code)
2529 	{
2530 	case GT:
2531 	case UNGT:
2532 	  if (code == GT || !TARGET_IEEE_FP)
2533 	    {
2534 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2535 	      code = EQ;
2536 	    }
2537 	  else
2538 	    {
2539 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2540 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2541 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2542 	      cmp_mode = CCmode;
2543 	      code = GEU;
2544 	    }
2545 	  break;
2546 	case LT:
2547 	case UNLT:
2548 	  if (code == LT && TARGET_IEEE_FP)
2549 	    {
2550 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2551 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2552 	      cmp_mode = CCmode;
2553 	      code = EQ;
2554 	    }
2555 	  else
2556 	    {
2557 	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2558 	      code = NE;
2559 	    }
2560 	  break;
2561 	case GE:
2562 	case UNGE:
2563 	  if (code == GE || !TARGET_IEEE_FP)
2564 	    {
2565 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2566 	      code = EQ;
2567 	    }
2568 	  else
2569 	    {
2570 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2571 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2572 	      code = NE;
2573 	    }
2574 	  break;
2575 	case LE:
2576 	case UNLE:
2577 	  if (code == LE && TARGET_IEEE_FP)
2578 	    {
2579 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2580 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2581 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2582 	      cmp_mode = CCmode;
2583 	      code = LTU;
2584 	    }
2585 	  else
2586 	    {
2587 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2588 	      code = NE;
2589 	    }
2590 	  break;
2591 	case EQ:
2592 	case UNEQ:
2593 	  if (code == EQ && TARGET_IEEE_FP)
2594 	    {
2595 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2596 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2597 	      cmp_mode = CCmode;
2598 	      code = EQ;
2599 	    }
2600 	  else
2601 	    {
2602 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2603 	      code = NE;
2604 	    }
2605 	  break;
2606 	case NE:
2607 	case LTGT:
2608 	  if (code == NE && TARGET_IEEE_FP)
2609 	    {
2610 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2611 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2612 					     GEN_INT (0x40)));
2613 	      code = NE;
2614 	    }
2615 	  else
2616 	    {
2617 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2618 	      code = EQ;
2619 	    }
2620 	  break;
2621 
2622 	case UNORDERED:
2623 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2624 	  code = NE;
2625 	  break;
2626 	case ORDERED:
2627 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2628 	  code = EQ;
2629 	  break;
2630 
2631 	default:
2632 	  gcc_unreachable ();
2633 	}
2634 	break;
2635 
2636     default:
2637       gcc_unreachable ();
2638     }
2639 
2640   /* Return the test that should be put into the flags user, i.e.
2641      the bcc, scc, or cmov instruction.  */
2642   return gen_rtx_fmt_ee (code, VOIDmode,
2643 			 gen_rtx_REG (cmp_mode, FLAGS_REG),
2644 			 const0_rtx);
2645 }
2646 
2647 /* Generate insn patterns to do an integer compare of OPERANDS.  */
2648 
2649 static rtx
2650 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2651 {
2652   machine_mode cmpmode;
2653   rtx tmp, flags;
2654 
2655   cmpmode = SELECT_CC_MODE (code, op0, op1);
2656   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2657 
2658   /* This is very simple, but making the interface the same as in the
2659      FP case makes the rest of the code easier.  */
2660   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2661   emit_insn (gen_rtx_SET (flags, tmp));
2662 
2663   /* Return the test that should be put into the flags user, i.e.
2664      the bcc, scc, or cmov instruction.  */
2665   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2666 }
2667 
2668 static rtx
2669 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2670 {
2671   rtx ret;
2672 
2673   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2674     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2675 
2676   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2677     {
2678       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2679       ret = ix86_expand_fp_compare (code, op0, op1);
2680     }
2681   else
2682     ret = ix86_expand_int_compare (code, op0, op1);
2683 
2684   return ret;
2685 }
2686 
2687 void
2688 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2689 {
2690   rtx ret;
2691 
2692   gcc_assert (GET_MODE (dest) == QImode);
2693 
2694   ret = ix86_expand_compare (code, op0, op1);
2695   PUT_MODE (ret, QImode);
2696   emit_insn (gen_rtx_SET (dest, ret));
2697 }
2698 
2699 /* Expand comparison setting or clearing carry flag.  Return true when
2700    successful and set pop for the operation.  */
2701 static bool
2702 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2703 {
2704   machine_mode mode
2705     = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2706 
2707   /* Do not handle double-mode compares that go through special path.  */
2708   if (mode == (TARGET_64BIT ? TImode : DImode))
2709     return false;
2710 
2711   if (SCALAR_FLOAT_MODE_P (mode))
2712     {
2713       rtx compare_op;
2714       rtx_insn *compare_seq;
2715 
2716       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2717       /* Shortcut:  the following common codes never translate
2718 	 into carry-flag compares.  */
2719 	 into carry flag compares.  */
2720       if (code == EQ || code == NE || code == UNEQ || code == LTGT
2721 	  || code == ORDERED || code == UNORDERED)
2722 	return false;
2723 
2724       /* These comparisons require zero flag; swap operands so they won't.  */
2725       if ((code == GT || code == UNLE || code == LE || code == UNGT)
2726 	  && !TARGET_IEEE_FP)
2727 	{
2728 	  std::swap (op0, op1);
2729 	  code = swap_condition (code);
2730 	}
2731 
2732       /* Try to expand the comparison and verify that we end up with
2733 	 a carry-flag-based comparison.  This fails only when we decide
2734 	 to expand the comparison using arithmetic, which is not a
2735 	 common scenario.  */
2736       start_sequence ();
2737       compare_op = ix86_expand_fp_compare (code, op0, op1);
2738       compare_seq = get_insns ();
2739       end_sequence ();
2740 
2741       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2742         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2743       else
2744 	code = GET_CODE (compare_op);
2745 
2746       if (code != LTU && code != GEU)
2747 	return false;
2748 
2749       emit_insn (compare_seq);
2750       *pop = compare_op;
2751       return true;
2752     }
2753 
2754   if (!INTEGRAL_MODE_P (mode))
2755     return false;
2756 
2757   switch (code)
2758     {
2759     case LTU:
2760     case GEU:
2761       break;
2762 
2763     /* Convert a==0 into (unsigned)a<1.  */
2764     case EQ:
2765     case NE:
2766       if (op1 != const0_rtx)
2767 	return false;
2768       op1 = const1_rtx;
2769       code = (code == EQ ? LTU : GEU);
2770       break;
2771 
2772     /* Convert a>b into b<a or a>=b-1.  */
2773     case GTU:
2774     case LEU:
2775       if (CONST_INT_P (op1))
2776 	{
2777 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2778 	  /* Bail out on overflow.  We can still swap the operands, but that
2779 	     would force loading the constant into a register.  */
2780 	  if (op1 == const0_rtx
2781 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2782 	    return false;
2783 	  code = (code == GTU ? GEU : LTU);
2784 	}
2785       else
2786 	{
2787 	  std::swap (op0, op1);
2788 	  code = (code == GTU ? LTU : GEU);
2789 	}
2790       break;
2791 
2792     /* Convert a>=0 into (unsigned)a<0x80000000.  */
2793     case LT:
2794     case GE:
2795       if (mode == DImode || op1 != const0_rtx)
2796 	return false;
2797       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2798       code = (code == LT ? GEU : LTU);
2799       break;
2800     case LE:
2801     case GT:
2802       if (mode == DImode || op1 != constm1_rtx)
2803 	return false;
2804       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2805       code = (code == LE ? GEU : LTU);
2806       break;
2807 
2808     default:
2809       return false;
2810     }
2811   /* Swapping operands may cause constant to appear as first operand.  */
2812   if (!nonimmediate_operand (op0, VOIDmode))
2813     {
2814       if (!can_create_pseudo_p ())
2815 	return false;
2816       op0 = force_reg (mode, op0);
2817     }
2818   *pop = ix86_expand_compare (code, op0, op1);
2819   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2820   return true;
2821 }
2822 
2823 /* Expand conditional increment or decrement using adc/sbb instructions.
2824    The default case using setcc followed by the conditional move can be
2825    done by generic code.  */
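/* E.g. "x = y + (a < b)" with unsigned operands becomes roughly
   "cmp a, b" followed by an adc of zero into the register holding y, so the
   carry produced by the compare supplies the conditional increment.  */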
2826 bool
2827 ix86_expand_int_addcc (rtx operands[])
2828 {
2829   enum rtx_code code = GET_CODE (operands[1]);
2830   rtx flags;
2831   rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2832   rtx compare_op;
2833   rtx val = const0_rtx;
2834   bool fpcmp = false;
2835   machine_mode mode;
2836   rtx op0 = XEXP (operands[1], 0);
2837   rtx op1 = XEXP (operands[1], 1);
2838 
2839   if (operands[3] != const1_rtx
2840       && operands[3] != constm1_rtx)
2841     return false;
2842   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2843      return false;
2844   code = GET_CODE (compare_op);
2845 
2846   flags = XEXP (compare_op, 0);
2847 
2848   if (GET_MODE (flags) == CCFPmode)
2849     {
2850       fpcmp = true;
2851       code = ix86_fp_compare_code_to_integer (code);
2852     }
2853 
2854   if (code != LTU)
2855     {
2856       val = constm1_rtx;
2857       if (fpcmp)
2858 	PUT_CODE (compare_op,
2859 		  reverse_condition_maybe_unordered
2860 		    (GET_CODE (compare_op)));
2861       else
2862 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2863     }
2864 
2865   mode = GET_MODE (operands[0]);
2866 
2867   /* Construct either adc or sbb insn.  */
2868   if ((code == LTU) == (operands[3] == constm1_rtx))
2869     insn = gen_sub3_carry;
2870   else
2871     insn = gen_add3_carry;
2872 
2873   emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2874 
2875   return true;
2876 }
2877 
2878 bool
2879 ix86_expand_int_movcc (rtx operands[])
2880 {
2881   enum rtx_code code = GET_CODE (operands[1]), compare_code;
2882   rtx_insn *compare_seq;
2883   rtx compare_op;
2884   machine_mode mode = GET_MODE (operands[0]);
2885   bool sign_bit_compare_p = false;
2886   rtx op0 = XEXP (operands[1], 0);
2887   rtx op1 = XEXP (operands[1], 1);
2888 
2889   if (GET_MODE (op0) == TImode
2890       || (GET_MODE (op0) == DImode
2891 	  && !TARGET_64BIT))
2892     return false;
2893 
2894   start_sequence ();
2895   compare_op = ix86_expand_compare (code, op0, op1);
2896   compare_seq = get_insns ();
2897   end_sequence ();
2898 
2899   compare_code = GET_CODE (compare_op);
2900 
2901   if ((op1 == const0_rtx && (code == GE || code == LT))
2902       || (op1 == constm1_rtx && (code == GT || code == LE)))
2903     sign_bit_compare_p = true;
2904 
2905   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2906      HImode insns, we'd be swallowed in word prefix ops.  */
2907 
2908   if ((mode != HImode || TARGET_FAST_PREFIX)
2909       && (mode != (TARGET_64BIT ? TImode : DImode))
2910       && CONST_INT_P (operands[2])
2911       && CONST_INT_P (operands[3]))
2912     {
2913       rtx out = operands[0];
2914       HOST_WIDE_INT ct = INTVAL (operands[2]);
2915       HOST_WIDE_INT cf = INTVAL (operands[3]);
2916       HOST_WIDE_INT diff;
2917 
2918       diff = ct - cf;
2919       /* Sign bit compares are better done using shifts than by using
2920 	  sbb.  */
2921       if (sign_bit_compare_p
2922 	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2923 	{
2924 	  /* Detect overlap between destination and compare sources.  */
2925 	  rtx tmp = out;
2926 
2927           if (!sign_bit_compare_p)
2928 	    {
2929 	      rtx flags;
2930 	      bool fpcmp = false;
2931 
2932 	      compare_code = GET_CODE (compare_op);
2933 
2934 	      flags = XEXP (compare_op, 0);
2935 
2936 	      if (GET_MODE (flags) == CCFPmode)
2937 		{
2938 		  fpcmp = true;
2939 		  compare_code
2940 		    = ix86_fp_compare_code_to_integer (compare_code);
2941 		}
2942 
2943 	      /* To simplify rest of code, restrict to the GEU case.  */
2944 	      if (compare_code == LTU)
2945 		{
2946 		  std::swap (ct, cf);
2947 		  compare_code = reverse_condition (compare_code);
2948 		  code = reverse_condition (code);
2949 		}
2950 	      else
2951 		{
2952 		  if (fpcmp)
2953 		    PUT_CODE (compare_op,
2954 			      reverse_condition_maybe_unordered
2955 			        (GET_CODE (compare_op)));
2956 		  else
2957 		    PUT_CODE (compare_op,
2958 			      reverse_condition (GET_CODE (compare_op)));
2959 		}
2960 	      diff = ct - cf;
2961 
2962 	      if (reg_overlap_mentioned_p (out, op0)
2963 		  || reg_overlap_mentioned_p (out, op1))
2964 		tmp = gen_reg_rtx (mode);
2965 
2966 	      if (mode == DImode)
2967 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2968 	      else
2969 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
2970 						 flags, compare_op));
2971 	    }
2972 	  else
2973 	    {
2974 	      if (code == GT || code == GE)
2975 		code = reverse_condition (code);
2976 	      else
2977 		{
2978 		  std::swap (ct, cf);
2979 		  diff = ct - cf;
2980 		}
2981 	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2982 	    }
2983 
2984 	  if (diff == 1)
2985 	    {
2986 	      /*
2987 	       * cmpl op0,op1
2988 	       * sbbl dest,dest
2989 	       * [addl dest, ct]
2990 	       *
2991 	       * Size 5 - 8.
2992 	       */
2993 	      if (ct)
2994 		tmp = expand_simple_binop (mode, PLUS,
2995 					   tmp, GEN_INT (ct),
2996 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
2997 	    }
2998 	  else if (cf == -1)
2999 	    {
3000 	      /*
3001 	       * cmpl op0,op1
3002 	       * sbbl dest,dest
3003 	       * orl $ct, dest
3004 	       *
3005 	       * Size 8.
3006 	       */
3007 	      tmp = expand_simple_binop (mode, IOR,
3008 					 tmp, GEN_INT (ct),
3009 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3010 	    }
3011 	  else if (diff == -1 && ct)
3012 	    {
3013 	      /*
3014 	       * cmpl op0,op1
3015 	       * sbbl dest,dest
3016 	       * notl dest
3017 	       * [addl dest, cf]
3018 	       *
3019 	       * Size 8 - 11.
3020 	       */
3021 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3022 	      if (cf)
3023 		tmp = expand_simple_binop (mode, PLUS,
3024 					   copy_rtx (tmp), GEN_INT (cf),
3025 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3026 	    }
3027 	  else
3028 	    {
3029 	      /*
3030 	       * cmpl op0,op1
3031 	       * sbbl dest,dest
3032 	       * [notl dest]
3033 	       * andl cf - ct, dest
3034 	       * [addl dest, ct]
3035 	       *
3036 	       * Size 8 - 11.
3037 	       */
3038 
3039 	      if (cf == 0)
3040 		{
3041 		  cf = ct;
3042 		  ct = 0;
3043 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3044 		}
3045 
3046 	      tmp = expand_simple_binop (mode, AND,
3047 					 copy_rtx (tmp),
3048 					 gen_int_mode (cf - ct, mode),
3049 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3050 	      if (ct)
3051 		tmp = expand_simple_binop (mode, PLUS,
3052 					   copy_rtx (tmp), GEN_INT (ct),
3053 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3054 	    }
3055 
3056 	  if (!rtx_equal_p (tmp, out))
3057 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3058 
3059 	  return true;
3060 	}
3061 
3062       if (diff < 0)
3063 	{
3064 	  machine_mode cmp_mode = GET_MODE (op0);
3065 	  enum rtx_code new_code;
3066 
3067 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
3068 	    {
3069 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3070 
3071 	      /* We may be reversing a non-trapping
3072 		 comparison to a trapping comparison.  */
3073 	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
3074 		  && code != EQ && code != NE
3075 		  && code != ORDERED && code != UNORDERED)
3076 		new_code = UNKNOWN;
3077 	      else
3078 		new_code = reverse_condition_maybe_unordered (code);
3079 	    }
3080 	  else
3081 	    new_code = ix86_reverse_condition (code, cmp_mode);
3082 	  if (new_code != UNKNOWN)
3083 	    {
3084 	      std::swap (ct, cf);
3085 	      diff = -diff;
3086 	      code = new_code;
3087 	    }
3088 	}
3089 
3090       compare_code = UNKNOWN;
3091       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3092 	  && CONST_INT_P (op1))
3093 	{
3094 	  if (op1 == const0_rtx
3095 	      && (code == LT || code == GE))
3096 	    compare_code = code;
3097 	  else if (op1 == constm1_rtx)
3098 	    {
3099 	      if (code == LE)
3100 		compare_code = LT;
3101 	      else if (code == GT)
3102 		compare_code = GE;
3103 	    }
3104 	}
3105 
3106       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
3107       if (compare_code != UNKNOWN
3108 	  && GET_MODE (op0) == GET_MODE (out)
3109 	  && (cf == -1 || ct == -1))
3110 	{
3111 	  /* If lea code below could be used, only optimize
3112 	     if it results in a 2 insn sequence.  */
3113 
3114 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3115 		 || diff == 3 || diff == 5 || diff == 9)
3116 	      || (compare_code == LT && ct == -1)
3117 	      || (compare_code == GE && cf == -1))
3118 	    {
3119 	      /*
3120 	       * notl op1	(if necessary)
3121 	       * sarl $31, op1
3122 	       * orl cf, op1
3123 	       */
3124 	      if (ct != -1)
3125 		{
3126 		  cf = ct;
3127 		  ct = -1;
3128 		  code = reverse_condition (code);
3129 		}
3130 
3131 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3132 
3133 	      out = expand_simple_binop (mode, IOR,
3134 					 out, GEN_INT (cf),
3135 					 out, 1, OPTAB_DIRECT);
3136 	      if (out != operands[0])
3137 		emit_move_insn (operands[0], out);
3138 
3139 	      return true;
3140 	    }
3141 	}
3142 
3143 
3144       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3145 	   || diff == 3 || diff == 5 || diff == 9)
3146 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3147 	  && (mode != DImode
3148 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3149 	{
3150 	  /*
3151 	   * xorl dest,dest
3152 	   * cmpl op1,op2
3153 	   * setcc dest
3154 	   * lea cf(dest*(ct-cf)),dest
3155 	   *
3156 	   * Size 14.
3157 	   *
3158 	   * This also catches the degenerate setcc-only case.
3159 	   */
3160 
3161 	  rtx tmp;
3162 	  int nops;
3163 
3164 	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3165 
3166 	  nops = 0;
3167 	  /* On x86_64 the lea instruction operates on Pmode, so we need
3168 	     the arithmetic done in the proper mode to match.  */
3169 	  if (diff == 1)
3170 	    tmp = copy_rtx (out);
3171 	  else
3172 	    {
3173 	      rtx out1;
3174 	      out1 = copy_rtx (out);
3175 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3176 	      nops++;
3177 	      if (diff & 1)
3178 		{
3179 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
3180 		  nops++;
3181 		}
3182 	    }
3183 	  if (cf != 0)
3184 	    {
3185 	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
3186 	      nops++;
3187 	    }
3188 	  if (!rtx_equal_p (tmp, out))
3189 	    {
3190 	      if (nops == 1)
3191 		out = force_operand (tmp, copy_rtx (out));
3192 	      else
3193 		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3194 	    }
3195 	  if (!rtx_equal_p (out, operands[0]))
3196 	    emit_move_insn (operands[0], copy_rtx (out));
3197 
3198 	  return true;
3199 	}
3200 
3201       /*
3202        * General case:			Jumpful:
3203        *   xorl dest,dest		cmpl op1, op2
3204        *   cmpl op1, op2		movl ct, dest
3205        *   setcc dest			jcc 1f
3206        *   decl dest			movl cf, dest
3207        *   andl (cf-ct),dest		1:
3208        *   addl ct,dest
3209        *
3210        * Size 20.			Size 14.
3211        *
3212        * This is reasonably steep, but branch mispredict costs are
3213        * high on modern cpus, so consider failing only if optimizing
3214        * for space.
3215        */
3216 
3217       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3218 	  && BRANCH_COST (optimize_insn_for_speed_p (),
3219 		  	  false) >= 2)
3220 	{
3221 	  if (cf == 0)
3222 	    {
3223 	      machine_mode cmp_mode = GET_MODE (op0);
3224 	      enum rtx_code new_code;
3225 
3226 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
3227 		{
3228 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3229 
3230 		  /* We may be reversing a non-trapping
3231 		     comparison to a trapping comparison.  */
3232 		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
3233 		      && code != EQ && code != NE
3234 		      && code != ORDERED && code != UNORDERED)
3235 		    new_code = UNKNOWN;
3236 		  else
3237 		    new_code = reverse_condition_maybe_unordered (code);
3238 
3239 		}
3240 	      else
3241 		{
3242 		  new_code = ix86_reverse_condition (code, cmp_mode);
3243 		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
3244 		    compare_code = reverse_condition (compare_code);
3245 		}
3246 
3247 	      if (new_code != UNKNOWN)
3248 		{
3249 		  cf = ct;
3250 		  ct = 0;
3251 		  code = new_code;
3252 		}
3253 	    }
3254 
3255 	  if (compare_code != UNKNOWN)
3256 	    {
3257 	      /* notl op1	(if needed)
3258 		 sarl $31, op1
3259 		 andl (cf-ct), op1
3260 		 addl ct, op1
3261 
3262 		 For x < 0 (resp. x <= -1) there will be no notl,
3263 		 so if possible swap the constants to get rid of the
3264 		 complement.
3265 		 True/false will be -1/0 while code below (store flag
3266 		 followed by decrement) is 0/-1, so the constants need
3267 		 to be exchanged once more.  */
3268 
3269 	      if (compare_code == GE || !cf)
3270 		{
3271 		  code = reverse_condition (code);
3272 		  compare_code = LT;
3273 		}
3274 	      else
3275 		std::swap (ct, cf);
3276 
3277 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3278 	    }
3279 	  else
3280 	    {
3281 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3282 
3283 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3284 					 constm1_rtx,
3285 					 copy_rtx (out), 1, OPTAB_DIRECT);
3286 	    }
3287 
3288 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
3289 				     gen_int_mode (cf - ct, mode),
3290 				     copy_rtx (out), 1, OPTAB_DIRECT);
3291 	  if (ct)
3292 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3293 				       copy_rtx (out), 1, OPTAB_DIRECT);
3294 	  if (!rtx_equal_p (out, operands[0]))
3295 	    emit_move_insn (operands[0], copy_rtx (out));
3296 
3297 	  return true;
3298 	}
3299     }
3300 
3301   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3302     {
3303       /* Try a few things more with specific constants and a variable.  */
3304 
3305       optab op;
3306       rtx var, orig_out, out, tmp;
3307 
3308       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3309 	return false;
3310 
3311       /* If one of the two operands is an interesting constant, load a
3312 	 constant with the above and mask it in with a logical operation.  */
3313 
3314       if (CONST_INT_P (operands[2]))
3315 	{
3316 	  var = operands[3];
3317 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3318 	    operands[3] = constm1_rtx, op = and_optab;
3319 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3320 	    operands[3] = const0_rtx, op = ior_optab;
3321 	  else
3322 	    return false;
3323 	}
3324       else if (CONST_INT_P (operands[3]))
3325 	{
3326 	  var = operands[2];
3327 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3328 	    operands[2] = constm1_rtx, op = and_optab;
3329 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3330 	    operands[2] = const0_rtx, op = ior_optab;
3331 	  else
3332 	    return false;
3333 	}
3334       else
3335         return false;
3336 
3337       orig_out = operands[0];
3338       tmp = gen_reg_rtx (mode);
3339       operands[0] = tmp;
3340 
3341       /* Recurse to get the constant loaded.  */
3342       if (!ix86_expand_int_movcc (operands))
3343         return false;
3344 
3345       /* Mask in the interesting variable.  */
3346       out = expand_binop (mode, op, var, tmp, orig_out, 0,
3347 			  OPTAB_WIDEN);
3348       if (!rtx_equal_p (out, orig_out))
3349 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3350 
3351       return true;
3352     }
3353 
3354   /*
3355    * For comparison with above,
3356    *
3357    * movl cf,dest
3358    * movl ct,tmp
3359    * cmpl op1,op2
3360    * cmovcc tmp,dest
3361    *
3362    * Size 15.
3363    */
3364 
3365   if (! nonimmediate_operand (operands[2], mode))
3366     operands[2] = force_reg (mode, operands[2]);
3367   if (! nonimmediate_operand (operands[3], mode))
3368     operands[3] = force_reg (mode, operands[3]);
3369 
3370   if (! register_operand (operands[2], VOIDmode)
3371       && (mode == QImode
3372           || ! register_operand (operands[3], VOIDmode)))
3373     operands[2] = force_reg (mode, operands[2]);
3374 
3375   if (mode == QImode
3376       && ! register_operand (operands[3], VOIDmode))
3377     operands[3] = force_reg (mode, operands[3]);
3378 
3379   emit_insn (compare_seq);
3380   emit_insn (gen_rtx_SET (operands[0],
3381 			  gen_rtx_IF_THEN_ELSE (mode,
3382 						compare_op, operands[2],
3383 						operands[3])));
3384   return true;
3385 }
3386 
3387 /* Detect conditional moves that exactly match min/max operational
3388    semantics.  Note that this is IEEE safe, as long as we don't
3389    interchange the operands.
3390 
3391    Returns FALSE if this conditional move doesn't match a MIN/MAX,
3392    and TRUE if the operation is successful and instructions are emitted.  */
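/* SSE min/max return the second operand when the operands are unordered or
   compare equal (e.g. -0.0 vs 0.0), so "a < b ? a : b" maps directly onto
   the min pattern with the original operand order.  The UNSPEC form below
   is used when NaNs or signed zeros must be honored, which keeps later
   passes from commuting the operands.  */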
3393 
3394 static bool
3395 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3396 			   rtx cmp_op1, rtx if_true, rtx if_false)
3397 {
3398   machine_mode mode;
3399   bool is_min;
3400   rtx tmp;
3401 
3402   if (code == LT)
3403     ;
3404   else if (code == UNGE)
3405     std::swap (if_true, if_false);
3406   else
3407     return false;
3408 
3409   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3410     is_min = true;
3411   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3412     is_min = false;
3413   else
3414     return false;
3415 
3416   mode = GET_MODE (dest);
3417 
3418   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3419      but MODE may be a vector mode and thus not appropriate.  */
3420   if (!flag_finite_math_only || flag_signed_zeros)
3421     {
3422       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3423       rtvec v;
3424 
3425       if_true = force_reg (mode, if_true);
3426       v = gen_rtvec (2, if_true, if_false);
3427       tmp = gen_rtx_UNSPEC (mode, v, u);
3428     }
3429   else
3430     {
3431       code = is_min ? SMIN : SMAX;
3432       if (MEM_P (if_true) && MEM_P (if_false))
3433 	if_true = force_reg (mode, if_true);
3434       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3435     }
3436 
3437   emit_insn (gen_rtx_SET (dest, tmp));
3438   return true;
3439 }
3440 
3441 /* Return true if MODE is valid for a vector comparison into a mask register.
3442    The same holds for a conditional vector move with a mask register.  */
3443 static bool
3444 ix86_valid_mask_cmp_mode (machine_mode mode)
3445 {
3446   /* XOP has its own vector conditional movement.  */
3447   if (TARGET_XOP && !TARGET_AVX512F)
3448     return false;
3449 
3450   /* AVX512F is needed for mask operation.  */
3451   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3452     return false;
3453 
3454   /* AVX512BW is needed for vector QI/HImode,
3455      AVX512VL is needed for 128/256-bit vector.  */
3456   machine_mode inner_mode = GET_MODE_INNER (mode);
3457   int vector_size = GET_MODE_SIZE (mode);
3458   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3459     return false;
3460 
3461   return vector_size == 64 || TARGET_AVX512VL;
3462 }
3463 
3464 /* Expand an SSE comparison.  Return the register with the result.  */
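/* With AVX512 mask-register modes the result is a k-mask with one bit per
   element; otherwise it is a full-width vector with each lane set to all
   ones or all zeros.  */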
3465 
3466 static rtx
3467 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3468 		     rtx op_true, rtx op_false)
3469 {
3470   machine_mode mode = GET_MODE (dest);
3471   machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3472 
3473   /* In the general case the comparison result can differ from the operands' type.  */
3474   machine_mode cmp_mode;
3475 
3476   /* In AVX512F the result of comparison is an integer mask.  */
3477   bool maskcmp = false;
3478   rtx x;
3479 
3480   if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3481     {
3482       unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3483       maskcmp = true;
3484       cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3485     }
3486   else
3487     cmp_mode = cmp_ops_mode;
3488 
3489   cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3490 
3491   int (*op1_predicate)(rtx, machine_mode)
3492     = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3493 
3494   if (!op1_predicate (cmp_op1, cmp_ops_mode))
3495     cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3496 
3497   if (optimize
3498       || (maskcmp && cmp_mode != mode)
3499       || (op_true && reg_overlap_mentioned_p (dest, op_true))
3500       || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3501     dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3502 
3503   if (maskcmp)
3504     {
3505       bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3506       gcc_assert (ok);
3507       return dest;
3508     }
3509 
3510   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3511 
3512   if (cmp_mode != mode && !maskcmp)
3513     {
3514       x = force_reg (cmp_ops_mode, x);
3515       convert_move (dest, x, false);
3516     }
3517   else
3518     emit_insn (gen_rtx_SET (dest, x));
3519 
3520   return dest;
3521 }
3522 
3523 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3524    operations.  This is used for both scalar and vector conditional moves.  */
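/* With a full-width vector mask CMP the generic expansion is
   dest = (cmp & op_true) | (~cmp & op_false); the cases below shortcut
   constant-zero or all-ones arms, use mask-register moves on AVX512, and
   prefer blend instructions where available.  */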
3525 
3526 void
3527 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3528 {
3529   machine_mode mode = GET_MODE (dest);
3530   machine_mode cmpmode = GET_MODE (cmp);
3531 
3532   /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
3533   if (rtx_equal_p (op_true, op_false))
3534     {
3535       emit_move_insn (dest, op_true);
3536       return;
3537     }
3538 
3539   /* In AVX512F the result of comparison is an integer mask.  */
3540   bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3541 
3542   rtx t2, t3, x;
3543 
3544   /* If we have an integer mask and FP value then we need
3545      to cast mask to FP mode.  */
3546   if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3547     {
3548       cmp = force_reg (cmpmode, cmp);
3549       cmp = gen_rtx_SUBREG (mode, cmp, 0);
3550     }
3551 
3552   if (maskcmp)
3553     {
3554       /* Using vector move with mask register.  */
3555       cmp = force_reg (cmpmode, cmp);
3556       /* Optimize for mask zero.  */
3557       op_true = (op_true != CONST0_RTX (mode)
3558 		 ? force_reg (mode, op_true) : op_true);
3559       op_false = (op_false != CONST0_RTX (mode)
3560 		  ? force_reg (mode, op_false) : op_false);
3561       if (op_true == CONST0_RTX (mode))
3562 	{
3563 	  rtx (*gen_not) (rtx, rtx);
3564 	  switch (cmpmode)
3565 	    {
3566 	    case E_QImode: gen_not = gen_knotqi; break;
3567 	    case E_HImode: gen_not = gen_knothi; break;
3568 	    case E_SImode: gen_not = gen_knotsi; break;
3569 	    case E_DImode: gen_not = gen_knotdi; break;
3570 	    default: gcc_unreachable ();
3571 	    }
3572 	  rtx n = gen_reg_rtx (cmpmode);
3573 	  emit_insn (gen_not (n, cmp));
3574 	  cmp = n;
3575 	  /* Reverse op_true and op_false.  */
3576 	  std::swap (op_true, op_false);
3577 	}
3578 
3579       rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3580       emit_insn (gen_rtx_SET (dest, vec_merge));
3581       return;
3582     }
3583   else if (vector_all_ones_operand (op_true, mode)
3584 	   && op_false == CONST0_RTX (mode))
3585     {
3586       emit_insn (gen_rtx_SET (dest, cmp));
3587       return;
3588     }
3589   else if (op_false == CONST0_RTX (mode))
3590     {
3591       op_true = force_reg (mode, op_true);
3592       x = gen_rtx_AND (mode, cmp, op_true);
3593       emit_insn (gen_rtx_SET (dest, x));
3594       return;
3595     }
3596   else if (op_true == CONST0_RTX (mode))
3597     {
3598       op_false = force_reg (mode, op_false);
3599       x = gen_rtx_NOT (mode, cmp);
3600       x = gen_rtx_AND (mode, x, op_false);
3601       emit_insn (gen_rtx_SET (dest, x));
3602       return;
3603     }
3604   else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3605     {
3606       op_false = force_reg (mode, op_false);
3607       x = gen_rtx_IOR (mode, cmp, op_false);
3608       emit_insn (gen_rtx_SET (dest, x));
3609       return;
3610     }
3611   else if (TARGET_XOP)
3612     {
3613       op_true = force_reg (mode, op_true);
3614 
3615       if (!nonimmediate_operand (op_false, mode))
3616 	op_false = force_reg (mode, op_false);
3617 
3618       emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3619 							  op_true,
3620 							  op_false)));
3621       return;
3622     }
3623 
3624   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3625   rtx d = dest;
3626 
3627   if (!vector_operand (op_true, mode))
3628     op_true = force_reg (mode, op_true);
3629 
3630   op_false = force_reg (mode, op_false);
3631 
3632   switch (mode)
3633     {
3634     case E_V4SFmode:
3635       if (TARGET_SSE4_1)
3636 	gen = gen_sse4_1_blendvps;
3637       break;
3638     case E_V2DFmode:
3639       if (TARGET_SSE4_1)
3640 	gen = gen_sse4_1_blendvpd;
3641       break;
3642     case E_SFmode:
3643       if (TARGET_SSE4_1)
3644 	{
3645 	  gen = gen_sse4_1_blendvss;
3646 	  op_true = force_reg (mode, op_true);
3647 	}
3648       break;
3649     case E_DFmode:
3650       if (TARGET_SSE4_1)
3651 	{
3652 	  gen = gen_sse4_1_blendvsd;
3653 	  op_true = force_reg (mode, op_true);
3654 	}
3655       break;
3656     case E_V16QImode:
3657     case E_V8HImode:
3658     case E_V4SImode:
3659     case E_V2DImode:
3660       if (TARGET_SSE4_1)
3661 	{
3662 	  gen = gen_sse4_1_pblendvb;
3663 	  if (mode != V16QImode)
3664 	    d = gen_reg_rtx (V16QImode);
3665 	  op_false = gen_lowpart (V16QImode, op_false);
3666 	  op_true = gen_lowpart (V16QImode, op_true);
3667 	  cmp = gen_lowpart (V16QImode, cmp);
3668 	}
3669       break;
3670     case E_V8SFmode:
3671       if (TARGET_AVX)
3672 	gen = gen_avx_blendvps256;
3673       break;
3674     case E_V4DFmode:
3675       if (TARGET_AVX)
3676 	gen = gen_avx_blendvpd256;
3677       break;
3678     case E_V32QImode:
3679     case E_V16HImode:
3680     case E_V8SImode:
3681     case E_V4DImode:
3682       if (TARGET_AVX2)
3683 	{
3684 	  gen = gen_avx2_pblendvb;
3685 	  if (mode != V32QImode)
3686 	    d = gen_reg_rtx (V32QImode);
3687 	  op_false = gen_lowpart (V32QImode, op_false);
3688 	  op_true = gen_lowpart (V32QImode, op_true);
3689 	  cmp = gen_lowpart (V32QImode, cmp);
3690 	}
3691       break;
3692 
3693     case E_V64QImode:
3694       gen = gen_avx512bw_blendmv64qi;
3695       break;
3696     case E_V32HImode:
3697       gen = gen_avx512bw_blendmv32hi;
3698       break;
3699     case E_V16SImode:
3700       gen = gen_avx512f_blendmv16si;
3701       break;
3702     case E_V8DImode:
3703       gen = gen_avx512f_blendmv8di;
3704       break;
3705     case E_V8DFmode:
3706       gen = gen_avx512f_blendmv8df;
3707       break;
3708     case E_V16SFmode:
3709       gen = gen_avx512f_blendmv16sf;
3710       break;
3711 
3712     default:
3713       break;
3714     }
3715 
3716   if (gen != NULL)
3717     {
3718       emit_insn (gen (d, op_false, op_true, cmp));
3719       if (d != dest)
3720 	emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3721     }
3722   else
3723     {
3724       op_true = force_reg (mode, op_true);
3725 
3726       t2 = gen_reg_rtx (mode);
3727       if (optimize)
3728 	t3 = gen_reg_rtx (mode);
3729       else
3730 	t3 = dest;
3731 
3732       x = gen_rtx_AND (mode, op_true, cmp);
3733       emit_insn (gen_rtx_SET (t2, x));
3734 
3735       x = gen_rtx_NOT (mode, cmp);
3736       x = gen_rtx_AND (mode, x, op_false);
3737       emit_insn (gen_rtx_SET (t3, x));
3738 
3739       x = gen_rtx_IOR (mode, t3, t2);
3740       emit_insn (gen_rtx_SET (dest, x));
3741     }
3742 }
3743 
3744 /* Swap, force into registers, or otherwise massage the two operands
3745    to an sse comparison with a mask result.  Thus we differ a bit from
3746    ix86_prepare_fp_compare_args which expects to produce a flags result.
3747 
3748    The DEST operand exists to help determine whether to commute commutative
3749    operators.  The POP0/POP1 operands are updated in place.  The new
3750    comparison code is returned, or UNKNOWN if not implementable.  */
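/* For example, without AVX a GE comparison (a >= b) is rewritten below as
   LE with the operands swapped (b <= a), since the pre-AVX compare
   instructions only provide the LT/LE/UNGT/UNGE style predicates
   directly.  */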
3751 
3752 static enum rtx_code
3753 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3754 				  rtx *pop0, rtx *pop1)
3755 {
3756   switch (code)
3757     {
3758     case LTGT:
3759     case UNEQ:
3760       /* AVX supports all the needed comparisons.  */
3761       if (TARGET_AVX)
3762 	break;
3763       /* We have no LTGT as an operator.  We could implement it with
3764 	 NE & ORDERED, but this requires an extra temporary.  It's
3765 	 not clear that it's worth it.  */
3766       return UNKNOWN;
3767 
3768     case LT:
3769     case LE:
3770     case UNGT:
3771     case UNGE:
3772       /* These are supported directly.  */
3773       break;
3774 
3775     case EQ:
3776     case NE:
3777     case UNORDERED:
3778     case ORDERED:
3779       /* AVX has 3 operand comparisons, no need to swap anything.  */
3780       if (TARGET_AVX)
3781 	break;
3782       /* For commutative operators, try to canonicalize the destination
3783 	 operand to be first in the comparison - this helps reload to
3784 	 avoid extra moves.  */
3785       if (!dest || !rtx_equal_p (dest, *pop1))
3786 	break;
3787       /* FALLTHRU */
3788 
3789     case GE:
3790     case GT:
3791     case UNLE:
3792     case UNLT:
3793       /* These are not supported directly before AVX, and furthermore
3794 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
3795 	 comparison operands to transform into something that is
3796 	 supported.  */
3797       std::swap (*pop0, *pop1);
3798       code = swap_condition (code);
3799       break;
3800 
3801     default:
3802       gcc_unreachable ();
3803     }
3804 
3805   return code;
3806 }
3807 
3808 /* Expand a floating-point conditional move.  Return true if successful.  */
3809 
3810 bool
3811 ix86_expand_fp_movcc (rtx operands[])
3812 {
3813   machine_mode mode = GET_MODE (operands[0]);
3814   enum rtx_code code = GET_CODE (operands[1]);
3815   rtx tmp, compare_op;
3816   rtx op0 = XEXP (operands[1], 0);
3817   rtx op1 = XEXP (operands[1], 1);
3818 
3819   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3820     {
3821       machine_mode cmode;
3822 
3823       /* Since we've no cmove for sse registers, don't force bad register
3824 	 allocation just to gain access to it.  Deny movcc when the
3825 	 comparison mode doesn't match the move mode.  */
3826       cmode = GET_MODE (op0);
3827       if (cmode == VOIDmode)
3828 	cmode = GET_MODE (op1);
3829       if (cmode != mode)
3830 	return false;
3831 
3832       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3833       if (code == UNKNOWN)
3834 	return false;
3835 
3836       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3837 				     operands[2], operands[3]))
3838 	return true;
3839 
3840       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3841 				 operands[2], operands[3]);
3842       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3843       return true;
3844     }
3845 
3846   if (GET_MODE (op0) == TImode
3847       || (GET_MODE (op0) == DImode
3848 	  && !TARGET_64BIT))
3849     return false;
3850 
3851   /* The floating point conditional move instructions don't directly
3852      support conditions resulting from a signed integer comparison.  */
3853 
3854   compare_op = ix86_expand_compare (code, op0, op1);
3855   if (!fcmov_comparison_operator (compare_op, VOIDmode))
3856     {
3857       tmp = gen_reg_rtx (QImode);
3858       ix86_expand_setcc (tmp, code, op0, op1);
3859 
3860       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3861     }
3862 
3863   emit_insn (gen_rtx_SET (operands[0],
3864 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
3865 						operands[2], operands[3])));
3866 
3867   return true;
3868 }
3869 
3870 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
3871 
3872 static int
3873 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3874 {
3875   switch (code)
3876     {
3877     case EQ:
3878       return 0;
3879     case LT:
3880     case LTU:
3881       return 1;
3882     case LE:
3883     case LEU:
3884       return 2;
3885     case NE:
3886       return 4;
3887     case GE:
3888     case GEU:
3889       return 5;
3890     case GT:
3891     case GTU:
3892       return 6;
3893     default:
3894       gcc_unreachable ();
3895     }
3896 }
3897 
3898 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
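/* The immediates returned below follow the VCMPPS/VCMPPD predicate
   encoding (the _CMP_* constants), e.g. 0x00 is _CMP_EQ_OQ, 0x01 is
   _CMP_LT_OS and 0x03 is _CMP_UNORD_Q.  */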
3899 
3900 static int
3901 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3902 {
3903   switch (code)
3904     {
3905     case EQ:
3906       return 0x00;
3907     case NE:
3908       return 0x04;
3909     case GT:
3910       return 0x0e;
3911     case LE:
3912       return 0x02;
3913     case GE:
3914       return 0x0d;
3915     case LT:
3916       return 0x01;
3917     case UNLE:
3918       return 0x0a;
3919     case UNLT:
3920       return 0x09;
3921     case UNGE:
3922       return 0x05;
3923     case UNGT:
3924       return 0x06;
3925     case UNEQ:
3926       return 0x18;
3927     case LTGT:
3928       return 0x0c;
3929     case ORDERED:
3930       return 0x07;
3931     case UNORDERED:
3932       return 0x03;
3933     default:
3934       gcc_unreachable ();
3935     }
3936 }
3937 
3938 /* Return immediate value to be used in UNSPEC_PCMP
3939    for comparison CODE in MODE.  */
3940 
3941 static int
3942 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3943 {
3944   if (FLOAT_MODE_P (mode))
3945     return ix86_fp_cmp_code_to_pcmp_immediate (code);
3946   return ix86_int_cmp_code_to_pcmp_immediate (code);
3947 }
3948 
3949 /* Expand AVX-512 vector comparison.  */
3950 
3951 bool
3952 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3953 {
3954   machine_mode mask_mode = GET_MODE (dest);
3955   machine_mode cmp_mode = GET_MODE (cmp_op0);
3956   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3957   int unspec_code;
3958   rtx unspec;
3959 
3960   switch (code)
3961     {
3962     case LEU:
3963     case GTU:
3964     case GEU:
3965     case LTU:
3966       unspec_code = UNSPEC_UNSIGNED_PCMP;
3967       break;
3968 
3969     default:
3970       unspec_code = UNSPEC_PCMP;
3971     }
3972 
3973   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
3974 			   unspec_code);
3975   emit_insn (gen_rtx_SET (dest, unspec));
3976 
3977   return true;
3978 }
3979 
3980 /* Expand fp vector comparison.  */
3981 
3982 bool
3983 ix86_expand_fp_vec_cmp (rtx operands[])
3984 {
3985   enum rtx_code code = GET_CODE (operands[1]);
3986   rtx cmp;
3987 
3988   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3989 					   &operands[2], &operands[3]);
3990   if (code == UNKNOWN)
3991     {
3992       rtx temp;
3993       switch (GET_CODE (operands[1]))
3994 	{
3995 	case LTGT:
3996 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
3997 				      operands[3], NULL, NULL);
3998 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
3999 				     operands[3], NULL, NULL);
4000 	  code = AND;
4001 	  break;
4002 	case UNEQ:
4003 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4004 				      operands[3], NULL, NULL);
4005 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4006 				     operands[3], NULL, NULL);
4007 	  code = IOR;
4008 	  break;
4009 	default:
4010 	  gcc_unreachable ();
4011 	}
4012       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4013 				 OPTAB_DIRECT);
4014     }
4015   else
4016     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4017 			       operands[1], operands[2]);
4018 
4019   if (operands[0] != cmp)
4020     emit_move_insn (operands[0], cmp);
4021 
4022   return true;
4023 }
4024 
4025 static rtx
4026 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4027 			 rtx op_true, rtx op_false, bool *negate)
4028 {
4029   machine_mode data_mode = GET_MODE (dest);
4030   machine_mode mode = GET_MODE (cop0);
4031   rtx x;
4032 
4033   *negate = false;
4034 
4035   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
4036   if (TARGET_XOP
4037       && (mode == V16QImode || mode == V8HImode
4038 	  || mode == V4SImode || mode == V2DImode))
4039     ;
4040   /* AVX512F supports all of the comparisons
4041      on all 128/256/512-bit vector int types.  */
4042   else if (ix86_valid_mask_cmp_mode (mode))
4043     ;
4044   else
4045     {
4046       /* Canonicalize the comparison to EQ, GT, GTU.  */
4047       switch (code)
4048 	{
4049 	case EQ:
4050 	case GT:
4051 	case GTU:
4052 	  break;
4053 
4054 	case NE:
4055 	case LE:
4056 	case LEU:
4057 	  code = reverse_condition (code);
4058 	  *negate = true;
4059 	  break;
4060 
4061 	case GE:
4062 	case GEU:
4063 	  code = reverse_condition (code);
4064 	  *negate = true;
4065 	  /* FALLTHRU */
4066 
4067 	case LT:
4068 	case LTU:
4069 	  std::swap (cop0, cop1);
4070 	  code = swap_condition (code);
4071 	  break;
4072 
4073 	default:
4074 	  gcc_unreachable ();
4075 	}
4076 
4077       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
4078       if (mode == V2DImode)
4079 	{
4080 	  switch (code)
4081 	    {
4082 	    case EQ:
4083 	      /* SSE4.1 supports EQ.  */
4084 	      if (!TARGET_SSE4_1)
4085 		return NULL;
4086 	      break;
4087 
4088 	    case GT:
4089 	    case GTU:
4090 	      /* SSE4.2 supports GT/GTU.  */
4091 	      if (!TARGET_SSE4_2)
4092 		return NULL;
4093 	      break;
4094 
4095 	    default:
4096 	      gcc_unreachable ();
4097 	    }
4098 	}
4099 
4100       rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4101       rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4102       if (*negate)
4103 	std::swap (optrue, opfalse);
4104 
4105       /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4106 	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4107 	 min (x, y) == x).  While we add one instruction (the minimum),
4108 	 we remove the need for two instructions in the negation, as the
4109 	 result is done this way.
4110 	 When using masks, do it for SI/DImode element types, as it is shorter
4111 	 than the two subtractions.  */
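      /* Concretely, for unsigned V16QImode elements this turns x <= y
	 (i.e. x > y ? 0 : -1) into min (x, y) == x, which maps onto
	 something like
	   pminub  tmp, x, y
	   pcmpeqb res, tmp, x
	 and avoids a separate negation of the comparison mask.  */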
4112       if ((code != EQ
4113 	   && GET_MODE_SIZE (mode) != 64
4114 	   && vector_all_ones_operand (opfalse, data_mode)
4115 	   && optrue == CONST0_RTX (data_mode))
4116 	  || (code == GTU
4117 	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4118 	      /* Don't do it if not using integer masks and we'd end up with
4119 		 the right values in the registers though.  */
4120 	      && (GET_MODE_SIZE (mode) == 64
4121 		  || !vector_all_ones_operand (optrue, data_mode)
4122 		  || opfalse != CONST0_RTX (data_mode))))
4123 	{
4124 	  rtx (*gen) (rtx, rtx, rtx) = NULL;
4125 
4126 	  switch (mode)
4127 	    {
4128 	    case E_V16SImode:
4129 	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4130 	      break;
4131 	    case E_V8DImode:
4132 	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4133 	      cop0 = force_reg (mode, cop0);
4134 	      cop1 = force_reg (mode, cop1);
4135 	      break;
4136 	    case E_V32QImode:
4137 	      if (TARGET_AVX2)
4138 		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4139 	      break;
4140 	    case E_V16HImode:
4141 	      if (TARGET_AVX2)
4142 		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4143 	      break;
4144 	    case E_V8SImode:
4145 	      if (TARGET_AVX2)
4146 		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4147 	      break;
4148 	    case E_V4DImode:
4149 	      if (TARGET_AVX512VL)
4150 		{
4151 		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4152 		  cop0 = force_reg (mode, cop0);
4153 		  cop1 = force_reg (mode, cop1);
4154 		}
4155 	      break;
4156 	    case E_V16QImode:
4157 	      if (code == GTU && TARGET_SSE2)
4158 		gen = gen_uminv16qi3;
4159 	      else if (code == GT && TARGET_SSE4_1)
4160 		gen = gen_sminv16qi3;
4161 	      break;
4162 	    case E_V8HImode:
4163 	      if (code == GTU && TARGET_SSE4_1)
4164 		gen = gen_uminv8hi3;
4165 	      else if (code == GT && TARGET_SSE2)
4166 		gen = gen_sminv8hi3;
4167 	      break;
4168 	    case E_V4SImode:
4169 	      if (TARGET_SSE4_1)
4170 		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4171 	      break;
4172 	    case E_V2DImode:
4173 	      if (TARGET_AVX512VL)
4174 		{
4175 		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4176 		  cop0 = force_reg (mode, cop0);
4177 		  cop1 = force_reg (mode, cop1);
4178 		}
4179 	      break;
4180 	    default:
4181 	      break;
4182 	    }
4183 
4184 	  if (gen)
4185 	    {
4186 	      rtx tem = gen_reg_rtx (mode);
4187 	      if (!vector_operand (cop0, mode))
4188 		cop0 = force_reg (mode, cop0);
4189 	      if (!vector_operand (cop1, mode))
4190 		cop1 = force_reg (mode, cop1);
4191 	      *negate = !*negate;
4192 	      emit_insn (gen (tem, cop0, cop1));
4193 	      cop1 = tem;
4194 	      code = EQ;
4195 	    }
4196 	}
4197 
4198       /* Unsigned parallel compare is not supported by the hardware.
4199 	 Play some tricks to turn this into a signed comparison
4200 	 against 0.  */
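      /* For example, with SImode elements the unsigned test
	 0x80000000 GTU 1 becomes the signed test
	 (0x80000000 - 0x80000000) GT (1 - 0x80000000), i.e.
	 0 > INT_MIN + 1, which is true just like the unsigned original;
	 biasing both sides by the sign-bit mask preserves the order.  */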
4201       if (code == GTU)
4202 	{
4203 	  cop0 = force_reg (mode, cop0);
4204 
4205 	  switch (mode)
4206 	    {
4207 	    case E_V16SImode:
4208 	    case E_V8DImode:
4209 	    case E_V8SImode:
4210 	    case E_V4DImode:
4211 	    case E_V4SImode:
4212 	    case E_V2DImode:
4213 		{
4214 		  rtx t1, t2, mask;
4215 
4216 		  /* Subtract (-(INT MAX) - 1) from both operands to make
4217 		     them signed.  */
4218 		  mask = ix86_build_signbit_mask (mode, true, false);
4219 		  t1 = gen_reg_rtx (mode);
4220 		  emit_insn (gen_sub3_insn (t1, cop0, mask));
4221 
4222 		  t2 = gen_reg_rtx (mode);
4223 		  emit_insn (gen_sub3_insn (t2, cop1, mask));
4224 
4225 		  cop0 = t1;
4226 		  cop1 = t2;
4227 		  code = GT;
4228 		}
4229 	      break;
4230 
4231 	    case E_V64QImode:
4232 	    case E_V32HImode:
4233 	    case E_V32QImode:
4234 	    case E_V16HImode:
4235 	    case E_V16QImode:
4236 	    case E_V8HImode:
4237 	      /* Perform a parallel unsigned saturating subtraction.  */
4238 	      x = gen_reg_rtx (mode);
4239 	      emit_insn (gen_rtx_SET
4240 			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4241 	      cop0 = x;
4242 	      cop1 = CONST0_RTX (mode);
4243 	      code = EQ;
4244 	      *negate = !*negate;
4245 	      break;
4246 
4247 	    default:
4248 	      gcc_unreachable ();
4249 	    }
4250 	}
4251     }
4252 
4253   if (*negate)
4254     std::swap (op_true, op_false);
4255 
4256   /* Allow the comparison to be done in one mode, but the movcc to
4257      happen in another mode.  */
4258   if (data_mode == mode)
4259     {
4260       x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4261 			       op_true, op_false);
4262     }
4263   else
4264     {
4265       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4266       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4267 			       op_true, op_false);
4268       if (GET_MODE (x) == mode)
4269 	x = gen_lowpart (data_mode, x);
4270     }
4271 
4272   return x;
4273 }
4274 
4275 /* Expand integer vector comparison.  */
4276 
4277 bool
4278 ix86_expand_int_vec_cmp (rtx operands[])
4279 {
4280   rtx_code code = GET_CODE (operands[1]);
4281   bool negate = false;
4282   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4283 				     operands[3], NULL, NULL, &negate);
4284 
4285   if (!cmp)
4286     return false;
4287 
4288   if (negate)
4289     cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4290 				   CONST0_RTX (GET_MODE (cmp)),
4291 				   NULL, NULL, &negate);
4292 
4293   gcc_assert (!negate);
4294 
4295   if (operands[0] != cmp)
4296     emit_move_insn (operands[0], cmp);
4297 
4298   return true;
4299 }
4300 
4301 /* Expand a floating-point vector conditional move; a vcond operation
4302    rather than a movcc operation.  */
4303 
4304 bool
4305 ix86_expand_fp_vcond (rtx operands[])
4306 {
4307   enum rtx_code code = GET_CODE (operands[3]);
4308   rtx cmp;
4309 
4310   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4311 					   &operands[4], &operands[5]);
4312   if (code == UNKNOWN)
4313     {
4314       rtx temp;
4315       switch (GET_CODE (operands[3]))
4316 	{
4317 	case LTGT:
4318 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4319 				      operands[5], operands[0], operands[0]);
4320 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4321 				     operands[5], operands[1], operands[2]);
4322 	  code = AND;
4323 	  break;
4324 	case UNEQ:
4325 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4326 				      operands[5], operands[0], operands[0]);
4327 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4328 				     operands[5], operands[1], operands[2]);
4329 	  code = IOR;
4330 	  break;
4331 	default:
4332 	  gcc_unreachable ();
4333 	}
4334       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4335 				 OPTAB_DIRECT);
4336       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4337       return true;
4338     }
4339 
4340   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4341 				 operands[5], operands[1], operands[2]))
4342     return true;
4343 
4344   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4345 			     operands[1], operands[2]);
4346   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4347   return true;
4348 }
4349 
4350 /* Expand a signed/unsigned integral vector conditional move.  */
4351 
4352 bool
4353 ix86_expand_int_vcond (rtx operands[])
4354 {
4355   machine_mode data_mode = GET_MODE (operands[0]);
4356   machine_mode mode = GET_MODE (operands[4]);
4357   enum rtx_code code = GET_CODE (operands[3]);
4358   bool negate = false;
4359   rtx x, cop0, cop1;
4360 
4361   cop0 = operands[4];
4362   cop1 = operands[5];
4363 
4364   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4365      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
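  /* E.g. for V4SImode this becomes a single psrad $31 (the arithmetic
     shift smears the sign bit across each element, giving -1 or 0) or
     psrld $31 (the logical shift leaves just the sign bit, giving 1 or
     0), respectively.  */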
4366   if ((code == LT || code == GE)
4367       && data_mode == mode
4368       && cop1 == CONST0_RTX (mode)
4369       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4370       && GET_MODE_UNIT_SIZE (data_mode) > 1
4371       && GET_MODE_UNIT_SIZE (data_mode) <= 8
4372       && (GET_MODE_SIZE (data_mode) == 16
4373 	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4374     {
4375       rtx negop = operands[2 - (code == LT)];
4376       int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4377       if (negop == CONST1_RTX (data_mode))
4378 	{
4379 	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4380 					 operands[0], 1, OPTAB_DIRECT);
4381 	  if (res != operands[0])
4382 	    emit_move_insn (operands[0], res);
4383 	  return true;
4384 	}
4385       else if (GET_MODE_INNER (data_mode) != DImode
4386 	       && vector_all_ones_operand (negop, data_mode))
4387 	{
4388 	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4389 					 operands[0], 0, OPTAB_DIRECT);
4390 	  if (res != operands[0])
4391 	    emit_move_insn (operands[0], res);
4392 	  return true;
4393 	}
4394     }
4395 
4396   if (!nonimmediate_operand (cop1, mode))
4397     cop1 = force_reg (mode, cop1);
4398   if (!general_operand (operands[1], data_mode))
4399     operands[1] = force_reg (data_mode, operands[1]);
4400   if (!general_operand (operands[2], data_mode))
4401     operands[2] = force_reg (data_mode, operands[2]);
4402 
4403   x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4404 			       operands[1], operands[2], &negate);
4405 
4406   if (!x)
4407     return false;
4408 
4409   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4410 			 operands[2-negate]);
4411   return true;
4412 }
4413 
4414 static bool
4415 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4416 			      struct expand_vec_perm_d *d)
4417 {
4418   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4419      expander, so args are either in d, or in op0, op1 etc.  */
4420   machine_mode mode = GET_MODE (d ? d->op0 : op0);
4421   machine_mode maskmode = mode;
4422   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4423 
4424   switch (mode)
4425     {
4426     case E_V8HImode:
4427       if (TARGET_AVX512VL && TARGET_AVX512BW)
4428 	gen = gen_avx512vl_vpermt2varv8hi3;
4429       break;
4430     case E_V16HImode:
4431       if (TARGET_AVX512VL && TARGET_AVX512BW)
4432 	gen = gen_avx512vl_vpermt2varv16hi3;
4433       break;
4434     case E_V64QImode:
4435       if (TARGET_AVX512VBMI)
4436 	gen = gen_avx512bw_vpermt2varv64qi3;
4437       break;
4438     case E_V32HImode:
4439       if (TARGET_AVX512BW)
4440 	gen = gen_avx512bw_vpermt2varv32hi3;
4441       break;
4442     case E_V4SImode:
4443       if (TARGET_AVX512VL)
4444 	gen = gen_avx512vl_vpermt2varv4si3;
4445       break;
4446     case E_V8SImode:
4447       if (TARGET_AVX512VL)
4448 	gen = gen_avx512vl_vpermt2varv8si3;
4449       break;
4450     case E_V16SImode:
4451       if (TARGET_AVX512F)
4452 	gen = gen_avx512f_vpermt2varv16si3;
4453       break;
4454     case E_V4SFmode:
4455       if (TARGET_AVX512VL)
4456 	{
4457 	  gen = gen_avx512vl_vpermt2varv4sf3;
4458 	  maskmode = V4SImode;
4459 	}
4460       break;
4461     case E_V8SFmode:
4462       if (TARGET_AVX512VL)
4463 	{
4464 	  gen = gen_avx512vl_vpermt2varv8sf3;
4465 	  maskmode = V8SImode;
4466 	}
4467       break;
4468     case E_V16SFmode:
4469       if (TARGET_AVX512F)
4470 	{
4471 	  gen = gen_avx512f_vpermt2varv16sf3;
4472 	  maskmode = V16SImode;
4473 	}
4474       break;
4475     case E_V2DImode:
4476       if (TARGET_AVX512VL)
4477 	gen = gen_avx512vl_vpermt2varv2di3;
4478       break;
4479     case E_V4DImode:
4480       if (TARGET_AVX512VL)
4481 	gen = gen_avx512vl_vpermt2varv4di3;
4482       break;
4483     case E_V8DImode:
4484       if (TARGET_AVX512F)
4485 	gen = gen_avx512f_vpermt2varv8di3;
4486       break;
4487     case E_V2DFmode:
4488       if (TARGET_AVX512VL)
4489 	{
4490 	  gen = gen_avx512vl_vpermt2varv2df3;
4491 	  maskmode = V2DImode;
4492 	}
4493       break;
4494     case E_V4DFmode:
4495       if (TARGET_AVX512VL)
4496 	{
4497 	  gen = gen_avx512vl_vpermt2varv4df3;
4498 	  maskmode = V4DImode;
4499 	}
4500       break;
4501     case E_V8DFmode:
4502       if (TARGET_AVX512F)
4503 	{
4504 	  gen = gen_avx512f_vpermt2varv8df3;
4505 	  maskmode = V8DImode;
4506 	}
4507       break;
4508     default:
4509       break;
4510     }
4511 
4512   if (gen == NULL)
4513     return false;
4514 
4515   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4516      expander, so args are either in d, or in op0, op1 etc.  */
4517   if (d)
4518     {
4519       rtx vec[64];
4520       target = d->target;
4521       op0 = d->op0;
4522       op1 = d->op1;
4523       for (int i = 0; i < d->nelt; ++i)
4524 	vec[i] = GEN_INT (d->perm[i]);
4525       mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4526     }
4527 
4528   emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4529   return true;
4530 }
4531 
4532 /* Expand a variable vector permutation.  */
4533 
4534 void
4535 ix86_expand_vec_perm (rtx operands[])
4536 {
4537   rtx target = operands[0];
4538   rtx op0 = operands[1];
4539   rtx op1 = operands[2];
4540   rtx mask = operands[3];
4541   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4542   machine_mode mode = GET_MODE (op0);
4543   machine_mode maskmode = GET_MODE (mask);
4544   int w, e, i;
4545   bool one_operand_shuffle = rtx_equal_p (op0, op1);
4546 
4547   /* Number of elements in the vector.  */
4548   w = GET_MODE_NUNITS (mode);
4549   e = GET_MODE_UNIT_SIZE (mode);
4550   gcc_assert (w <= 64);
4551 
4552   if (TARGET_AVX512F && one_operand_shuffle)
4553     {
4554       rtx (*gen) (rtx, rtx, rtx) = NULL;
4555       switch (mode)
4556 	{
4557 	case E_V16SImode:
4558 	  gen = gen_avx512f_permvarv16si;
4559 	  break;
4560 	case E_V16SFmode:
4561 	  gen = gen_avx512f_permvarv16sf;
4562 	  break;
4563 	case E_V8DImode:
4564 	  gen = gen_avx512f_permvarv8di;
4565 	  break;
4566 	case E_V8DFmode:
4567 	  gen = gen_avx512f_permvarv8df;
4568 	  break;
4569 	default:
4570 	  break;
4571 	}
4572       if (gen != NULL)
4573 	{
4574 	  emit_insn (gen (target, op0, mask));
4575 	  return;
4576 	}
4577     }
4578 
4579   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4580     return;
4581 
4582   if (TARGET_AVX2)
4583     {
4584       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4585 	{
4586 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4587 	     a constant shuffle operand.  With a tiny bit of effort we can
4588 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
4589 	     unfortunate but there's no avoiding it.
4590 	     Similarly for V16HImode we don't have instructions for variable
4591 	     shuffling, while for V32QImode we can use vpshufb; vpshufb;
4592 	     vpermq; vpor after preparing suitable masks.  */
4593 
4594 	  if (mode == V16HImode)
4595 	    {
4596 	      maskmode = mode = V32QImode;
4597 	      w = 32;
4598 	      e = 1;
4599 	    }
4600 	  else
4601 	    {
4602 	      maskmode = mode = V8SImode;
4603 	      w = 8;
4604 	      e = 4;
4605 	    }
4606 	  t1 = gen_reg_rtx (maskmode);
4607 
4608 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
4609 	       mask = { A B C D }
4610 	       t1 = { A A B B C C D D }.  */
4611 	  for (i = 0; i < w / 2; ++i)
4612 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4613 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4614 	  vt = force_reg (maskmode, vt);
4615 	  mask = gen_lowpart (maskmode, mask);
4616 	  if (maskmode == V8SImode)
4617 	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4618 	  else
4619 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4620 
4621 	  /* Multiply the shuffle indices by two.  */
4622 	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4623 				    OPTAB_DIRECT);
4624 
4625 	  /* Add one to the odd shuffle indices:
4626 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
4627 	  for (i = 0; i < w / 2; ++i)
4628 	    {
4629 	      vec[i * 2] = const0_rtx;
4630 	      vec[i * 2 + 1] = const1_rtx;
4631 	    }
4632 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4633 	  vt = validize_mem (force_const_mem (maskmode, vt));
4634 	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4635 				    OPTAB_DIRECT);
4636 
4637 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
4638 	  operands[3] = mask = t1;
4639 	  target = gen_reg_rtx (mode);
4640 	  op0 = gen_lowpart (mode, op0);
4641 	  op1 = gen_lowpart (mode, op1);
4642 	}
4643 
4644       switch (mode)
4645 	{
4646 	case E_V8SImode:
4647 	  /* The VPERMD and VPERMPS instructions already properly ignore
4648 	     the high bits of the shuffle elements.  No need for us to
4649 	     perform an AND ourselves.  */
4650 	  if (one_operand_shuffle)
4651 	    {
4652 	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4653 	      if (target != operands[0])
4654 		emit_move_insn (operands[0],
4655 				gen_lowpart (GET_MODE (operands[0]), target));
4656 	    }
4657 	  else
4658 	    {
4659 	      t1 = gen_reg_rtx (V8SImode);
4660 	      t2 = gen_reg_rtx (V8SImode);
4661 	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4662 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4663 	      goto merge_two;
4664 	    }
4665 	  return;
4666 
4667 	case E_V8SFmode:
4668 	  mask = gen_lowpart (V8SImode, mask);
4669 	  if (one_operand_shuffle)
4670 	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4671 	  else
4672 	    {
4673 	      t1 = gen_reg_rtx (V8SFmode);
4674 	      t2 = gen_reg_rtx (V8SFmode);
4675 	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4676 	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4677 	      goto merge_two;
4678 	    }
4679 	  return;
4680 
4681         case E_V4SImode:
4682 	  /* By combining the two 128-bit input vectors into one 256-bit
4683 	     input vector, we can use VPERMD and VPERMPS for the full
4684 	     two-operand shuffle.  */
4685 	  t1 = gen_reg_rtx (V8SImode);
4686 	  t2 = gen_reg_rtx (V8SImode);
4687 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4688 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4689 	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4690 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4691 	  return;
4692 
4693         case E_V4SFmode:
4694 	  t1 = gen_reg_rtx (V8SFmode);
4695 	  t2 = gen_reg_rtx (V8SImode);
4696 	  mask = gen_lowpart (V4SImode, mask);
4697 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4698 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4699 	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4700 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4701 	  return;
4702 
4703 	case E_V32QImode:
4704 	  t1 = gen_reg_rtx (V32QImode);
4705 	  t2 = gen_reg_rtx (V32QImode);
4706 	  t3 = gen_reg_rtx (V32QImode);
4707 	  vt2 = GEN_INT (-128);
4708 	  vt = gen_const_vec_duplicate (V32QImode, vt2);
4709 	  vt = force_reg (V32QImode, vt);
4710 	  for (i = 0; i < 32; i++)
4711 	    vec[i] = i < 16 ? vt2 : const0_rtx;
4712 	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4713 	  vt2 = force_reg (V32QImode, vt2);
4714 	  /* From mask create two adjusted masks, which contain the same
4715 	     bits as mask in the low 7 bits of each vector element.
4716 	     The first mask will have the most significant bit clear
4717 	     if it requests element from the same 128-bit lane
4718 	     and MSB set if it requests element from the other 128-bit lane.
4719 	     The second mask will have the opposite values of the MSB,
4720 	     and additionally will have its 128-bit lanes swapped.
4721 	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4722 	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
4723 	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4724 	     stands for other 12 bytes.  */
4725 	  /* The bit whether element is from the same lane or the other
4726 	     lane is bit 4, so shift it up by 3 to the MSB position.  */
4727 	  t5 = gen_reg_rtx (V4DImode);
4728 	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4729 				    GEN_INT (3)));
4730 	  /* Clear MSB bits from the mask just in case it had them set.  */
4731 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4732 	  /* After this t1 will have MSB set for elements from other lane.  */
4733 	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4734 	  /* Clear bits other than MSB.  */
4735 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
4736 	  /* Or in the lower bits from mask into t3.  */
4737 	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
4738 	  /* And invert MSB bits in t1, so MSB is set for elements from the same
4739 	     lane.  */
4740 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
4741 	  /* Swap 128-bit lanes in t3.  */
4742 	  t6 = gen_reg_rtx (V4DImode);
4743 	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4744 					  const2_rtx, GEN_INT (3),
4745 					  const0_rtx, const1_rtx));
4746 	  /* And or in the lower bits from mask into t1.  */
4747 	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
4748 	  if (one_operand_shuffle)
4749 	    {
4750 	      /* Each of these shuffles will put 0s in places where
4751 		 element from the other 128-bit lane is needed, otherwise
4752 		 will shuffle in the requested value.  */
4753 	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4754 						gen_lowpart (V32QImode, t6)));
4755 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4756 	      /* For t3 the 128-bit lanes are swapped again.  */
4757 	      t7 = gen_reg_rtx (V4DImode);
4758 	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4759 					      const2_rtx, GEN_INT (3),
4760 					      const0_rtx, const1_rtx));
4761 	      /* And oring both together leads to the result.  */
4762 	      emit_insn (gen_iorv32qi3 (target, t1,
4763 					gen_lowpart (V32QImode, t7)));
4764 	      if (target != operands[0])
4765 		emit_move_insn (operands[0],
4766 				gen_lowpart (GET_MODE (operands[0]), target));
4767 	      return;
4768 	    }
4769 
4770 	  t4 = gen_reg_rtx (V32QImode);
4771 	  /* Similarly to the above one_operand_shuffle code,
4772 	     just repeated twice, once for each operand.  merge_two:
4773 	     code will merge the two results together.  */
4774 	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4775 					    gen_lowpart (V32QImode, t6)));
4776 	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4777 					    gen_lowpart (V32QImode, t6)));
4778 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4779 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4780 	  t7 = gen_reg_rtx (V4DImode);
4781 	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4782 					  const2_rtx, GEN_INT (3),
4783 					  const0_rtx, const1_rtx));
4784 	  t8 = gen_reg_rtx (V4DImode);
4785 	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4786 					  const2_rtx, GEN_INT (3),
4787 					  const0_rtx, const1_rtx));
4788 	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4789 	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4790 	  t1 = t4;
4791 	  t2 = t3;
4792 	  goto merge_two;
4793 
4794 	default:
4795 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
4796 	  break;
4797 	}
4798     }
4799 
4800   if (TARGET_XOP)
4801     {
4802       /* The XOP VPPERM insn supports three inputs.  By ignoring the
4803 	 one_operand_shuffle special case, we avoid creating another
4804 	 set of constant vectors in memory.  */
4805       one_operand_shuffle = false;
4806 
4807       /* mask = mask & {2*w-1, ...} */
4808       vt = GEN_INT (2*w - 1);
4809     }
4810   else
4811     {
4812       /* mask = mask & {w-1, ...} */
4813       vt = GEN_INT (w - 1);
4814     }
4815 
4816   vt = gen_const_vec_duplicate (maskmode, vt);
4817   mask = expand_simple_binop (maskmode, AND, mask, vt,
4818 			      NULL_RTX, 0, OPTAB_DIRECT);
4819 
4820   /* For non-QImode operations, convert the word permutation control
4821      into a byte permutation control.  */
4822   if (mode != V16QImode)
4823     {
4824       mask = expand_simple_binop (maskmode, ASHIFT, mask,
4825 				  GEN_INT (exact_log2 (e)),
4826 				  NULL_RTX, 0, OPTAB_DIRECT);
4827 
4828       /* Convert mask to vector of chars.  */
4829       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4830 
4831       /* Replicate each of the input bytes into byte positions:
4832 	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4833 	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4834 	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
4835       for (i = 0; i < 16; ++i)
4836 	vec[i] = GEN_INT (i/e * e);
4837       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4838       vt = validize_mem (force_const_mem (V16QImode, vt));
4839       if (TARGET_XOP)
4840 	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4841       else
4842 	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4843 
4844       /* Convert it into the byte positions by doing
4845 	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
4846       for (i = 0; i < 16; ++i)
4847 	vec[i] = GEN_INT (i % e);
4848       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4849       vt = validize_mem (force_const_mem (V16QImode, vt));
4850       emit_insn (gen_addv16qi3 (mask, mask, vt));
4851     }
4852 
4853   /* The actual shuffle operations all operate on V16QImode.  */
4854   op0 = gen_lowpart (V16QImode, op0);
4855   op1 = gen_lowpart (V16QImode, op1);
4856 
4857   if (TARGET_XOP)
4858     {
4859       if (GET_MODE (target) != V16QImode)
4860 	target = gen_reg_rtx (V16QImode);
4861       emit_insn (gen_xop_pperm (target, op0, op1, mask));
4862       if (target != operands[0])
4863 	emit_move_insn (operands[0],
4864 			gen_lowpart (GET_MODE (operands[0]), target));
4865     }
4866   else if (one_operand_shuffle)
4867     {
4868       if (GET_MODE (target) != V16QImode)
4869 	target = gen_reg_rtx (V16QImode);
4870       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4871       if (target != operands[0])
4872 	emit_move_insn (operands[0],
4873 			gen_lowpart (GET_MODE (operands[0]), target));
4874     }
4875   else
4876     {
4877       rtx xops[6];
4878       bool ok;
4879 
4880       /* Shuffle the two input vectors independently.  */
4881       t1 = gen_reg_rtx (V16QImode);
4882       t2 = gen_reg_rtx (V16QImode);
4883       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4884       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4885 
4886  merge_two:
4887       /* Then merge them together.  The key is whether any given control
4888          element contained a bit set that indicates the second word.  */
4889       mask = operands[3];
4890       vt = GEN_INT (w);
4891       if (maskmode == V2DImode && !TARGET_SSE4_1)
4892 	{
4893 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
4894 	     more shuffle to convert the V2DI input mask into a V4SI
4895 	     input mask.  At that point the masking that expand_int_vcond
4896 	     performs will work as desired.  */
4897 	  rtx t3 = gen_reg_rtx (V4SImode);
4898 	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4899 				        const0_rtx, const0_rtx,
4900 				        const2_rtx, const2_rtx));
4901 	  mask = t3;
4902 	  maskmode = V4SImode;
4903 	  e = w = 4;
4904 	}
4905 
4906       vt = gen_const_vec_duplicate (maskmode, vt);
4907       vt = force_reg (maskmode, vt);
4908       mask = expand_simple_binop (maskmode, AND, mask, vt,
4909 				  NULL_RTX, 0, OPTAB_DIRECT);
4910 
4911       if (GET_MODE (target) != mode)
4912 	target = gen_reg_rtx (mode);
4913       xops[0] = target;
4914       xops[1] = gen_lowpart (mode, t2);
4915       xops[2] = gen_lowpart (mode, t1);
4916       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4917       xops[4] = mask;
4918       xops[5] = vt;
4919       ok = ix86_expand_int_vcond (xops);
4920       gcc_assert (ok);
4921       if (target != operands[0])
4922 	emit_move_insn (operands[0],
4923 			gen_lowpart (GET_MODE (operands[0]), target));
4924     }
4925 }
4926 
4927 /* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
4928    true if we should do zero extension, else sign extension.  HIGH_P is
4929    true if we want the N/2 high elements, else the low elements.  */
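   For instance, a signed low-half unpack of V16QImode uses pmovsxbw when
   SSE4.1 is available; without SSE4.1 the fallback below interleaves SRC
   with a sign mask obtained by comparing zero against SRC with GT.  */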
4930 
4931 void
4932 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4933 {
4934   machine_mode imode = GET_MODE (src);
4935   rtx tmp;
4936 
4937   if (TARGET_SSE4_1)
4938     {
4939       rtx (*unpack)(rtx, rtx);
4940       rtx (*extract)(rtx, rtx) = NULL;
4941       machine_mode halfmode = BLKmode;
4942 
4943       switch (imode)
4944 	{
4945 	case E_V64QImode:
4946 	  if (unsigned_p)
4947 	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4948 	  else
4949 	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4950 	  halfmode = V32QImode;
4951 	  extract
4952 	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4953 	  break;
4954 	case E_V32QImode:
4955 	  if (unsigned_p)
4956 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
4957 	  else
4958 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
4959 	  halfmode = V16QImode;
4960 	  extract
4961 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4962 	  break;
4963 	case E_V32HImode:
4964 	  if (unsigned_p)
4965 	    unpack = gen_avx512f_zero_extendv16hiv16si2;
4966 	  else
4967 	    unpack = gen_avx512f_sign_extendv16hiv16si2;
4968 	  halfmode = V16HImode;
4969 	  extract
4970 	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4971 	  break;
4972 	case E_V16HImode:
4973 	  if (unsigned_p)
4974 	    unpack = gen_avx2_zero_extendv8hiv8si2;
4975 	  else
4976 	    unpack = gen_avx2_sign_extendv8hiv8si2;
4977 	  halfmode = V8HImode;
4978 	  extract
4979 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4980 	  break;
4981 	case E_V16SImode:
4982 	  if (unsigned_p)
4983 	    unpack = gen_avx512f_zero_extendv8siv8di2;
4984 	  else
4985 	    unpack = gen_avx512f_sign_extendv8siv8di2;
4986 	  halfmode = V8SImode;
4987 	  extract
4988 	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4989 	  break;
4990 	case E_V8SImode:
4991 	  if (unsigned_p)
4992 	    unpack = gen_avx2_zero_extendv4siv4di2;
4993 	  else
4994 	    unpack = gen_avx2_sign_extendv4siv4di2;
4995 	  halfmode = V4SImode;
4996 	  extract
4997 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
4998 	  break;
4999 	case E_V16QImode:
5000 	  if (unsigned_p)
5001 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5002 	  else
5003 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5004 	  break;
5005 	case E_V8HImode:
5006 	  if (unsigned_p)
5007 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
5008 	  else
5009 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
5010 	  break;
5011 	case E_V4SImode:
5012 	  if (unsigned_p)
5013 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
5014 	  else
5015 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
5016 	  break;
5017 	default:
5018 	  gcc_unreachable ();
5019 	}
5020 
5021       if (GET_MODE_SIZE (imode) >= 32)
5022 	{
5023 	  tmp = gen_reg_rtx (halfmode);
5024 	  emit_insn (extract (tmp, src));
5025 	}
5026       else if (high_p)
5027 	{
5028 	  /* Shift higher 8 bytes to lower 8 bytes.  */
5029 	  tmp = gen_reg_rtx (V1TImode);
5030 	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5031 					 GEN_INT (64)));
5032 	  tmp = gen_lowpart (imode, tmp);
5033 	}
5034       else
5035 	tmp = src;
5036 
5037       emit_insn (unpack (dest, tmp));
5038     }
5039   else
5040     {
5041       rtx (*unpack)(rtx, rtx, rtx);
5042 
5043       switch (imode)
5044 	{
5045 	case E_V16QImode:
5046 	  if (high_p)
5047 	    unpack = gen_vec_interleave_highv16qi;
5048 	  else
5049 	    unpack = gen_vec_interleave_lowv16qi;
5050 	  break;
5051 	case E_V8HImode:
5052 	  if (high_p)
5053 	    unpack = gen_vec_interleave_highv8hi;
5054 	  else
5055 	    unpack = gen_vec_interleave_lowv8hi;
5056 	  break;
5057 	case E_V4SImode:
5058 	  if (high_p)
5059 	    unpack = gen_vec_interleave_highv4si;
5060 	  else
5061 	    unpack = gen_vec_interleave_lowv4si;
5062 	  break;
5063 	default:
5064 	  gcc_unreachable ();
5065 	}
5066 
5067       if (unsigned_p)
5068 	tmp = force_reg (imode, CONST0_RTX (imode));
5069       else
5070 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5071 				   src, pc_rtx, pc_rtx);
5072 
5073       rtx tmp2 = gen_reg_rtx (imode);
5074       emit_insn (unpack (tmp2, src, tmp));
5075       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5076     }
5077 }
5078 
5079 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
5080    but works for floating pointer parameters and nonoffsetable memories.
5081    but works for floating-point parameters and non-offsettable memories.
5082    in the right order.  Maximally three parts are generated.  */
5083    in the right order.  Maximally four parts are generated.  */
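/* For example, on a 32-bit target a DFmode value splits into two SImode
   parts, XFmode into three and TFmode into four, while on a 64-bit
   target XFmode splits into a DImode part plus an SImode upper part.  */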
5084 static int
5085 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5086 {
5087   int size;
5088 
5089   if (!TARGET_64BIT)
5090     size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5091   else
5092     size = (GET_MODE_SIZE (mode) + 4) / 8;
5093 
5094   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5095   gcc_assert (size >= 2 && size <= 4);
5096 
5097   /* Optimize constant pool reference to immediates.  This is used by fp
5098      moves, that force all constants to memory to allow combining.  */
5099   if (MEM_P (operand) && MEM_READONLY_P (operand))
5100     operand = avoid_constant_pool_reference (operand);
5101 
5102   if (MEM_P (operand) && !offsettable_memref_p (operand))
5103     {
5104       /* The only non-offsettable memories we handle are pushes.  */
5105       int ok = push_operand (operand, VOIDmode);
5106 
5107       gcc_assert (ok);
5108 
5109       operand = copy_rtx (operand);
5110       PUT_MODE (operand, word_mode);
5111       parts[0] = parts[1] = parts[2] = parts[3] = operand;
5112       return size;
5113     }
5114 
5115   if (GET_CODE (operand) == CONST_VECTOR)
5116     {
5117       scalar_int_mode imode = int_mode_for_mode (mode).require ();
5118       /* Caution: if we looked through a constant pool memory above,
5119 	 the operand may actually have a different mode now.  That's
5120 	 ok, since we want to pun this all the way back to an integer.  */
5121       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5122       gcc_assert (operand != NULL);
5123       mode = imode;
5124     }
5125 
5126   if (!TARGET_64BIT)
5127     {
5128       if (mode == DImode)
5129 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5130       else
5131 	{
5132 	  int i;
5133 
5134 	  if (REG_P (operand))
5135 	    {
5136 	      gcc_assert (reload_completed);
5137 	      for (i = 0; i < size; i++)
5138 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5139 	    }
5140 	  else if (offsettable_memref_p (operand))
5141 	    {
5142 	      operand = adjust_address (operand, SImode, 0);
5143 	      parts[0] = operand;
5144 	      for (i = 1; i < size; i++)
5145 		parts[i] = adjust_address (operand, SImode, 4 * i);
5146 	    }
5147 	  else if (CONST_DOUBLE_P (operand))
5148 	    {
5149 	      const REAL_VALUE_TYPE *r;
5150 	      long l[4];
5151 
5152 	      r = CONST_DOUBLE_REAL_VALUE (operand);
5153 	      switch (mode)
5154 		{
5155 		case E_TFmode:
5156 		  real_to_target (l, r, mode);
5157 		  parts[3] = gen_int_mode (l[3], SImode);
5158 		  parts[2] = gen_int_mode (l[2], SImode);
5159 		  break;
5160 		case E_XFmode:
5161 		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5162 		     long double may not be 80-bit.  */
5163 		  real_to_target (l, r, mode);
5164 		  parts[2] = gen_int_mode (l[2], SImode);
5165 		  break;
5166 		case E_DFmode:
5167 		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5168 		  break;
5169 		default:
5170 		  gcc_unreachable ();
5171 		}
5172 	      parts[1] = gen_int_mode (l[1], SImode);
5173 	      parts[0] = gen_int_mode (l[0], SImode);
5174 	    }
5175 	  else
5176 	    gcc_unreachable ();
5177 	}
5178     }
5179   else
5180     {
5181       if (mode == TImode)
5182 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5183       if (mode == XFmode || mode == TFmode)
5184 	{
5185 	  machine_mode upper_mode = mode == XFmode ? SImode : DImode;
5186 	  if (REG_P (operand))
5187 	    {
5188 	      gcc_assert (reload_completed);
5189 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5190 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5191 	    }
5192 	  else if (offsettable_memref_p (operand))
5193 	    {
5194 	      operand = adjust_address (operand, DImode, 0);
5195 	      parts[0] = operand;
5196 	      parts[1] = adjust_address (operand, upper_mode, 8);
5197 	    }
5198 	  else if (CONST_DOUBLE_P (operand))
5199 	    {
5200 	      long l[4];
5201 
5202 	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5203 
5204 	      /* real_to_target puts 32-bit pieces in each long.  */
5205 	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5206 				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5207 					  << 32), DImode);
5208 
5209 	      if (upper_mode == SImode)
5210 	        parts[1] = gen_int_mode (l[2], SImode);
5211 	      else
5212 	        parts[1]
5213 		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5214 				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5215 				     << 32), DImode);
5216 	    }
5217 	  else
5218 	    gcc_unreachable ();
5219 	}
5220     }
5221 
5222   return size;
5223 }
5224 
5225 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5226    Operand 0 is the destination and operand 1 is the source; both are
5227    split into word-sized parts, which are then moved (or pushed) in an
5228    order that avoids clobbering parts that are still needed.  */
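/* For example, a 32-bit DImode move comes out as two SImode moves; the
   copy order below is adjusted so that a destination register which also
   appears in the source address is not clobbered before the remaining
   parts have been loaded.  */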
5229 
5230 void
5231 ix86_split_long_move (rtx operands[])
5232 {
5233   rtx part[2][4];
5234   int nparts, i, j;
5235   int push = 0;
5236   int collisions = 0;
5237   machine_mode mode = GET_MODE (operands[0]);
5238   bool collisionparts[4];
5239 
5240   /* The DFmode expanders may ask us to move a double.
5241      For a 64-bit target this is a single move.  By hiding that fact
5242      here we simplify the i386.md splitters.  */
5243   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5244     {
5245       /* Optimize constant pool reference to immediates.  This is used by
5246 	 fp moves, that force all constants to memory to allow combining.  */
5247 
5248       if (MEM_P (operands[1])
5249 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5250 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5251 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
5252       if (push_operand (operands[0], VOIDmode))
5253 	{
5254 	  operands[0] = copy_rtx (operands[0]);
5255 	  PUT_MODE (operands[0], word_mode);
5256 	}
5257       else
5258         operands[0] = gen_lowpart (DImode, operands[0]);
5259       operands[1] = gen_lowpart (DImode, operands[1]);
5260       emit_move_insn (operands[0], operands[1]);
5261       return;
5262     }
5263 
5264   /* The only non-offsettable memory we handle is push.  */
5265   if (push_operand (operands[0], VOIDmode))
5266     push = 1;
5267   else
5268     gcc_assert (!MEM_P (operands[0])
5269 		|| offsettable_memref_p (operands[0]));
5270 
5271   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5272   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5273 
5274   /* When emitting a push, take care of source operands on the stack.  */
5275   if (push && MEM_P (operands[1])
5276       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5277     {
5278       rtx src_base = XEXP (part[1][nparts - 1], 0);
5279 
5280       /* Compensate for the stack decrement by 4.  */
5281       if (!TARGET_64BIT && nparts == 3
5282 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5283 	src_base = plus_constant (Pmode, src_base, 4);
5284 
5285       /* src_base refers to the stack pointer and is
5286 	 automatically decreased by the emitted pushes.  */
5287       for (i = 0; i < nparts; i++)
5288 	part[1][i] = change_address (part[1][i],
5289 				     GET_MODE (part[1][i]), src_base);
5290     }
5291 
5292   /* We need to do the copy in the right order in case an address register
5293      of the source overlaps the destination.  */
5294   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5295     {
5296       rtx tmp;
5297 
5298       for (i = 0; i < nparts; i++)
5299 	{
5300 	  collisionparts[i]
5301 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5302 	  if (collisionparts[i])
5303 	    collisions++;
5304 	}
5305 
5306       /* Collision in the middle part can be handled by reordering.  */
5307       if (collisions == 1 && nparts == 3 && collisionparts [1])
5308 	{
5309 	  std::swap (part[0][1], part[0][2]);
5310 	  std::swap (part[1][1], part[1][2]);
5311 	}
5312       else if (collisions == 1
5313 	       && nparts == 4
5314 	       && (collisionparts [1] || collisionparts [2]))
5315 	{
5316 	  if (collisionparts [1])
5317 	    {
5318 	      std::swap (part[0][1], part[0][2]);
5319 	      std::swap (part[1][1], part[1][2]);
5320 	    }
5321 	  else
5322 	    {
5323 	      std::swap (part[0][2], part[0][3]);
5324 	      std::swap (part[1][2], part[1][3]);
5325 	    }
5326 	}
5327 
5328       /* If there are more collisions, we can't handle them by reordering.
5329 	 Do an lea into the last part and use only one colliding move.  */
5330       else if (collisions > 1)
5331 	{
5332 	  rtx base, addr;
5333 
5334 	  collisions = 1;
5335 
5336 	  base = part[0][nparts - 1];
5337 
5338 	  /* Handle the case when the last part isn't valid for lea.
5339 	     This happens in 64-bit mode when storing the 12-byte XFmode value.  */
5340 	  if (GET_MODE (base) != Pmode)
5341 	    base = gen_rtx_REG (Pmode, REGNO (base));
5342 
5343 	  addr = XEXP (part[1][0], 0);
5344 	  if (TARGET_TLS_DIRECT_SEG_REFS)
5345 	    {
5346 	      struct ix86_address parts;
5347 	      int ok = ix86_decompose_address (addr, &parts);
5348 	      gcc_assert (ok);
5349 	      /* It is not valid to use %gs: or %fs: in lea.  */
5350 	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5351 	    }
5352 	  emit_insn (gen_rtx_SET (base, addr));
5353 	  part[1][0] = replace_equiv_address (part[1][0], base);
5354 	  for (i = 1; i < nparts; i++)
5355 	    {
5356 	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5357 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
5358 	    }
5359 	}
5360     }
5361 
5362   if (push)
5363     {
5364       if (!TARGET_64BIT)
5365 	{
5366 	  if (nparts == 3)
5367 	    {
5368 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5369                 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5370 	      emit_move_insn (part[0][2], part[1][2]);
5371 	    }
5372 	  else if (nparts == 4)
5373 	    {
5374 	      emit_move_insn (part[0][3], part[1][3]);
5375 	      emit_move_insn (part[0][2], part[1][2]);
5376 	    }
5377 	}
5378       else
5379 	{
5380 	  /* In 64-bit mode we don't have a 32-bit push available.  If this is
5381 	     a register, that is OK - we will just use the larger counterpart.
5382 	     We also retype memory - this comes from an attempt to avoid a REX
5383 	     prefix when moving the second half of a TFmode value.  */
5384 	  if (GET_MODE (part[1][1]) == SImode)
5385 	    {
5386 	      switch (GET_CODE (part[1][1]))
5387 		{
5388 		case MEM:
5389 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
5390 		  break;
5391 
5392 		case REG:
5393 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5394 		  break;
5395 
5396 		default:
5397 		  gcc_unreachable ();
5398 		}
5399 
5400 	      if (GET_MODE (part[1][0]) == SImode)
5401 		part[1][0] = part[1][1];
5402 	    }
5403 	}
5404       emit_move_insn (part[0][1], part[1][1]);
5405       emit_move_insn (part[0][0], part[1][0]);
5406       return;
5407     }
5408 
5409   /* Choose the correct order so as not to overwrite the source before it is copied.  */
5410   if ((REG_P (part[0][0])
5411        && REG_P (part[1][1])
5412        && (REGNO (part[0][0]) == REGNO (part[1][1])
5413 	   || (nparts == 3
5414 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
5415 	   || (nparts == 4
5416 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
5417       || (collisions > 0
5418 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5419     {
5420       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5421 	{
5422 	  operands[2 + i] = part[0][j];
5423 	  operands[6 + i] = part[1][j];
5424 	}
5425     }
5426   else
5427     {
5428       for (i = 0; i < nparts; i++)
5429 	{
5430 	  operands[2 + i] = part[0][i];
5431 	  operands[6 + i] = part[1][i];
5432 	}
5433     }
5434 
5435   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
5436   if (optimize_insn_for_size_p ())
5437     {
5438       for (j = 0; j < nparts - 1; j++)
5439 	if (CONST_INT_P (operands[6 + j])
5440 	    && operands[6 + j] != const0_rtx
5441 	    && REG_P (operands[2 + j]))
5442 	  for (i = j; i < nparts - 1; i++)
5443 	    if (CONST_INT_P (operands[7 + i])
5444 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5445 	      operands[7 + i] = operands[2 + j];
5446     }
5447 
5448   for (i = 0; i < nparts; i++)
5449     emit_move_insn (operands[2 + i], operands[6 + i]);
5450 
5451   return;
5452 }
5453 
5454 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5455    left shift by a constant, either using a single shift or
5456    a sequence of add instructions.  */
5457 
5458 static void
5459 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5460 {
5461   if (count == 1
5462       || (count * ix86_cost->add <= ix86_cost->shift_const
5463 	  && !optimize_insn_for_size_p ()))
5464     {
5465       while (count-- > 0)
5466 	emit_insn (gen_add2_insn (operand, operand));
5467     }
5468   else
5469     {
5470       rtx (*insn)(rtx, rtx, rtx);
5471 
5472       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5473       emit_insn (insn (operand, operand, GEN_INT (count)));
5474     }
5475 }
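
#if 0
/* Illustrative sketch, not part of the compiler and never compiled: the
   helper above exploits that x + x == x << 1, so a left shift by COUNT can
   be expanded as COUNT self-additions when the cost tables say adds are
   cheaper than a constant shift.  Names below are hypothetical.  */
#include <stdint.h>

static uint32_t
ashl_via_adds (uint32_t x, int count)
{
  while (count-- > 0)
    x += x;		/* each addition doubles X, i.e. shifts it left by one bit  */
  return x;
}
#endif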
5476 
5477 void
5478 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5479 {
5480   rtx (*gen_ashl3)(rtx, rtx, rtx);
5481   rtx (*gen_shld)(rtx, rtx, rtx);
5482   int half_width = GET_MODE_BITSIZE (mode) >> 1;
5483   machine_mode half_mode;
5484 
5485   rtx low[2], high[2];
5486   int count;
5487 
5488   if (CONST_INT_P (operands[2]))
5489     {
5490       split_double_mode (mode, operands, 2, low, high);
5491       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5492 
5493       if (count >= half_width)
5494 	{
5495 	  emit_move_insn (high[0], low[1]);
5496 	  emit_move_insn (low[0], const0_rtx);
5497 
5498 	  if (count > half_width)
5499 	    ix86_expand_ashl_const (high[0], count - half_width, mode);
5500 	}
5501       else
5502 	{
5503 	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5504 
5505 	  if (!rtx_equal_p (operands[0], operands[1]))
5506 	    emit_move_insn (operands[0], operands[1]);
5507 
5508 	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5509 	  ix86_expand_ashl_const (low[0], count, mode);
5510 	}
5511       return;
5512     }
5513 
5514   split_double_mode (mode, operands, 1, low, high);
5515   half_mode = mode == DImode ? SImode : DImode;
5516 
5517   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5518 
5519   if (operands[1] == const1_rtx)
5520     {
5521       /* Assuming we've chosen QImode-capable registers, then 1 << N
5522 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
5523       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5524 	{
5525 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5526 
5527 	  ix86_expand_clear (low[0]);
5528 	  ix86_expand_clear (high[0]);
5529 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5530 
5531 	  d = gen_lowpart (QImode, low[0]);
5532 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5533 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
5534 	  emit_insn (gen_rtx_SET (d, s));
5535 
5536 	  d = gen_lowpart (QImode, high[0]);
5537 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5538 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
5539 	  emit_insn (gen_rtx_SET (d, s));
5540 	}
5541 
5542       /* Otherwise, we can get the same results by manually performing
5543 	 a bit extract operation on bit 5/6, and then performing the two
5544 	 shifts.  The two methods of getting 0/1 into low/high are exactly
5545 	 the same size.  Avoiding the shift in the bit extract case helps
5546 	 pentium4 a bit; no one else seems to care much either way.  */
5547       else
5548 	{
5549 	  rtx (*gen_lshr3)(rtx, rtx, rtx);
5550 	  rtx (*gen_and3)(rtx, rtx, rtx);
5551 	  rtx (*gen_xor3)(rtx, rtx, rtx);
5552 	  HOST_WIDE_INT bits;
5553 	  rtx x;
5554 
5555 	  if (mode == DImode)
5556 	    {
5557 	      gen_lshr3 = gen_lshrsi3;
5558 	      gen_and3 = gen_andsi3;
5559 	      gen_xor3 = gen_xorsi3;
5560 	      bits = 5;
5561 	    }
5562 	  else
5563 	    {
5564 	      gen_lshr3 = gen_lshrdi3;
5565 	      gen_and3 = gen_anddi3;
5566 	      gen_xor3 = gen_xordi3;
5567 	      bits = 6;
5568 	    }
5569 
5570 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5571 	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5572 	  else
5573 	    x = gen_lowpart (half_mode, operands[2]);
5574 	  emit_insn (gen_rtx_SET (high[0], x));
5575 
5576 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5577 	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5578 	  emit_move_insn (low[0], high[0]);
5579 	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5580 	}
5581 
5582       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5583       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5584       return;
5585     }
5586 
5587   if (operands[1] == constm1_rtx)
5588     {
5589       /* For -1 << N, we can avoid the shld instruction, because we
5590 	 know that we're shifting 0...31/63 ones into a -1.  */
5591       emit_move_insn (low[0], constm1_rtx);
5592       if (optimize_insn_for_size_p ())
5593 	emit_move_insn (high[0], low[0]);
5594       else
5595 	emit_move_insn (high[0], constm1_rtx);
5596     }
5597   else
5598     {
5599       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5600 
5601       if (!rtx_equal_p (operands[0], operands[1]))
5602 	emit_move_insn (operands[0], operands[1]);
5603 
5604       split_double_mode (mode, operands, 1, low, high);
5605       emit_insn (gen_shld (high[0], low[0], operands[2]));
5606     }
5607 
5608   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5609 
5610   if (TARGET_CMOVE && scratch)
5611     {
5612       ix86_expand_clear (scratch);
5613       emit_insn (gen_x86_shift_adj_1
5614 		 (half_mode, high[0], low[0], operands[2], scratch));
5615     }
5616   else
5617     emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5618 }
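
#if 0
/* Illustrative sketch, not part of the compiler and never compiled: the
   double-word left shift the splitter above emits (shld plus a plain
   shift), expressed on 32-bit halves of a 64-bit value.  COUNT is assumed
   to be already masked to 0..63.  Names are hypothetical.  */
#include <stdint.h>

static void
dw_ashl (uint32_t *lo, uint32_t *hi, int count)
{
  if (count >= 32)
    {
      *hi = *lo << (count - 32);
      *lo = 0;
    }
  else if (count > 0)
    {
      *hi = (*hi << count) | (*lo >> (32 - count));	/* this is what shld does  */
      *lo <<= count;
    }
}
#endif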
5619 
5620 void
5621 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5622 {
5623   rtx (*gen_ashr3)(rtx, rtx, rtx)
5624     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5625   rtx (*gen_shrd)(rtx, rtx, rtx);
5626   int half_width = GET_MODE_BITSIZE (mode) >> 1;
5627 
5628   rtx low[2], high[2];
5629   int count;
5630 
5631   if (CONST_INT_P (operands[2]))
5632     {
5633       split_double_mode (mode, operands, 2, low, high);
5634       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5635 
5636       if (count == GET_MODE_BITSIZE (mode) - 1)
5637 	{
5638 	  emit_move_insn (high[0], high[1]);
5639 	  emit_insn (gen_ashr3 (high[0], high[0],
5640 				GEN_INT (half_width - 1)));
5641 	  emit_move_insn (low[0], high[0]);
5642 
5643 	}
5644       else if (count >= half_width)
5645 	{
5646 	  emit_move_insn (low[0], high[1]);
5647 	  emit_move_insn (high[0], low[0]);
5648 	  emit_insn (gen_ashr3 (high[0], high[0],
5649 				GEN_INT (half_width - 1)));
5650 
5651 	  if (count > half_width)
5652 	    emit_insn (gen_ashr3 (low[0], low[0],
5653 				  GEN_INT (count - half_width)));
5654 	}
5655       else
5656 	{
5657 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5658 
5659 	  if (!rtx_equal_p (operands[0], operands[1]))
5660 	    emit_move_insn (operands[0], operands[1]);
5661 
5662 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5663 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5664 	}
5665     }
5666   else
5667     {
5668       machine_mode half_mode;
5669 
5670       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5671 
5672       if (!rtx_equal_p (operands[0], operands[1]))
5673 	emit_move_insn (operands[0], operands[1]);
5674 
5675       split_double_mode (mode, operands, 1, low, high);
5676       half_mode = mode == DImode ? SImode : DImode;
5677 
5678       emit_insn (gen_shrd (low[0], high[0], operands[2]));
5679       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5680 
5681       if (TARGET_CMOVE && scratch)
5682 	{
5683 	  emit_move_insn (scratch, high[0]);
5684 	  emit_insn (gen_ashr3 (scratch, scratch,
5685 				GEN_INT (half_width - 1)));
5686 	  emit_insn (gen_x86_shift_adj_1
5687 		     (half_mode, low[0], high[0], operands[2], scratch));
5688 	}
5689       else
5690 	emit_insn (gen_x86_shift_adj_3
5691 		   (half_mode, low[0], high[0], operands[2]));
5692     }
5693 }
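
#if 0
/* Illustrative sketch, not part of the compiler and never compiled: the
   double-word arithmetic right shift on 32-bit halves, matching the cases
   handled above (shrd for small counts, sign replication otherwise).
   COUNT is assumed already masked to 0..63; a right shift of a negative
   int32_t is assumed to be arithmetic.  Names are hypothetical.  */
#include <stdint.h>

static void
dw_ashr (uint32_t *lo, int32_t *hi, int count)
{
  if (count >= 32)
    {
      *lo = (uint32_t) (*hi >> (count - 32));
      *hi >>= 31;					/* replicate the sign bit  */
    }
  else if (count > 0)
    {
      *lo = (*lo >> count) | ((uint32_t) *hi << (32 - count));	/* shrd  */
      *hi >>= count;
    }
}
#endif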
5694 
5695 void
5696 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5697 {
5698   rtx (*gen_lshr3)(rtx, rtx, rtx)
5699     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5700   rtx (*gen_shrd)(rtx, rtx, rtx);
5701   int half_width = GET_MODE_BITSIZE (mode) >> 1;
5702 
5703   rtx low[2], high[2];
5704   int count;
5705 
5706   if (CONST_INT_P (operands[2]))
5707     {
5708       split_double_mode (mode, operands, 2, low, high);
5709       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5710 
5711       if (count >= half_width)
5712 	{
5713 	  emit_move_insn (low[0], high[1]);
5714 	  ix86_expand_clear (high[0]);
5715 
5716 	  if (count > half_width)
5717 	    emit_insn (gen_lshr3 (low[0], low[0],
5718 				  GEN_INT (count - half_width)));
5719 	}
5720       else
5721 	{
5722 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5723 
5724 	  if (!rtx_equal_p (operands[0], operands[1]))
5725 	    emit_move_insn (operands[0], operands[1]);
5726 
5727 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5728 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5729 	}
5730     }
5731   else
5732     {
5733       machine_mode half_mode;
5734 
5735       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5736 
5737       if (!rtx_equal_p (operands[0], operands[1]))
5738 	emit_move_insn (operands[0], operands[1]);
5739 
5740       split_double_mode (mode, operands, 1, low, high);
5741       half_mode = mode == DImode ? SImode : DImode;
5742 
5743       emit_insn (gen_shrd (low[0], high[0], operands[2]));
5744       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5745 
5746       if (TARGET_CMOVE && scratch)
5747 	{
5748 	  ix86_expand_clear (scratch);
5749 	  emit_insn (gen_x86_shift_adj_1
5750 		     (half_mode, low[0], high[0], operands[2], scratch));
5751 	}
5752       else
5753 	emit_insn (gen_x86_shift_adj_2
5754 		   (half_mode, low[0], high[0], operands[2]));
5755     }
5756 }
5757 
5758 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
5759    DImode for constant loop counts.  */
5760 
5761 static machine_mode
5762 counter_mode (rtx count_exp)
5763 {
5764   if (GET_MODE (count_exp) != VOIDmode)
5765     return GET_MODE (count_exp);
5766   if (!CONST_INT_P (count_exp))
5767     return Pmode;
5768   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5769     return DImode;
5770   return SImode;
5771 }
5772 
5773 /* When ISSETMEM is FALSE, output a simple loop that moves memory from SRCPTR
5774    to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
5775    COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
5776    loop that sets memory to VALUE (supposed to be in MODE).
5777 
5778    The size is rounded down to a whole number of chunks moved at once.
5779    SRCMEM and DESTMEM provide MEM rtxes to supply proper aliasing info.  */
5780 
5781 
5782 static void
5783 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5784 			       rtx destptr, rtx srcptr, rtx value,
5785 			       rtx count, machine_mode mode, int unroll,
5786 			       int expected_size, bool issetmem)
5787 {
5788   rtx_code_label *out_label, *top_label;
5789   rtx iter, tmp;
5790   machine_mode iter_mode = counter_mode (count);
5791   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5792   rtx piece_size = GEN_INT (piece_size_n);
5793   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5794   rtx size;
5795   int i;
5796 
5797   top_label = gen_label_rtx ();
5798   out_label = gen_label_rtx ();
5799   iter = gen_reg_rtx (iter_mode);
5800 
5801   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5802 			      NULL, 1, OPTAB_DIRECT);
5803   /* Those two should combine.  */
5804   if (piece_size == const1_rtx)
5805     {
5806       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5807 			       true, out_label);
5808       predict_jump (REG_BR_PROB_BASE * 10 / 100);
5809     }
5810   emit_move_insn (iter, const0_rtx);
5811 
5812   emit_label (top_label);
5813 
5814   tmp = convert_modes (Pmode, iter_mode, iter, true);
5815 
5816   /* This assert could be relaxed - in that case we'd need to compute the
5817      smallest power of two containing PIECE_SIZE_N and pass it to
5818      offset_address.  */
5819   gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5820   destmem = offset_address (destmem, tmp, piece_size_n);
5821   destmem = adjust_address (destmem, mode, 0);
5822 
5823   if (!issetmem)
5824     {
5825       srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5826       srcmem = adjust_address (srcmem, mode, 0);
5827 
5828       /* When unrolling for chips that reorder memory reads and writes,
5829 	 we can save registers by using a single temporary.
5830 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
5831       if (!TARGET_64BIT && 0)
5832 	{
5833 	  for (i = 0; i < unroll; i++)
5834 	    {
5835 	      if (i)
5836 		{
5837 		  destmem = adjust_address (copy_rtx (destmem), mode,
5838 					    GET_MODE_SIZE (mode));
5839 		  srcmem = adjust_address (copy_rtx (srcmem), mode,
5840 					   GET_MODE_SIZE (mode));
5841 		}
5842 	      emit_move_insn (destmem, srcmem);
5843 	    }
5844 	}
5845       else
5846 	{
5847 	  rtx tmpreg[4];
5848 	  gcc_assert (unroll <= 4);
5849 	  for (i = 0; i < unroll; i++)
5850 	    {
5851 	      tmpreg[i] = gen_reg_rtx (mode);
5852 	      if (i)
5853 		srcmem = adjust_address (copy_rtx (srcmem), mode,
5854 					 GET_MODE_SIZE (mode));
5855 	      emit_move_insn (tmpreg[i], srcmem);
5856 	    }
5857 	  for (i = 0; i < unroll; i++)
5858 	    {
5859 	      if (i)
5860 		destmem = adjust_address (copy_rtx (destmem), mode,
5861 					  GET_MODE_SIZE (mode));
5862 	      emit_move_insn (destmem, tmpreg[i]);
5863 	    }
5864 	}
5865     }
5866   else
5867     for (i = 0; i < unroll; i++)
5868       {
5869 	if (i)
5870 	  destmem = adjust_address (copy_rtx (destmem), mode,
5871 				    GET_MODE_SIZE (mode));
5872 	emit_move_insn (destmem, value);
5873       }
5874 
5875   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5876 			     true, OPTAB_LIB_WIDEN);
5877   if (tmp != iter)
5878     emit_move_insn (iter, tmp);
5879 
5880   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5881 			   true, top_label);
5882   if (expected_size != -1)
5883     {
5884       expected_size /= GET_MODE_SIZE (mode) * unroll;
5885       if (expected_size == 0)
5886 	predict_jump (0);
5887       else if (expected_size > REG_BR_PROB_BASE)
5888 	predict_jump (REG_BR_PROB_BASE - 1);
5889       else
5890         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5891 		      / expected_size);
5892     }
5893   else
5894     predict_jump (REG_BR_PROB_BASE * 80 / 100);
5895   iter = ix86_zero_extend_to_Pmode (iter);
5896   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5897 			     true, OPTAB_LIB_WIDEN);
5898   if (tmp != destptr)
5899     emit_move_insn (destptr, tmp);
5900   if (!issetmem)
5901     {
5902       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5903 				 true, OPTAB_LIB_WIDEN);
5904       if (tmp != srcptr)
5905 	emit_move_insn (srcptr, tmp);
5906     }
5907   emit_label (out_label);
5908 }
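
#if 0
/* Illustrative sketch, not part of the compiler and never compiled: the
   shape of the copy loop emitted above, shown for UNROLL == 1 and a 4-byte
   chunk.  COUNT is first rounded down to whole chunks; the leftover bytes
   are handled by the epilogue code elsewhere.  Names are hypothetical.  */
#include <stddef.h>
#include <string.h>

static void
copy_loop_shape (unsigned char *dst, const unsigned char *src, size_t count)
{
  const size_t piece = 4;			/* GET_MODE_SIZE (mode) * unroll  */
  size_t size = count & ~(piece - 1);		/* count & piece_size_mask  */
  size_t iter;

  for (iter = 0; iter < size; iter += piece)
    memcpy (dst + iter, src + iter, piece);	/* the (unrolled) loop body  */
  /* DESTPTR/SRCPTR are then advanced by ITER before the epilogue runs.  */
}
#endif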
5909 
5910 /* Divide COUNTREG by SCALE.  */
5911 static rtx
5912 scale_counter (rtx countreg, int scale)
5913 {
5914   rtx sc;
5915 
5916   if (scale == 1)
5917     return countreg;
5918   if (CONST_INT_P (countreg))
5919     return GEN_INT (INTVAL (countreg) / scale);
5920   gcc_assert (REG_P (countreg));
5921 
5922   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5923 			    GEN_INT (exact_log2 (scale)),
5924 			    NULL, 1, OPTAB_DIRECT);
5925   return sc;
5926 }
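
#if 0
/* Illustrative sketch, not part of the compiler and never compiled:
   scale_counter divides the byte count by a power-of-two chunk size with a
   logical right shift, which is what the function above emits for
   non-constant counts.  The helper name is hypothetical.  */
#include <stdint.h>

static uint64_t
scale_count (uint64_t count, unsigned int scale)
{
  /* SCALE must be a power of two, mirroring the exact_log2 use above.  */
  return count >> __builtin_ctz (scale);	/* == count / scale  */
}
#endif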
5927 
5928 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5929    When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5930    When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5931    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5932    ORIG_VALUE is the original value passed to memset to fill the memory with.
5933    Other arguments have same meaning as for previous function.  */
5934 
5935 static void
5936 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5937 			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5938 			   rtx count,
5939 			   machine_mode mode, bool issetmem)
5940 {
5941   rtx destexp;
5942   rtx srcexp;
5943   rtx countreg;
5944   HOST_WIDE_INT rounded_count;
5945 
5946   /* If possible, it is shorter to use rep movs.
5947      TODO: Maybe it is better to move this logic to decide_alg.  */
5948   if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5949       && (!issetmem || orig_value == const0_rtx))
5950     mode = SImode;
5951 
5952   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5953     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5954 
5955   countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5956 						       GET_MODE_SIZE (mode)));
5957   if (mode != QImode)
5958     {
5959       destexp = gen_rtx_ASHIFT (Pmode, countreg,
5960 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5961       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5962     }
5963   else
5964     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5965   if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5966     {
5967       rounded_count
5968 	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5969       destmem = shallow_copy_rtx (destmem);
5970       set_mem_size (destmem, rounded_count);
5971     }
5972   else if (MEM_SIZE_KNOWN_P (destmem))
5973     clear_mem_size (destmem);
5974 
5975   if (issetmem)
5976     {
5977       value = force_reg (mode, gen_lowpart (mode, value));
5978       emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5979     }
5980   else
5981     {
5982       if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5983 	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5984       if (mode != QImode)
5985 	{
5986 	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5987 				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5988 	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5989 	}
5990       else
5991 	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
5992       if (CONST_INT_P (count))
5993 	{
5994 	  rounded_count
5995 	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5996 	  srcmem = shallow_copy_rtx (srcmem);
5997 	  set_mem_size (srcmem, rounded_count);
5998 	}
5999       else
6000 	{
6001 	  if (MEM_SIZE_KNOWN_P (srcmem))
6002 	    clear_mem_size (srcmem);
6003 	}
6004       emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6005 			      destexp, srcexp));
6006     }
6007 }
6008 
6009 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6010    DESTMEM.
6011    SRCMEM is passed by pointer so that it can be updated on return.
6012    The return value is the updated DESTMEM.  */
6013 static rtx
6014 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6015 	     HOST_WIDE_INT size_to_move)
6016 {
6017   rtx dst = destmem, src = *srcmem, adjust, tempreg;
6018   enum insn_code code;
6019   machine_mode move_mode;
6020   int piece_size, i;
6021 
6022   /* Find the widest mode in which we could perform moves.
6023      Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6024      it until a move of that size is supported.  */
6025   piece_size = 1 << floor_log2 (size_to_move);
6026   while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6027 	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6028     {
6029       gcc_assert (piece_size > 1);
6030       piece_size >>= 1;
6031     }
6032 
6033   /* Find the corresponding vector mode with the same size as MOVE_MODE.
6034      MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
6035   if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6036     {
6037       int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6038       if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6039 	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6040 	{
6041 	  move_mode = word_mode;
6042 	  piece_size = GET_MODE_SIZE (move_mode);
6043 	  code = optab_handler (mov_optab, move_mode);
6044 	}
6045     }
6046   gcc_assert (code != CODE_FOR_nothing);
6047 
6048   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6049   src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6050 
6051   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
6052   gcc_assert (size_to_move % piece_size == 0);
6053   adjust = GEN_INT (piece_size);
6054   for (i = 0; i < size_to_move; i += piece_size)
6055     {
6056       /* We move from memory to memory, so we'll need to do it via
6057 	 a temporary register.  */
6058       tempreg = gen_reg_rtx (move_mode);
6059       emit_insn (GEN_FCN (code) (tempreg, src));
6060       emit_insn (GEN_FCN (code) (dst, tempreg));
6061 
6062       emit_move_insn (destptr,
6063 		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6064       emit_move_insn (srcptr,
6065 		      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
6066 
6067       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6068 					  piece_size);
6069       src = adjust_automodify_address_nv (src, move_mode, srcptr,
6070 					  piece_size);
6071     }
6072 
6073   /* Update DST and SRC rtx.  */
6074   *srcmem = src;
6075   return dst;
6076 }
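
#if 0
/* Illustrative sketch, not part of the compiler and never compiled:
   emit_memmov starts from the largest power of two not exceeding
   SIZE_TO_MOVE, which is what 1 << floor_log2 (size_to_move) computes.
   The helper name is hypothetical; SIZE_TO_MOVE is assumed >= 1.  */
static int
widest_piece (int size_to_move)
{
  int piece = 1;

  while (piece * 2 <= size_to_move)
    piece *= 2;
  return piece;		/* largest power of two <= size_to_move  */
}
#endif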
6077 
6078 /* Helper function for the string operations below.  Test VARIABLE for
6079    whether it is aligned to VALUE bytes.  If so, jump to the returned label.  */
6080 
6081 static rtx_code_label *
6082 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6083 {
6084   rtx_code_label *label = gen_label_rtx ();
6085   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6086   if (GET_MODE (variable) == DImode)
6087     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6088   else
6089     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6090   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6091 			   1, label);
6092   if (epilogue)
6093     predict_jump (REG_BR_PROB_BASE * 50 / 100);
6094   else
6095     predict_jump (REG_BR_PROB_BASE * 90 / 100);
6096   return label;
6097 }
6098 
6099 
6100 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
6101 
6102 static void
6103 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6104 			rtx destptr, rtx srcptr, rtx count, int max_size)
6105 {
6106   rtx src, dest;
6107   if (CONST_INT_P (count))
6108     {
6109       HOST_WIDE_INT countval = INTVAL (count);
6110       HOST_WIDE_INT epilogue_size = countval % max_size;
6111       int i;
6112 
6113       /* For now MAX_SIZE should be a power of 2.  This assert could be
6114 	 relaxed, but it would require a somewhat more complicated epilogue
6115 	 expansion.  */
6116       gcc_assert ((max_size & (max_size - 1)) == 0);
6117       for (i = max_size; i >= 1; i >>= 1)
6118 	{
6119 	  if (epilogue_size & i)
6120 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6121 	}
6122       return;
6123     }
6124   if (max_size > 8)
6125     {
6126       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6127 				    count, 1, OPTAB_DIRECT);
6128       expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6129 				     count, QImode, 1, 4, false);
6130       return;
6131     }
6132 
6133   /* When single-instruction stringops are available, we can cheaply increase
6134      the dest and src pointers.  Otherwise we save code size by maintaining an
6135      offset (zero is readily available from the preceding rep operation) and
6136      using x86 addressing modes.  */
6137   if (TARGET_SINGLE_STRINGOP)
6138     {
6139       if (max_size > 4)
6140 	{
6141 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6142 	  src = change_address (srcmem, SImode, srcptr);
6143 	  dest = change_address (destmem, SImode, destptr);
6144 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6145 	  emit_label (label);
6146 	  LABEL_NUSES (label) = 1;
6147 	}
6148       if (max_size > 2)
6149 	{
6150 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6151 	  src = change_address (srcmem, HImode, srcptr);
6152 	  dest = change_address (destmem, HImode, destptr);
6153 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6154 	  emit_label (label);
6155 	  LABEL_NUSES (label) = 1;
6156 	}
6157       if (max_size > 1)
6158 	{
6159 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6160 	  src = change_address (srcmem, QImode, srcptr);
6161 	  dest = change_address (destmem, QImode, destptr);
6162 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
6163 	  emit_label (label);
6164 	  LABEL_NUSES (label) = 1;
6165 	}
6166     }
6167   else
6168     {
6169       rtx offset = force_reg (Pmode, const0_rtx);
6170       rtx tmp;
6171 
6172       if (max_size > 4)
6173 	{
6174 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6175 	  src = change_address (srcmem, SImode, srcptr);
6176 	  dest = change_address (destmem, SImode, destptr);
6177 	  emit_move_insn (dest, src);
6178 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6179 				     true, OPTAB_LIB_WIDEN);
6180 	  if (tmp != offset)
6181 	    emit_move_insn (offset, tmp);
6182 	  emit_label (label);
6183 	  LABEL_NUSES (label) = 1;
6184 	}
6185       if (max_size > 2)
6186 	{
6187 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6188 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6189 	  src = change_address (srcmem, HImode, tmp);
6190 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6191 	  dest = change_address (destmem, HImode, tmp);
6192 	  emit_move_insn (dest, src);
6193 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6194 				     true, OPTAB_LIB_WIDEN);
6195 	  if (tmp != offset)
6196 	    emit_move_insn (offset, tmp);
6197 	  emit_label (label);
6198 	  LABEL_NUSES (label) = 1;
6199 	}
6200       if (max_size > 1)
6201 	{
6202 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6203 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6204 	  src = change_address (srcmem, QImode, tmp);
6205 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6206 	  dest = change_address (destmem, QImode, tmp);
6207 	  emit_move_insn (dest, src);
6208 	  emit_label (label);
6209 	  LABEL_NUSES (label) = 1;
6210 	}
6211     }
6212 }
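
#if 0
/* Illustrative sketch, not part of the compiler and never compiled: the
   epilogue above peels off the trailing bytes by testing single bits of
   COUNT, shown here for MAX_SIZE == 8.  Names are hypothetical.  */
#include <stddef.h>
#include <string.h>

static void
copy_tail (unsigned char *dst, const unsigned char *src, size_t count)
{
  size_t done = count & ~(size_t) 7;	/* already handled by the main loop  */

  if (count & 4) { memcpy (dst + done, src + done, 4); done += 4; }
  if (count & 2) { memcpy (dst + done, src + done, 2); done += 2; }
  if (count & 1) dst[done] = src[done];
}
#endif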
6213 
6214 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6215    with value PROMOTED_VAL.
6216    DESTPTR is advanced as the stores are emitted.
6217    The return value is the updated DESTMEM.  */
6218 static rtx
6219 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6220 	     HOST_WIDE_INT size_to_move)
6221 {
6222   rtx dst = destmem, adjust;
6223   enum insn_code code;
6224   machine_mode move_mode;
6225   int piece_size, i;
6226 
6227   /* Find the widest mode in which we could perform moves.
6228      Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6229      it until a move of that size is supported.  */
6230   move_mode = GET_MODE (promoted_val);
6231   if (move_mode == VOIDmode)
6232     move_mode = QImode;
6233   if (size_to_move < GET_MODE_SIZE (move_mode))
6234     {
6235       unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6236       move_mode = int_mode_for_size (move_bits, 0).require ();
6237       promoted_val = gen_lowpart (move_mode, promoted_val);
6238     }
6239   piece_size = GET_MODE_SIZE (move_mode);
6240   code = optab_handler (mov_optab, move_mode);
6241   gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6242 
6243   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6244 
6245   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
6246   gcc_assert (size_to_move % piece_size == 0);
6247   adjust = GEN_INT (piece_size);
6248   for (i = 0; i < size_to_move; i += piece_size)
6249     {
6250       if (piece_size <= GET_MODE_SIZE (word_mode))
6251 	{
6252 	  emit_insn (gen_strset (destptr, dst, promoted_val));
6253 	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6254 					      piece_size);
6255 	  continue;
6256 	}
6257 
6258       emit_insn (GEN_FCN (code) (dst, promoted_val));
6259 
6260       emit_move_insn (destptr,
6261 		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6262 
6263       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6264 					  piece_size);
6265     }
6266 
6267   /* Update DST rtx.  */
6268   return dst;
6269 }
6270 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6271 static void
6272 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6273 				 rtx count, int max_size)
6274 {
6275   count = expand_simple_binop (counter_mode (count), AND, count,
6276 			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6277   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6278 				 gen_lowpart (QImode, value), count, QImode,
6279 				 1, max_size / 2, true);
6280 }
6281 
6282 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6283 static void
6284 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6285 			rtx count, int max_size)
6286 {
6287   rtx dest;
6288 
6289   if (CONST_INT_P (count))
6290     {
6291       HOST_WIDE_INT countval = INTVAL (count);
6292       HOST_WIDE_INT epilogue_size = countval % max_size;
6293       int i;
6294 
6295       /* For now MAX_SIZE should be a power of 2.  This assert could be
6296 	 relaxed, but it would require a somewhat more complicated epilogue
6297 	 expansion.  */
6298       gcc_assert ((max_size & (max_size - 1)) == 0);
6299       for (i = max_size; i >= 1; i >>= 1)
6300 	{
6301 	  if (epilogue_size & i)
6302 	    {
6303 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6304 		destmem = emit_memset (destmem, destptr, vec_value, i);
6305 	      else
6306 		destmem = emit_memset (destmem, destptr, value, i);
6307 	    }
6308 	}
6309       return;
6310     }
6311   if (max_size > 32)
6312     {
6313       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6314       return;
6315     }
6316   if (max_size > 16)
6317     {
6318       rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6319       if (TARGET_64BIT)
6320 	{
6321 	  dest = change_address (destmem, DImode, destptr);
6322 	  emit_insn (gen_strset (destptr, dest, value));
6323 	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6324 	  emit_insn (gen_strset (destptr, dest, value));
6325 	}
6326       else
6327 	{
6328 	  dest = change_address (destmem, SImode, destptr);
6329 	  emit_insn (gen_strset (destptr, dest, value));
6330 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6331 	  emit_insn (gen_strset (destptr, dest, value));
6332 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6333 	  emit_insn (gen_strset (destptr, dest, value));
6334 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6335 	  emit_insn (gen_strset (destptr, dest, value));
6336 	}
6337       emit_label (label);
6338       LABEL_NUSES (label) = 1;
6339     }
6340   if (max_size > 8)
6341     {
6342       rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6343       if (TARGET_64BIT)
6344 	{
6345 	  dest = change_address (destmem, DImode, destptr);
6346 	  emit_insn (gen_strset (destptr, dest, value));
6347 	}
6348       else
6349 	{
6350 	  dest = change_address (destmem, SImode, destptr);
6351 	  emit_insn (gen_strset (destptr, dest, value));
6352 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6353 	  emit_insn (gen_strset (destptr, dest, value));
6354 	}
6355       emit_label (label);
6356       LABEL_NUSES (label) = 1;
6357     }
6358   if (max_size > 4)
6359     {
6360       rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6361       dest = change_address (destmem, SImode, destptr);
6362       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6363       emit_label (label);
6364       LABEL_NUSES (label) = 1;
6365     }
6366   if (max_size > 2)
6367     {
6368       rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6369       dest = change_address (destmem, HImode, destptr);
6370       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6371       emit_label (label);
6372       LABEL_NUSES (label) = 1;
6373     }
6374   if (max_size > 1)
6375     {
6376       rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6377       dest = change_address (destmem, QImode, destptr);
6378       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6379       emit_label (label);
6380       LABEL_NUSES (label) = 1;
6381     }
6382 }
6383 
6384 /* Decrease COUNTREG by VALUE.  */
6385 static void
6386 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6387 {
6388   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6389 }
6390 
6391 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6392    DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
6393    Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6394    ignored.
6395    The return value is the updated DESTMEM.  */
6396 
6397 static rtx
6398 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6399 				  rtx destptr, rtx srcptr, rtx value,
6400 				  rtx vec_value, rtx count, int align,
6401 				  int desired_alignment, bool issetmem)
6402 {
6403   int i;
6404   for (i = 1; i < desired_alignment; i <<= 1)
6405     {
6406       if (align <= i)
6407 	{
6408 	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6409 	  if (issetmem)
6410 	    {
6411 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6412 		destmem = emit_memset (destmem, destptr, vec_value, i);
6413 	      else
6414 		destmem = emit_memset (destmem, destptr, value, i);
6415 	    }
6416 	  else
6417 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6418 	  ix86_adjust_counter (count, i);
6419 	  emit_label (label);
6420 	  LABEL_NUSES (label) = 1;
6421 	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6422 	}
6423     }
6424   return destmem;
6425 }
6426 
6427 /* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
6428    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6429    and jump to DONE_LABEL.  */
6430 static void
6431 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6432 			       rtx destptr, rtx srcptr,
6433 			       rtx value, rtx vec_value,
6434 			       rtx count, int size,
6435 			       rtx done_label, bool issetmem)
6436 {
6437   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6438   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6439   rtx modesize;
6440   int n;
6441 
6442   /* If we do not have a vector value to copy, we must reduce the size.  */
6443   if (issetmem)
6444     {
6445       if (!vec_value)
6446 	{
6447 	  if (GET_MODE (value) == VOIDmode && size > 8)
6448 	    mode = Pmode;
6449 	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6450 	    mode = GET_MODE (value);
6451 	}
6452       else
6453 	mode = GET_MODE (vec_value), value = vec_value;
6454     }
6455   else
6456     {
6457       /* Choose appropriate vector mode.  */
6458       if (size >= 32)
6459 	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6460       else if (size >= 16)
6461 	mode = TARGET_SSE ? V16QImode : DImode;
6462       srcmem = change_address (srcmem, mode, srcptr);
6463     }
6464   destmem = change_address (destmem, mode, destptr);
6465   modesize = GEN_INT (GET_MODE_SIZE (mode));
6466   gcc_assert (GET_MODE_SIZE (mode) <= size);
6467   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6468     {
6469       if (issetmem)
6470 	emit_move_insn (destmem, gen_lowpart (mode, value));
6471       else
6472 	{
6473           emit_move_insn (destmem, srcmem);
6474           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6475 	}
6476       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6477     }
6478 
6479   destmem = offset_address (destmem, count, 1);
6480   destmem = offset_address (destmem, GEN_INT (-2 * size),
6481 			    GET_MODE_SIZE (mode));
6482   if (!issetmem)
6483     {
6484       srcmem = offset_address (srcmem, count, 1);
6485       srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6486 			       GET_MODE_SIZE (mode));
6487     }
6488   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6489     {
6490       if (issetmem)
6491 	emit_move_insn (destmem, gen_lowpart (mode, value));
6492       else
6493 	{
6494 	  emit_move_insn (destmem, srcmem);
6495 	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6496 	}
6497       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6498     }
6499   emit_jump_insn (gen_jump (done_label));
6500   emit_barrier ();
6501 
6502   emit_label (label);
6503   LABEL_NUSES (label) = 1;
6504 }
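
#if 0
/* Illustrative sketch, not part of the compiler and never compiled: a block
   of SIZE..2*SIZE-1 bytes is covered by just two possibly overlapping
   SIZE-byte moves, one from the start and one ending at the last byte,
   which is the trick the helper above uses.  Names are hypothetical and the
   source and destination buffers are assumed not to overlap.  */
#include <stddef.h>
#include <string.h>

static void
copy_small (unsigned char *dst, const unsigned char *src,
	    size_t count, size_t size)
{
  /* Precondition: size <= count && count < 2 * size.  */
  memcpy (dst, src, size);
  memcpy (dst + count - size, src + count - size, size);
}
#endif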
6505 
6506 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power
6507    of 2), and get ready for the main memcpy loop by copying the initial
6508    DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
6509    DESTPTR/SRCPTR/COUNT so that we can proceed with a loop copying SIZE
6510    bytes at once.  Do the moves in MODE.
6511    DONE_LABEL is a label after the whole copying sequence.  The label is
6512    created on demand if *DONE_LABEL is NULL.
6513    MIN_SIZE is the minimal size of the block copied.  This value gets
6514 
6515    adjusted for the new bounds after the initial copies.
6516    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6517    DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
6518    we will dispatch to a library call for large blocks.
6518 
6519    In pseudocode we do:
6520 
6521    if (COUNT < SIZE)
6522      {
6523        Assume that SIZE is 4. Bigger sizes are handled analogously
6524        if (COUNT & 4)
6525 	 {
6526 	    copy 4 bytes from SRCPTR to DESTPTR
6527 	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6528 	    goto done_label
6529 	 }
6530        if (!COUNT)
6531 	 goto done_label;
6532        copy 1 byte from SRCPTR to DESTPTR
6533        if (COUNT & 2)
6534 	 {
6535 	    copy 2 bytes from SRCPTR to DESTPTR
6536 	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6537 	 }
6538      }
6539    else
6540      {
6541        copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6542        copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6543 
6544        OLD_DESTPTR = DESTPTR;
6545        Align DESTPTR up to DESIRED_ALIGN
6546        SRCPTR += DESTPTR - OLD_DESTPTR
6547        COUNT -= DESTPTR - OLD_DESTPTR
6548        if (DYNAMIC_CHECK)
6549 	 Round COUNT down to multiple of SIZE
6550        << optional caller supplied zero size guard is here >>
6551        << optional caller supplied dynamic check is here >>
6552        << caller supplied main copy loop is here >>
6553      }
6554    done_label:
6555   */
6556 static void
6557 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6558 							    rtx *destptr, rtx *srcptr,
6559 							    machine_mode mode,
6560 							    rtx value, rtx vec_value,
6561 							    rtx *count,
6562 							    rtx_code_label **done_label,
6563 							    int size,
6564 							    int desired_align,
6565 							    int align,
6566 							    unsigned HOST_WIDE_INT *min_size,
6567 							    bool dynamic_check,
6568 							    bool issetmem)
6569 {
6570   rtx_code_label *loop_label = NULL, *label;
6571   int n;
6572   rtx modesize;
6573   int prolog_size = 0;
6574   rtx mode_value;
6575 
6576   /* Choose the proper value to copy.  */
6577   if (issetmem && VECTOR_MODE_P (mode))
6578     mode_value = vec_value;
6579   else
6580     mode_value = value;
6581   gcc_assert (GET_MODE_SIZE (mode) <= size);
6582 
6583   /* See if block is big or small, handle small blocks.  */
6584   if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6585     {
6586       int size2 = size;
6587       loop_label = gen_label_rtx ();
6588 
6589       if (!*done_label)
6590 	*done_label = gen_label_rtx ();
6591 
6592       emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6593 			       1, loop_label);
6594       size2 >>= 1;
6595 
6596       /* Handle sizes > 3.  */
6597       for (;size2 > 2; size2 >>= 1)
6598 	expand_small_cpymem_or_setmem (destmem, srcmem,
6599 				       *destptr, *srcptr,
6600 				       value, vec_value,
6601 				       *count,
6602 				       size2, *done_label, issetmem);
6603       /* Nothing to copy?  Jump to DONE_LABEL if so.  */
6604       emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6605 			       1, *done_label);
6606 
6607       /* Do a byte copy.  */
6608       destmem = change_address (destmem, QImode, *destptr);
6609       if (issetmem)
6610 	emit_move_insn (destmem, gen_lowpart (QImode, value));
6611       else
6612 	{
6613           srcmem = change_address (srcmem, QImode, *srcptr);
6614           emit_move_insn (destmem, srcmem);
6615 	}
6616 
6617       /* Handle sizes 2 and 3.  */
6618       label = ix86_expand_aligntest (*count, 2, false);
6619       destmem = change_address (destmem, HImode, *destptr);
6620       destmem = offset_address (destmem, *count, 1);
6621       destmem = offset_address (destmem, GEN_INT (-2), 2);
6622       if (issetmem)
6623         emit_move_insn (destmem, gen_lowpart (HImode, value));
6624       else
6625 	{
6626 	  srcmem = change_address (srcmem, HImode, *srcptr);
6627 	  srcmem = offset_address (srcmem, *count, 1);
6628 	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6629 	  emit_move_insn (destmem, srcmem);
6630 	}
6631 
6632       emit_label (label);
6633       LABEL_NUSES (label) = 1;
6634       emit_jump_insn (gen_jump (*done_label));
6635       emit_barrier ();
6636     }
6637   else
6638     gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6639 		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6640 
6641   /* Start memcpy for COUNT >= SIZE.  */
6642   if (loop_label)
6643     {
6644        emit_label (loop_label);
6645        LABEL_NUSES (loop_label) = 1;
6646     }
6647 
6648   /* Copy first desired_align bytes.  */
6649   if (!issetmem)
6650     srcmem = change_address (srcmem, mode, *srcptr);
6651   destmem = change_address (destmem, mode, *destptr);
6652   modesize = GEN_INT (GET_MODE_SIZE (mode));
6653   for (n = 0; prolog_size < desired_align - align; n++)
6654     {
6655       if (issetmem)
6656         emit_move_insn (destmem, mode_value);
6657       else
6658 	{
6659           emit_move_insn (destmem, srcmem);
6660           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6661 	}
6662       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6663       prolog_size += GET_MODE_SIZE (mode);
6664     }
6665 
6666 
6667   /* Copy last SIZE bytes.  */
6668   destmem = offset_address (destmem, *count, 1);
6669   destmem = offset_address (destmem,
6670 			    GEN_INT (-size - prolog_size),
6671 			    1);
6672   if (issetmem)
6673     emit_move_insn (destmem, mode_value);
6674   else
6675     {
6676       srcmem = offset_address (srcmem, *count, 1);
6677       srcmem = offset_address (srcmem,
6678 			       GEN_INT (-size - prolog_size),
6679 			       1);
6680       emit_move_insn (destmem, srcmem);
6681     }
6682   for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6683     {
6684       destmem = offset_address (destmem, modesize, 1);
6685       if (issetmem)
6686 	emit_move_insn (destmem, mode_value);
6687       else
6688 	{
6689           srcmem = offset_address (srcmem, modesize, 1);
6690           emit_move_insn (destmem, srcmem);
6691 	}
6692     }
6693 
6694   /* Align destination.  */
6695   if (desired_align > 1 && desired_align > align)
6696     {
6697       rtx saveddest = *destptr;
6698 
6699       gcc_assert (desired_align <= size);
6700       /* Align destptr up, placing it in a new register.  */
6701       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6702 				      GEN_INT (prolog_size),
6703 				      NULL_RTX, 1, OPTAB_DIRECT);
6704       if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6705 	REG_POINTER (*destptr) = 1;
6706       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6707 				      GEN_INT (-desired_align),
6708 				      *destptr, 1, OPTAB_DIRECT);
6709       /* See how many bytes we skipped.  */
6710       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6711 				       *destptr,
6712 				       saveddest, 1, OPTAB_DIRECT);
6713       /* Adjust srcptr and count.  */
6714       if (!issetmem)
6715 	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6716 				       saveddest, *srcptr, 1, OPTAB_DIRECT);
6717       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6718 				    saveddest, *count, 1, OPTAB_DIRECT);
6719       /* We copied at most size + prolog_size.  */
6720       if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6721 	*min_size
6722 	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6723       else
6724 	*min_size = 0;
6725 
6726       /* Our loops always round down the block size, but for the dispatch to
6727          the library we need the precise value.  */
6728       if (dynamic_check)
6729 	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
6730 				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6731     }
6732   else
6733     {
6734       gcc_assert (prolog_size == 0);
6735       /* Decrease the count, so we won't end up copying the last word twice.  */
6736       if (!CONST_INT_P (*count))
6737 	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6738 				      constm1_rtx, *count, 1, OPTAB_DIRECT);
6739       else
6740 	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6741 				      (unsigned HOST_WIDE_INT)size));
6742       if (*min_size)
6743 	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6744     }
6745 }
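
#if 0
/* Illustrative sketch, not part of the compiler and never compiled: a
   simplified version of the destination-alignment step above.  The
   destination pointer is rounded up to DESIRED_ALIGN (a power of two), and
   SRC and COUNT are adjusted by the number of bytes the prologue already
   covered.  Names are hypothetical.  */
#include <stdint.h>
#include <stddef.h>

static void
align_destination (unsigned char **dst, const unsigned char **src,
		   size_t *count, size_t desired_align)
{
  uintptr_t old_dst = (uintptr_t) *dst;
  uintptr_t new_dst = (old_dst + desired_align - 1) & -(uintptr_t) desired_align;
  size_t skipped = new_dst - old_dst;	/* bytes already written by the prologue  */

  *dst += skipped;
  *src += skipped;
  *count -= skipped;
}
#endif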
6746 
6747 
6748 /* This function is like the previous one, except here we know how many bytes
6749    need to be copied.  That allows us to update alignment not only of DST, which
6750    is returned, but also of SRC, which is passed as a pointer for that
6751    reason.  */
6752 static rtx
6753 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6754 					   rtx srcreg, rtx value, rtx vec_value,
6755 					   int desired_align, int align_bytes,
6756 					   bool issetmem)
6757 {
6758   rtx src = NULL;
6759   rtx orig_dst = dst;
6760   rtx orig_src = NULL;
6761   int piece_size = 1;
6762   int copied_bytes = 0;
6763 
6764   if (!issetmem)
6765     {
6766       gcc_assert (srcp != NULL);
6767       src = *srcp;
6768       orig_src = src;
6769     }
6770 
6771   for (piece_size = 1;
6772        piece_size <= desired_align && copied_bytes < align_bytes;
6773        piece_size <<= 1)
6774     {
6775       if (align_bytes & piece_size)
6776 	{
6777 	  if (issetmem)
6778 	    {
6779 	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6780 		dst = emit_memset (dst, destreg, vec_value, piece_size);
6781 	      else
6782 		dst = emit_memset (dst, destreg, value, piece_size);
6783 	    }
6784 	  else
6785 	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6786 	  copied_bytes += piece_size;
6787 	}
6788     }
6789   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6790     set_mem_align (dst, desired_align * BITS_PER_UNIT);
6791   if (MEM_SIZE_KNOWN_P (orig_dst))
6792     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6793 
6794   if (!issetmem)
6795     {
6796       int src_align_bytes = get_mem_align_offset (src, desired_align
6797 						       * BITS_PER_UNIT);
6798       if (src_align_bytes >= 0)
6799 	src_align_bytes = desired_align - src_align_bytes;
6800       if (src_align_bytes >= 0)
6801 	{
6802 	  unsigned int src_align;
6803 	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6804 	    {
6805 	      if ((src_align_bytes & (src_align - 1))
6806 		   == (align_bytes & (src_align - 1)))
6807 		break;
6808 	    }
6809 	  if (src_align > (unsigned int) desired_align)
6810 	    src_align = desired_align;
6811 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6812 	    set_mem_align (src, src_align * BITS_PER_UNIT);
6813 	}
6814       if (MEM_SIZE_KNOWN_P (orig_src))
6815 	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6816       *srcp = src;
6817     }
6818 
6819   return dst;
6820 }
6821 
6822 /* Return true if ALG can be used in current context.
6823    Assume we expand memset if MEMSET is true.  */
6824 static bool
6825 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6826 {
6827   if (alg == no_stringop)
6828     return false;
6829   if (alg == vector_loop)
6830     return TARGET_SSE || TARGET_AVX;
6831   /* Algorithms using the rep prefix want at least edi and ecx;
6832      additionally, memset wants eax and memcpy wants esi.  Don't
6833      consider such algorithms if the user has appropriated those
6834      registers for their own purposes, or if we have a non-default
6835      address space, since some string insns cannot override the segment.  */
6836   if (alg == rep_prefix_1_byte
6837       || alg == rep_prefix_4_byte
6838       || alg == rep_prefix_8_byte)
6839     {
6840       if (have_as)
6841 	return false;
6842       if (fixed_regs[CX_REG]
6843 	  || fixed_regs[DI_REG]
6844 	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6845 	return false;
6846     }
6847   return true;
6848 }
6849 
6850 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
6851 static enum stringop_alg
6852 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6853 	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6854 	    bool memset, bool zero_memset, bool have_as,
6855 	    int *dynamic_check, bool *noalign, bool recur)
6856 {
6857   const struct stringop_algs *algs;
6858   bool optimize_for_speed;
6859   int max = 0;
6860   const struct processor_costs *cost;
6861   int i;
6862   bool any_alg_usable_p = false;
6863 
6864   *noalign = false;
6865   *dynamic_check = -1;
6866 
6867   /* Even if the string operation call is cold, we still might spend a lot
6868      of time processing large blocks.  */
6869   if (optimize_function_for_size_p (cfun)
6870       || (optimize_insn_for_size_p ()
6871  	  && (max_size < 256
6872               || (expected_size != -1 && expected_size < 256))))
6873     optimize_for_speed = false;
6874   else
6875     optimize_for_speed = true;
6876 
6877   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6878   if (memset)
6879     algs = &cost->memset[TARGET_64BIT != 0];
6880   else
6881     algs = &cost->memcpy[TARGET_64BIT != 0];
6882 
6883   /* Find the maximal size for the user-defined algorithm.  */
6884   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6885     {
6886       enum stringop_alg candidate = algs->size[i].alg;
6887       bool usable = alg_usable_p (candidate, memset, have_as);
6888       any_alg_usable_p |= usable;
6889 
6890       if (candidate != libcall && candidate && usable)
6891 	max = algs->size[i].max;
6892     }
6893 
6894   /* If the expected size is not known but the max size is small enough
6895      that the inline version is a win, set the expected size within
6896      the range.  */
6897   if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6898       && expected_size == -1)
6899     expected_size = min_size / 2 + max_size / 2;
6900 
6901   /* If user specified the algorithm, honor it if possible.  */
6902   if (ix86_stringop_alg != no_stringop
6903       && alg_usable_p (ix86_stringop_alg, memset, have_as))
6904     return ix86_stringop_alg;
6905   /* rep; movq or rep; movl is the smallest variant.  */
6906   else if (!optimize_for_speed)
6907     {
6908       *noalign = true;
6909       if (!count || (count & 3) || (memset && !zero_memset))
6910 	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6911 	       ? rep_prefix_1_byte : loop_1_byte;
6912       else
6913 	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6914 	       ? rep_prefix_4_byte : loop;
6915     }
6916   /* Very tiny blocks are best handled via the loop; REP is expensive to
6917      set up.  */
6918   else if (expected_size != -1 && expected_size < 4)
6919     return loop_1_byte;
6920   else if (expected_size != -1)
6921     {
6922       enum stringop_alg alg = libcall;
6923       bool alg_noalign = false;
6924       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6925 	{
6926 	  /* We get here if the algorithms that were not libcall-based
6927 	     were rep-prefix based and we are unable to use rep prefixes
6928 	     based on global register usage.  Break out of the loop and
6929 	     use the heuristic below.  */
6930 	  if (algs->size[i].max == 0)
6931 	    break;
6932 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6933 	    {
6934 	      enum stringop_alg candidate = algs->size[i].alg;
6935 
6936 	      if (candidate != libcall
6937 		  && alg_usable_p (candidate, memset, have_as))
6938 		{
6939 		  alg = candidate;
6940 		  alg_noalign = algs->size[i].noalign;
6941 		}
6942 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6943 		 last non-libcall inline algorithm.  */
6944 	      if (TARGET_INLINE_ALL_STRINGOPS)
6945 		{
6946 		  /* When the current size is best copied by a libcall but we are
6947 		     still forced to inline, run the heuristic below that will
6948 		     pick code for medium-sized blocks.  */
6949 		  if (alg != libcall)
6950 		    {
6951 		      *noalign = alg_noalign;
6952 		      return alg;
6953 		    }
6954 		  else if (!any_alg_usable_p)
6955 		    break;
6956 		}
6957 	      else if (alg_usable_p (candidate, memset, have_as))
6958 		{
6959 		  *noalign = algs->size[i].noalign;
6960 		  return candidate;
6961 		}
6962 	    }
6963 	}
6964     }
6965   /* When asked to inline the call anyway, try to pick a meaningful choice.
6966      We look for the maximal size of a block that is faster to copy by hand
6967      and take blocks of at most that size, guessing that the average size
6968      will be roughly half of the block.
6969 
6970      If this turns out to be bad, we might simply specify the preferred
6971      choice in ix86_costs.  */
6972   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6973       && (algs->unknown_size == libcall
6974 	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
6975     {
6976       enum stringop_alg alg;
6977       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6978 
6979       /* If there aren't any usable algorithms or if recursing already,
6980 	 then recursing on smaller sizes or same size isn't going to
6981 	 find anything.  Just return the simple byte-at-a-time copy loop.  */
6982       if (!any_alg_usable_p || recur)
6983 	{
6984 	  /* Pick something reasonable.  */
6985 	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6986 	    *dynamic_check = 128;
6987 	  return loop_1_byte;
6988 	}
6989       alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6990 			zero_memset, have_as, dynamic_check, noalign, true);
6991       gcc_assert (*dynamic_check == -1);
6992       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6993 	*dynamic_check = max;
6994       else
6995 	gcc_assert (alg != libcall);
6996       return alg;
6997     }
6998   return (alg_usable_p (algs->unknown_size, memset, have_as)
6999 	  ? algs->unknown_size : libcall);
7000 }
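/* As an illustration of how the size table above is consumed (the entry shown
   is hypothetical, not one of the real cost tables): a stringop_algs value
   such as

     {libcall, {{256, unrolled_loop, false}, {-1, libcall, false}}}

   means that blocks of at most 256 bytes are expanded inline with
   unrolled_loop, while larger blocks, and blocks of unknown size, go through
   a library call.  */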
7001 
7002 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
7003    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
7004 static int
7005 decide_alignment (int align,
7006 		  enum stringop_alg alg,
7007 		  int expected_size,
7008 		  machine_mode move_mode)
7009 {
7010   int desired_align = 0;
7011 
7012   gcc_assert (alg != no_stringop);
7013 
7014   if (alg == libcall)
7015     return 0;
7016   if (move_mode == VOIDmode)
7017     return 0;
7018 
7019   desired_align = GET_MODE_SIZE (move_mode);
7020   /* PentiumPro has special logic that triggers for 8-byte aligned blocks,
7021      copying a whole cache line at once.  */
7022   if (TARGET_PENTIUMPRO
7023       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7024     desired_align = 8;
7025 
7026   if (optimize_size)
7027     desired_align = 1;
7028   if (desired_align < align)
7029     desired_align = align;
7030   if (expected_size != -1 && expected_size < 4)
7031     desired_align = align;
7032 
7033   return desired_align;
7034 }
7035 
7036 
7037 /* Helper function for memset.  For a QImode value 0xXY produce
7038    0xXYXYXYXY of the width specified by MODE.  This is essentially
7039    a * 0x01010101, but we can do slightly better than
7040    synth_mult by unwinding the sequence by hand on CPUs with
7041    a slow multiply.  */
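/* For example, with MODE == SImode and VAL == 0x5a the promoted register ends
   up holding 0x5a5a5a5a.  A minimal C sketch of that splat (illustrative only,
   not the RTL emitted below):

     static unsigned int
     splat_byte (unsigned char b)
     {
       unsigned int v = b;
       v |= v << 8;     // 0x00005a5a for b == 0x5a
       v |= v << 16;    // 0x5a5a5a5a
       return v;        // equivalently b * 0x01010101u
     }
*/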
7042 static rtx
7043 promote_duplicated_reg (machine_mode mode, rtx val)
7044 {
7045   machine_mode valmode = GET_MODE (val);
7046   rtx tmp;
7047   int nops = mode == DImode ? 3 : 2;
7048 
7049   gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7050   if (val == const0_rtx)
7051     return copy_to_mode_reg (mode, CONST0_RTX (mode));
7052   if (CONST_INT_P (val))
7053     {
7054       HOST_WIDE_INT v = INTVAL (val) & 255;
7055 
7056       v |= v << 8;
7057       v |= v << 16;
7058       if (mode == DImode)
7059         v |= (v << 16) << 16;
7060       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7061     }
7062 
7063   if (valmode == VOIDmode)
7064     valmode = QImode;
7065   if (valmode != QImode)
7066     val = gen_lowpart (QImode, val);
7067   if (mode == QImode)
7068     return val;
7069   if (!TARGET_PARTIAL_REG_STALL)
7070     nops--;
7071   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7072       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7073       <= (ix86_cost->shift_const + ix86_cost->add) * nops
7074           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7075     {
7076       rtx reg = convert_modes (mode, QImode, val, true);
7077       tmp = promote_duplicated_reg (mode, const1_rtx);
7078       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7079 				  OPTAB_DIRECT);
7080     }
7081   else
7082     {
7083       rtx reg = convert_modes (mode, QImode, val, true);
7084 
7085       if (!TARGET_PARTIAL_REG_STALL)
7086 	if (mode == SImode)
7087 	  emit_insn (gen_insvsi_1 (reg, reg));
7088 	else
7089 	  emit_insn (gen_insvdi_1 (reg, reg));
7090       else
7091 	{
7092 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7093 				     NULL, 1, OPTAB_DIRECT);
7094 	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7095 				     OPTAB_DIRECT);
7096 	}
7097       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7098 			         NULL, 1, OPTAB_DIRECT);
7099       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7100       if (mode == SImode)
7101 	return reg;
7102       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7103 				 NULL, 1, OPTAB_DIRECT);
7104       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7105       return reg;
7106     }
7107 }
7108 
7109 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7110    will be needed by the main loop copying SIZE_NEEDED chunks and by the
7111    prologue raising alignment from ALIGN to DESIRED_ALIGN.  */
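/* For instance (sizes illustrative): a 64-bit memset expanded with DImode
   moves (SIZE_NEEDED == 8) broadcasts VAL into a full DImode register, while
   a block that only ever needs 2-byte stores gets just an HImode promotion.  */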
7112 static rtx
7113 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7114 				int align)
7115 {
7116   rtx promoted_val;
7117 
7118   if (TARGET_64BIT
7119       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7120     promoted_val = promote_duplicated_reg (DImode, val);
7121   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7122     promoted_val = promote_duplicated_reg (SImode, val);
7123   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7124     promoted_val = promote_duplicated_reg (HImode, val);
7125   else
7126     promoted_val = val;
7127 
7128   return promoted_val;
7129 }
7130 
7131 /* Copy the address to a Pmode register.  This is used for x32 to
7132    truncate DImode TLS address to a SImode register. */
7133 
7134 static rtx
7135 ix86_copy_addr_to_reg (rtx addr)
7136 {
7137   rtx reg;
7138   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7139     {
7140       reg = copy_addr_to_reg (addr);
7141       REG_POINTER (reg) = 1;
7142       return reg;
7143     }
7144   else
7145     {
7146       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7147       reg = copy_to_mode_reg (DImode, addr);
7148       REG_POINTER (reg) = 1;
7149       return gen_rtx_SUBREG (SImode, reg, 0);
7150     }
7151 }
7152 
7153 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
7154    operations when profitable.  The code depends upon architecture, block size
7155    and alignment, but always has one of the following overall structures:
7156 
7157    Aligned move sequence:
7158 
7159      1) Prologue guard: Conditional that jumps up to epilogues for small
7160 	blocks that can be handled by epilogue alone.  This is faster
7161 	but also needed for correctness, since the prologue assumes the block
7162 	is larger than the desired alignment.
7163 
7164 	Optional dynamic check for size and libcall for large
7165 	blocks is emitted here too, with -minline-stringops-dynamically.
7166 
7167      2) Prologue: copy first few bytes in order to get destination
7168 	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
7169 	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7170 	copied.  We emit either a jump tree on power of two sized
7171 	blocks, or a byte loop.
7172 
7173      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7174 	with specified algorithm.
7175 
7176      4) Epilogue: code copying tail of the block that is too small to be
7177 	handled by main body (or up to size guarded by prologue guard).
7178 
7179   Misaligned move sequence
7180 
7181      1) Misaligned move prologue/epilogue containing:
7182         a) Prologue handling small memory blocks and jumping to done_label
7183 	   (skipped if blocks are known to be large enough)
7184 	b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
7185            is needed, done by one possibly misaligned move
7186 	   (skipped if alignment is not needed)
7187         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7188 
7189      2) Zero size guard dispatching to done_label, if needed
7190 
7191      3) Dispatch to the library call, if needed.
7192 
7193      4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7194 	with the specified algorithm.  */
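/* A rough C-level sketch of the aligned move sequence above, for a memset
   expanded with 8-byte chunks.  Everything here is illustrative (the names,
   the chunk size and the byte-wise prologue/epilogue are assumptions, not
   the RTL that is actually emitted):

     static void
     sketch_memset (unsigned char *dst, unsigned char val, size_t count)
     {
       const size_t chunk = sizeof (unsigned long long);      // SIZE_NEEDED
       unsigned long long wide = 0x0101010101010101ULL * val; // promoted VAL
       if (count >= chunk)                              // 1) prologue guard
         {
           while (((size_t) dst & (chunk - 1)) != 0)    // 2) alignment prologue
             *dst++ = val, count--;
           for (; count >= chunk; count -= chunk, dst += chunk)  // 3) main body
             __builtin_memcpy (dst, &wide, chunk);
         }
       while (count--)                                  // 4) epilogue
         *dst++ = val;
     }
*/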
7195 bool
7196 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7197 			   rtx align_exp, rtx expected_align_exp,
7198 			   rtx expected_size_exp, rtx min_size_exp,
7199 			   rtx max_size_exp, rtx probable_max_size_exp,
7200 			   bool issetmem)
7201 {
7202   rtx destreg;
7203   rtx srcreg = NULL;
7204   rtx_code_label *label = NULL;
7205   rtx tmp;
7206   rtx_code_label *jump_around_label = NULL;
7207   HOST_WIDE_INT align = 1;
7208   unsigned HOST_WIDE_INT count = 0;
7209   HOST_WIDE_INT expected_size = -1;
7210   int size_needed = 0, epilogue_size_needed;
7211   int desired_align = 0, align_bytes = 0;
7212   enum stringop_alg alg;
7213   rtx promoted_val = NULL;
7214   rtx vec_promoted_val = NULL;
7215   bool force_loopy_epilogue = false;
7216   int dynamic_check;
7217   bool need_zero_guard = false;
7218   bool noalign;
7219   machine_mode move_mode = VOIDmode;
7220   machine_mode wider_mode;
7221   int unroll_factor = 1;
7222   /* TODO: Once value ranges are available, fill in proper data.  */
7223   unsigned HOST_WIDE_INT min_size = 0;
7224   unsigned HOST_WIDE_INT max_size = -1;
7225   unsigned HOST_WIDE_INT probable_max_size = -1;
7226   bool misaligned_prologue_used = false;
7227   bool have_as;
7228 
7229   if (CONST_INT_P (align_exp))
7230     align = INTVAL (align_exp);
7231   /* i386 can do misaligned access at a reasonably increased cost.  */
7232   if (CONST_INT_P (expected_align_exp)
7233       && INTVAL (expected_align_exp) > align)
7234     align = INTVAL (expected_align_exp);
7235   /* ALIGN is the minimum of destination and source alignment, but we care here
7236      just about destination alignment.  */
7237   else if (!issetmem
7238 	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7239     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7240 
7241   if (CONST_INT_P (count_exp))
7242     {
7243       min_size = max_size = probable_max_size = count = expected_size
7244 	= INTVAL (count_exp);
7245       /* When COUNT is 0, there is nothing to do.  */
7246       if (!count)
7247 	return true;
7248     }
7249   else
7250     {
7251       if (min_size_exp)
7252 	min_size = INTVAL (min_size_exp);
7253       if (max_size_exp)
7254 	max_size = INTVAL (max_size_exp);
7255       if (probable_max_size_exp)
7256 	probable_max_size = INTVAL (probable_max_size_exp);
7257       if (CONST_INT_P (expected_size_exp))
7258 	expected_size = INTVAL (expected_size_exp);
7259      }
7260 
7261   /* Make sure we don't need to care about overflow later on.  */
7262   if (count > (HOST_WIDE_INT_1U << 30))
7263     return false;
7264 
7265   have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7266   if (!issetmem)
7267     have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7268 
7269   /* Step 0: Decide on preferred algorithm, desired alignment and
7270      size of chunks to be copied by main loop.  */
7271   alg = decide_alg (count, expected_size, min_size, probable_max_size,
7272 		    issetmem,
7273 		    issetmem && val_exp == const0_rtx, have_as,
7274 		    &dynamic_check, &noalign, false);
7275 
7276   if (dump_file)
7277     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7278 	     stringop_alg_names[alg]);
7279 
7280   if (alg == libcall)
7281     return false;
7282   gcc_assert (alg != no_stringop);
7283 
7284   /* For now the vector version of memset is generated only for memory zeroing,
7285      as creating the promoted vector value is very cheap in this case.  */
7286   if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7287     alg = unrolled_loop;
7288 
7289   if (!count)
7290     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7291   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7292   if (!issetmem)
7293     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7294 
7295   unroll_factor = 1;
7296   move_mode = word_mode;
7297   switch (alg)
7298     {
7299     case libcall:
7300     case no_stringop:
7301     case last_alg:
7302       gcc_unreachable ();
7303     case loop_1_byte:
7304       need_zero_guard = true;
7305       move_mode = QImode;
7306       break;
7307     case loop:
7308       need_zero_guard = true;
7309       break;
7310     case unrolled_loop:
7311       need_zero_guard = true;
7312       unroll_factor = (TARGET_64BIT ? 4 : 2);
7313       break;
7314     case vector_loop:
7315       need_zero_guard = true;
7316       unroll_factor = 4;
7317       /* Find the widest supported mode.  */
7318       move_mode = word_mode;
7319       while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7320 	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7321 	move_mode = wider_mode;
7322 
7323       if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7324 	move_mode = TImode;
7325 
7326       /* Find the corresponding vector mode with the same size as MOVE_MODE.
7327 	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
7328       if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7329 	{
7330 	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7331 	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7332 	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7333 	    move_mode = word_mode;
7334 	}
7335       gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7336       break;
7337     case rep_prefix_8_byte:
7338       move_mode = DImode;
7339       break;
7340     case rep_prefix_4_byte:
7341       move_mode = SImode;
7342       break;
7343     case rep_prefix_1_byte:
7344       move_mode = QImode;
7345       break;
7346     }
7347   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7348   epilogue_size_needed = size_needed;
7349 
7350   /* If we are going to emit any library calls conditionally, make sure any
7351      pending stack adjustments happen before the first conditional branch;
7352      otherwise they will be emitted only before the library call and won't
7353      happen on the other branches.  */
7354   if (dynamic_check != -1)
7355     do_pending_stack_adjust ();
7356 
7357   desired_align = decide_alignment (align, alg, expected_size, move_mode);
7358   if (!TARGET_ALIGN_STRINGOPS || noalign)
7359     align = desired_align;
7360 
7361   /* Step 1: Prologue guard.  */
7362 
7363   /* Alignment code needs count to be in register.  */
7364   if (CONST_INT_P (count_exp) && desired_align > align)
7365     {
7366       if (INTVAL (count_exp) > desired_align
7367 	  && INTVAL (count_exp) > size_needed)
7368 	{
7369 	  align_bytes
7370 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7371 	  if (align_bytes <= 0)
7372 	    align_bytes = 0;
7373 	  else
7374 	    align_bytes = desired_align - align_bytes;
7375 	}
7376       if (align_bytes == 0)
7377 	count_exp = force_reg (counter_mode (count_exp), count_exp);
7378     }
7379   gcc_assert (desired_align >= 1 && align >= 1);
7380 
7381   /* Misaligned move sequences handle both prologue and epilogue at once.
7382      Default code generation results in smaller code for large alignments
7383      and also avoids redundant work when sizes are known precisely.  */
7384   misaligned_prologue_used
7385     = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7386        && MAX (desired_align, epilogue_size_needed) <= 32
7387        && desired_align <= epilogue_size_needed
7388        && ((desired_align > align && !align_bytes)
7389 	   || (!count && epilogue_size_needed > 1)));
7390 
7391   /* Do the cheap promotion to allow better CSE across the
7392      main loop and epilogue (i.e. one load of the big constant at the
7393      front of all the code).
7394      For now the misaligned move sequences do not have a fast path
7395      without broadcasting.  */
7396   if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7397     {
7398       if (alg == vector_loop)
7399 	{
7400 	  gcc_assert (val_exp == const0_rtx);
7401 	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7402 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
7403 							 GET_MODE_SIZE (word_mode),
7404 							 desired_align, align);
7405 	}
7406       else
7407 	{
7408 	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7409 							 desired_align, align);
7410 	}
7411     }
7412   /* Misaligned move sequences handle both prologues and epilogues at once.
7413      Default code generation results in smaller code for large alignments and
7414      also avoids redundant work when sizes are known precisely.  */
7415   if (misaligned_prologue_used)
7416     {
7417       /* The misaligned move prologue handles small blocks by itself.  */
7418       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7419 	   (dst, src, &destreg, &srcreg,
7420 	    move_mode, promoted_val, vec_promoted_val,
7421 	    &count_exp,
7422 	    &jump_around_label,
7423             desired_align < align
7424 	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7425 	    desired_align, align, &min_size, dynamic_check, issetmem);
7426       if (!issetmem)
7427         src = change_address (src, BLKmode, srcreg);
7428       dst = change_address (dst, BLKmode, destreg);
7429       set_mem_align (dst, desired_align * BITS_PER_UNIT);
7430       epilogue_size_needed = 0;
7431       if (need_zero_guard
7432 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
7433 	{
7434 	  /* It is possible that we copied enough so the main loop will not
7435 	     execute.  */
7436 	  gcc_assert (size_needed > 1);
7437 	  if (jump_around_label == NULL_RTX)
7438 	    jump_around_label = gen_label_rtx ();
7439 	  emit_cmp_and_jump_insns (count_exp,
7440 				   GEN_INT (size_needed),
7441 				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7442 	  if (expected_size == -1
7443 	      || expected_size < (desired_align - align) / 2 + size_needed)
7444 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7445 	  else
7446 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7447 	}
7448     }
7449   /* Ensure that alignment prologue won't copy past end of block.  */
7450   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7451     {
7452       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7453       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7454 	 Make sure it is power of 2.  */
7455       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7456 
7457       /* To improve performance for small blocks, we jump around the VAL
7458 	 promotion.  This means that if the promoted VAL is not constant,
7459 	 we might not use it in the epilogue and have to use the byte
7460 	 loop variant.  */
7461       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7462 	force_loopy_epilogue = true;
7463       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7464 	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7465 	{
7466 	  /* If main algorithm works on QImode, no epilogue is needed.
7467 	     For small sizes just don't align anything.  */
7468 	  if (size_needed == 1)
7469 	    desired_align = align;
7470 	  else
7471 	    goto epilogue;
7472 	}
7473       else if (!count
7474 	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7475 	{
7476 	  label = gen_label_rtx ();
7477 	  emit_cmp_and_jump_insns (count_exp,
7478 				   GEN_INT (epilogue_size_needed),
7479 				   LTU, 0, counter_mode (count_exp), 1, label);
7480 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
7481 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7482 	  else
7483 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7484 	}
7485     }
7486 
7487   /* Emit code to decide on runtime whether library call or inline should be
7488      used.  */
7489   if (dynamic_check != -1)
7490     {
7491       if (!issetmem && CONST_INT_P (count_exp))
7492 	{
7493 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7494 	    {
7495 	      emit_block_copy_via_libcall (dst, src, count_exp);
7496 	      count_exp = const0_rtx;
7497 	      goto epilogue;
7498 	    }
7499 	}
7500       else
7501 	{
7502 	  rtx_code_label *hot_label = gen_label_rtx ();
7503 	  if (jump_around_label == NULL_RTX)
7504 	    jump_around_label = gen_label_rtx ();
7505 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7506 				   LEU, 0, counter_mode (count_exp),
7507 				   1, hot_label);
7508 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
7509 	  if (issetmem)
7510 	    set_storage_via_libcall (dst, count_exp, val_exp);
7511 	  else
7512 	    emit_block_copy_via_libcall (dst, src, count_exp);
7513 	  emit_jump (jump_around_label);
7514 	  emit_label (hot_label);
7515 	}
7516     }
7517 
7518   /* Step 2: Alignment prologue.  */
7519   /* Do the expensive promotion once we branched off the small blocks.  */
7520   if (issetmem && !promoted_val)
7521     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7522 						   desired_align, align);
7523 
7524   if (desired_align > align && !misaligned_prologue_used)
7525     {
7526       if (align_bytes == 0)
7527 	{
7528 	  /* Except for the first move in the prologue, we no longer know
7529 	     the constant offset in the aliasing info.  It doesn't seem worth
7530 	     the pain to maintain it for the first move, so throw away
7531 	     the info early.  */
7532 	  dst = change_address (dst, BLKmode, destreg);
7533 	  if (!issetmem)
7534 	    src = change_address (src, BLKmode, srcreg);
7535 	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7536 					    promoted_val, vec_promoted_val,
7537 					    count_exp, align, desired_align,
7538 					    issetmem);
7539 	  /* At most desired_align - align bytes are copied.  */
7540 	  if (min_size < (unsigned)(desired_align - align))
7541 	    min_size = 0;
7542 	  else
7543 	    min_size -= desired_align - align;
7544 	}
7545       else
7546 	{
7547 	  /* If we know how many bytes need to be stored before dst is
7548 	     sufficiently aligned, maintain aliasing info accurately.  */
7549 	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7550 							   srcreg,
7551 							   promoted_val,
7552 							   vec_promoted_val,
7553 							   desired_align,
7554 							   align_bytes,
7555 							   issetmem);
7556 
7557 	  count_exp = plus_constant (counter_mode (count_exp),
7558 				     count_exp, -align_bytes);
7559 	  count -= align_bytes;
7560 	  min_size -= align_bytes;
7561 	  max_size -= align_bytes;
7562 	}
7563       if (need_zero_guard
7564 	  && min_size < (unsigned HOST_WIDE_INT) size_needed
7565 	  && (count < (unsigned HOST_WIDE_INT) size_needed
7566 	      || (align_bytes == 0
7567 		  && count < ((unsigned HOST_WIDE_INT) size_needed
7568 			      + desired_align - align))))
7569 	{
7570 	  /* It is possible that we copied enough so the main loop will not
7571 	     execute.  */
7572 	  gcc_assert (size_needed > 1);
7573 	  if (label == NULL_RTX)
7574 	    label = gen_label_rtx ();
7575 	  emit_cmp_and_jump_insns (count_exp,
7576 				   GEN_INT (size_needed),
7577 				   LTU, 0, counter_mode (count_exp), 1, label);
7578 	  if (expected_size == -1
7579 	      || expected_size < (desired_align - align) / 2 + size_needed)
7580 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
7581 	  else
7582 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
7583 	}
7584     }
7585   if (label && size_needed == 1)
7586     {
7587       emit_label (label);
7588       LABEL_NUSES (label) = 1;
7589       label = NULL;
7590       epilogue_size_needed = 1;
7591       if (issetmem)
7592 	promoted_val = val_exp;
7593     }
7594   else if (label == NULL_RTX && !misaligned_prologue_used)
7595     epilogue_size_needed = size_needed;
7596 
7597   /* Step 3: Main loop.  */
7598 
7599   switch (alg)
7600     {
7601     case libcall:
7602     case no_stringop:
7603     case last_alg:
7604       gcc_unreachable ();
7605     case loop_1_byte:
7606     case loop:
7607     case unrolled_loop:
7608       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7609 				     count_exp, move_mode, unroll_factor,
7610 				     expected_size, issetmem);
7611       break;
7612     case vector_loop:
7613       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7614 				     vec_promoted_val, count_exp, move_mode,
7615 				     unroll_factor, expected_size, issetmem);
7616       break;
7617     case rep_prefix_8_byte:
7618     case rep_prefix_4_byte:
7619     case rep_prefix_1_byte:
7620       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7621 				       val_exp, count_exp, move_mode, issetmem);
7622       break;
7623     }
7624   /* Properly adjust the offset of src and dest memory for aliasing.  */
7625   if (CONST_INT_P (count_exp))
7626     {
7627       if (!issetmem)
7628 	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7629 					    (count / size_needed) * size_needed);
7630       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7631 					  (count / size_needed) * size_needed);
7632     }
7633   else
7634     {
7635       if (!issetmem)
7636 	src = change_address (src, BLKmode, srcreg);
7637       dst = change_address (dst, BLKmode, destreg);
7638     }
7639 
7640   /* Step 4: Epilogue to copy the remaining bytes.  */
7641  epilogue:
7642   if (label)
7643     {
7644       /* When the main loop is done, COUNT_EXP might hold original count,
7645 	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7646 	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7647 	 bytes. Compensate if needed.  */
7648 
7649       if (size_needed < epilogue_size_needed)
7650 	{
7651 	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7652 				     GEN_INT (size_needed - 1), count_exp, 1,
7653 				     OPTAB_DIRECT);
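	  /* E.g. with size_needed == 16 the AND above masks COUNT_EXP down to
	     the 0-15 trailing bytes that are left for the epilogue (numbers
	     illustrative).  */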
7654 	  if (tmp != count_exp)
7655 	    emit_move_insn (count_exp, tmp);
7656 	}
7657       emit_label (label);
7658       LABEL_NUSES (label) = 1;
7659     }
7660 
7661   if (count_exp != const0_rtx && epilogue_size_needed > 1)
7662     {
7663       if (force_loopy_epilogue)
7664 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7665 					 epilogue_size_needed);
7666       else
7667 	{
7668 	  if (issetmem)
7669 	    expand_setmem_epilogue (dst, destreg, promoted_val,
7670 				    vec_promoted_val, count_exp,
7671 				    epilogue_size_needed);
7672 	  else
7673 	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7674 				    epilogue_size_needed);
7675 	}
7676     }
7677   if (jump_around_label)
7678     emit_label (jump_around_label);
7679   return true;
7680 }
7681 
7682 
7683 /* Expand the appropriate insns for doing strlen if not just doing
7684    repnz; scasb
7685 
7686    out = result, initialized with the start address
7687    align_rtx = alignment of the address.
7688    scratch = scratch register, initialized with the start address when
7689 	not aligned, otherwise undefined
7690 
7691    This is just the body. It needs the initializations mentioned above and
7692    some address computing at the end.  These things are done in i386.md.  */
7693 
7694 static void
7695 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7696 {
7697   int align;
7698   rtx tmp;
7699   rtx_code_label *align_2_label = NULL;
7700   rtx_code_label *align_3_label = NULL;
7701   rtx_code_label *align_4_label = gen_label_rtx ();
7702   rtx_code_label *end_0_label = gen_label_rtx ();
7703   rtx mem;
7704   rtx tmpreg = gen_reg_rtx (SImode);
7705   rtx scratch = gen_reg_rtx (SImode);
7706   rtx cmp;
7707 
7708   align = 0;
7709   if (CONST_INT_P (align_rtx))
7710     align = INTVAL (align_rtx);
7711 
7712   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
7713 
7714   /* Is there a known alignment and is it less than 4?  */
7715   if (align < 4)
7716     {
7717       rtx scratch1 = gen_reg_rtx (Pmode);
7718       emit_move_insn (scratch1, out);
7719       /* Is there a known alignment and is it not 2? */
7720       if (align != 2)
7721 	{
7722 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7723 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7724 
7725 	  /* Leave just the 3 lower bits.  */
7726 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7727 				    NULL_RTX, 0, OPTAB_WIDEN);
7728 
7729 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7730 				   Pmode, 1, align_4_label);
7731 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7732 				   Pmode, 1, align_2_label);
7733 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7734 				   Pmode, 1, align_3_label);
7735 	}
7736       else
7737         {
7738 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
7739 	     check whether it is aligned to 4 bytes.  */
7740 
7741 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7742 				    NULL_RTX, 0, OPTAB_WIDEN);
7743 
7744 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7745 				   Pmode, 1, align_4_label);
7746         }
7747 
7748       mem = change_address (src, QImode, out);
7749 
7750       /* Now compare the bytes.  */
7751 
7752       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7753       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7754 			       QImode, 1, end_0_label);
7755 
7756       /* Increment the address.  */
7757       emit_insn (gen_add2_insn (out, const1_rtx));
7758 
7759       /* Not needed with an alignment of 2 */
7760       if (align != 2)
7761 	{
7762 	  emit_label (align_2_label);
7763 
7764 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7765 				   end_0_label);
7766 
7767 	  emit_insn (gen_add2_insn (out, const1_rtx));
7768 
7769 	  emit_label (align_3_label);
7770 	}
7771 
7772       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7773 			       end_0_label);
7774 
7775       emit_insn (gen_add2_insn (out, const1_rtx));
7776     }
7777 
7778   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
7779      align this loop; it only makes the program larger and does not help
7780      speed.  */
7781   emit_label (align_4_label);
7782 
7783   mem = change_address (src, SImode, out);
7784   emit_move_insn (scratch, mem);
7785   emit_insn (gen_add2_insn (out, GEN_INT (4)));
7786 
7787   /* This formula yields a nonzero result iff one of the bytes is zero.
7788      This saves three branches inside loop and many cycles.  */
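  /* Worked example (SImode, values chosen for illustration): for the word
     x == 0x41420043, which contains a zero byte, x - 0x01010101 == 0x4040ff42,
     ~x == 0xbebdffbc, and ANDing those together with 0x80808080 leaves
     0x00008000, i.e. nonzero, with the surviving 0x80 bit in the position of
     the zero byte; for x == 0x41424344, with no zero byte, the result is 0.  */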
7789 
7790   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7791   emit_insn (gen_one_cmplsi2 (scratch, scratch));
7792   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7793   emit_insn (gen_andsi3 (tmpreg, tmpreg,
7794 			 gen_int_mode (0x80808080, SImode)));
7795   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7796 			   align_4_label);
7797 
7798   if (TARGET_CMOVE)
7799     {
7800        rtx reg = gen_reg_rtx (SImode);
7801        rtx reg2 = gen_reg_rtx (Pmode);
7802        emit_move_insn (reg, tmpreg);
7803        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7804 
7805        /* If zero is not in the first two bytes, move two bytes forward.  */
7806        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7807        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7808        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7809        emit_insn (gen_rtx_SET (tmpreg,
7810 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
7811 						     reg,
7812 						     tmpreg)));
7813        /* Emit lea manually to avoid clobbering of flags.  */
7814        emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
7815 
7816        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7817        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7818        emit_insn (gen_rtx_SET (out,
7819 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7820 						     reg2,
7821 						     out)));
7822     }
7823   else
7824     {
7825        rtx_code_label *end_2_label = gen_label_rtx ();
7826        /* Is zero in the first two bytes? */
7827 
7828        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7829        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7830        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7831        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7832                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7833                             pc_rtx);
7834        tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7835        JUMP_LABEL (tmp) = end_2_label;
7836 
7837        /* Not in the first two.  Move two bytes forward.  */
7838        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7839        emit_insn (gen_add2_insn (out, const2_rtx));
7840 
7841        emit_label (end_2_label);
7842 
7843     }
7844 
7845   /* Avoid branch in fixing the byte.  */
7846   tmpreg = gen_lowpart (QImode, tmpreg);
7847   emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7848   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7849   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7850   emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7851 
7852   emit_label (end_0_label);
7853 }
7854 
7855 /* Expand strlen.  */
7856 
7857 bool
7858 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7859 {
7860   if (TARGET_UNROLL_STRLEN
7861 	   && TARGET_INLINE_ALL_STRINGOPS
7862 	   && eoschar == const0_rtx
7863 	   && optimize > 1)
7864     {
7865       /* The generic case of the strlen expander is long.  Avoid
7866 	 expanding it unless TARGET_INLINE_ALL_STRINGOPS.  */
7867       rtx addr = force_reg (Pmode, XEXP (src, 0));
7868       /* Well it seems that some optimizer does not combine a call like
7869 	 foo(strlen(bar), strlen(bar));
7870 	 when the move and the subtraction are done here.  It does calculate
7871 	 the length just once when these instructions are done inside of
7872 	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
7873 	 often used and I use one fewer register for the lifetime of
7874 	 output_strlen_unroll() this is better.  */
7875 
7876       emit_move_insn (out, addr);
7877 
7878       ix86_expand_strlensi_unroll_1 (out, src, align);
7879 
7880       /* strlensi_unroll_1 returns the address of the zero at the end of
7881 	 the string, like memchr(), so compute the length by subtracting
7882 	 the start address.  */
7883       emit_insn (gen_sub2_insn (out, addr));
7884       return true;
7885     }
7886   else
7887     return false;
7888 }
7889 
7890 /* For a given symbol (function), construct code to compute the address of its
7891    PLT entry in the large x86-64 PIC model.  */
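/* For example (a sketch; the actual base register is whatever
   pic_offset_table_rtx designates at this point): for a symbol foo this loads
   the constant foo@PLTOFF into a fresh register and adds the PIC base to it,
   producing the runtime address of foo's PLT entry.  */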
7892 
7893 static rtx
7894 construct_plt_address (rtx symbol)
7895 {
7896   rtx tmp, unspec;
7897 
7898   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7899   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7900   gcc_assert (Pmode == DImode);
7901 
7902   tmp = gen_reg_rtx (Pmode);
7903   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7904 
7905   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7906   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7907   return tmp;
7908 }
7909 
7910 /* Additional registers that are clobbered by SYSV calls.  */
7911 
7912 static int const x86_64_ms_sysv_extra_clobbered_registers
7913 		 [NUM_X86_64_MS_CLOBBERED_REGS] =
7914 {
7915   SI_REG, DI_REG,
7916   XMM6_REG, XMM7_REG,
7917   XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7918   XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7919 };
7920 
7921 rtx_insn *
7922 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7923 		  rtx callarg2,
7924 		  rtx pop, bool sibcall)
7925 {
7926   rtx vec[3];
7927   rtx use = NULL, call;
7928   unsigned int vec_len = 0;
7929   tree fndecl;
7930 
7931   if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7932     {
7933       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7934       if (fndecl
7935 	  && (lookup_attribute ("interrupt",
7936 				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7937 	error ("interrupt service routine cannot be called directly");
7938     }
7939   else
7940     fndecl = NULL_TREE;
7941 
7942   if (pop == const0_rtx)
7943     pop = NULL;
7944   gcc_assert (!TARGET_64BIT || !pop);
7945 
7946   rtx addr = XEXP (fnaddr, 0);
7947   if (TARGET_MACHO && !TARGET_64BIT)
7948     {
7949 #if TARGET_MACHO
7950       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7951 	fnaddr = machopic_indirect_call_target (fnaddr);
7952 #endif
7953     }
7954   else
7955     {
7956       /* Static functions and indirect calls don't need the pic register.  Also,
7957 	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7958 	 it an indirect call.  */
7959       if (flag_pic
7960 	  && GET_CODE (addr) == SYMBOL_REF
7961 	  && !SYMBOL_REF_LOCAL_P (addr))
7962 	{
7963 	  if (flag_plt
7964 	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
7965 		  || !lookup_attribute ("noplt",
7966 					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7967 	    {
7968 	      if (!TARGET_64BIT
7969 		  || (ix86_cmodel == CM_LARGE_PIC
7970 		      && DEFAULT_ABI != MS_ABI))
7971 		{
7972 		  use_reg (&use, gen_rtx_REG (Pmode,
7973 					      REAL_PIC_OFFSET_TABLE_REGNUM));
7974 		  if (ix86_use_pseudo_pic_reg ())
7975 		    emit_move_insn (gen_rtx_REG (Pmode,
7976 						 REAL_PIC_OFFSET_TABLE_REGNUM),
7977 				    pic_offset_table_rtx);
7978 		}
7979 	    }
7980 	  else if (!TARGET_PECOFF && !TARGET_MACHO)
7981 	    {
7982 	      if (TARGET_64BIT
7983 		  && ix86_cmodel == CM_LARGE_PIC
7984 		  && DEFAULT_ABI != MS_ABI)
7985 		{
7986 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
7987 					   UNSPEC_GOT);
7988 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7989 		  fnaddr = force_reg (Pmode, fnaddr);
7990 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
7991 		}
7992 	      else if (TARGET_64BIT)
7993 		{
7994 		  fnaddr = gen_rtx_UNSPEC (Pmode,
7995 					   gen_rtvec (1, addr),
7996 					   UNSPEC_GOTPCREL);
7997 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7998 		}
7999 	      else
8000 		{
8001 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8002 					   UNSPEC_GOT);
8003 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8004 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8005 					 fnaddr);
8006 		}
8007 	      fnaddr = gen_const_mem (Pmode, fnaddr);
8008 	      /* Pmode may not be the same as word_mode for x32, which
8009 		 doesn't support indirect branch via 32-bit memory slot.
8010 		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8011 		 indirect branch via x32 GOT slot is OK.  */
8012 	      if (GET_MODE (fnaddr) != word_mode)
8013 		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8014 	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
8015 	    }
8016 	}
8017     }
8018 
8019   /* Skip setting up RAX register for -mskip-rax-setup when there are no
8020      parameters passed in vector registers.  */
8021   if (TARGET_64BIT
8022       && (INTVAL (callarg2) > 0
8023 	  || (INTVAL (callarg2) == 0
8024 	      && (TARGET_SSE || !flag_skip_rax_setup))))
8025     {
8026       rtx al = gen_rtx_REG (QImode, AX_REG);
8027       emit_move_insn (al, callarg2);
8028       use_reg (&use, al);
8029     }
8030 
8031   if (ix86_cmodel == CM_LARGE_PIC
8032       && !TARGET_PECOFF
8033       && MEM_P (fnaddr)
8034       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8035       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8036     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8037   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8038      branch via x32 GOT slot is OK.  */
8039   else if (!(TARGET_X32
8040 	     && MEM_P (fnaddr)
8041 	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8042 	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8043 	   && (sibcall
8044 	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8045 	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8046     {
8047       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8048       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8049     }
8050 
8051   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8052 
8053   if (retval)
8054     call = gen_rtx_SET (retval, call);
8055   vec[vec_len++] = call;
8056 
8057   if (pop)
8058     {
8059       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8060       pop = gen_rtx_SET (stack_pointer_rtx, pop);
8061       vec[vec_len++] = pop;
8062     }
8063 
8064   if (cfun->machine->no_caller_saved_registers
8065       && (!fndecl
8066 	  || (!TREE_THIS_VOLATILE (fndecl)
8067 	      && !lookup_attribute ("no_caller_saved_registers",
8068 				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8069     {
8070       static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8071       bool is_64bit_ms_abi = (TARGET_64BIT
8072 			      && ix86_function_abi (fndecl) == MS_ABI);
8073       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8074 
8075       /* If there are no caller-saved registers, add all registers
8076 	 that are clobbered by the call which returns.  */
8077       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8078 	if (!fixed_regs[i]
8079 	    && (ix86_call_used_regs[i] == 1
8080 		|| (ix86_call_used_regs[i] & c_mask))
8081 	    && !STACK_REGNO_P (i)
8082 	    && !MMX_REGNO_P (i))
8083 	  clobber_reg (&use,
8084 		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8085     }
8086   else if (TARGET_64BIT_MS_ABI
8087 	   && (!callarg2 || INTVAL (callarg2) != -2))
8088     {
8089       unsigned i;
8090 
8091       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8092 	{
8093 	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8094 	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8095 
8096 	  clobber_reg (&use, gen_rtx_REG (mode, regno));
8097 	}
8098 
8099       /* Set here, but it may get cleared later.  */
8100       if (TARGET_CALL_MS2SYSV_XLOGUES)
8101 	{
8102 	  if (!TARGET_SSE)
8103 	    ;
8104 
8105 	  /* Don't break hot-patched functions.  */
8106 	  else if (ix86_function_ms_hook_prologue (current_function_decl))
8107 	    ;
8108 
8109 	  /* TODO: Cases not yet examined.  */
8110 	  else if (flag_split_stack)
8111 	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8112 
8113 	  else
8114 	    {
8115 	      gcc_assert (!reload_completed);
8116 	      cfun->machine->call_ms2sysv = true;
8117 	    }
8118 	}
8119     }
8120 
8121   if (TARGET_MACHO && TARGET_64BIT && !sibcall
8122       && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
8123 	  || !fndecl || TREE_PUBLIC (fndecl)))
8124     {
8125       /* We allow public functions defined in a TU to bind locally for PIC
8126 	 code (the default) on 64bit Mach-O.
8127 	 If such functions are not inlined, we cannot tell at compile-time if
8128 	 they will be called via the lazy symbol resolver (this can depend on
8129 	 options given at link-time).  Therefore, we must assume that the lazy
8130 	 resolver could be used which clobbers R11 and R10.  */
8131       clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
8132       clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
8133     }
8134 
8135   if (vec_len > 1)
8136     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8137   rtx_insn *call_insn = emit_call_insn (call);
8138   if (use)
8139     CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8140 
8141   return call_insn;
8142 }
8143 
8144 /* Split a simple return that pops POPC bytes from the stack into an indirect
8145    branch with a stack adjustment.  */
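/* The emitted sequence corresponds roughly to the following (illustrative
   AT&T syntax, with POPC standing for the byte-count operand):

     popl  %ecx        # return address -> ECX
     addl  $POPC, %esp # drop the callee-popped argument bytes
     jmp   *%ecx       # return

   together with the CFA notes needed to keep the unwind information correct.  */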
8146 
8147 void
8148 ix86_split_simple_return_pop_internal (rtx popc)
8149 {
8150   struct machine_function *m = cfun->machine;
8151   rtx ecx = gen_rtx_REG (SImode, CX_REG);
8152   rtx_insn *insn;
8153 
8154   /* There is no "pascal" calling convention in any 64bit ABI.  */
8155   gcc_assert (!TARGET_64BIT);
8156 
8157   insn = emit_insn (gen_pop (ecx));
8158   m->fs.cfa_offset -= UNITS_PER_WORD;
8159   m->fs.sp_offset -= UNITS_PER_WORD;
8160 
8161   rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8162   x = gen_rtx_SET (stack_pointer_rtx, x);
8163   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8164   add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8165   RTX_FRAME_RELATED_P (insn) = 1;
8166 
8167   x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8168   x = gen_rtx_SET (stack_pointer_rtx, x);
8169   insn = emit_insn (x);
8170   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8171   RTX_FRAME_RELATED_P (insn) = 1;
8172 
8173   /* Now return address is in ECX.  */
8174   emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8175 }
8176 
8177 /* Errors in the source file can cause expand_expr to return const0_rtx
8178    where we expect a vector.  To avoid crashing, use one of the vector
8179    clear instructions.  */
8180 
8181 static rtx
8182 safe_vector_operand (rtx x, machine_mode mode)
8183 {
8184   if (x == const0_rtx)
8185     x = CONST0_RTX (mode);
8186   return x;
8187 }
8188 
8189 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
8190 
8191 static rtx
8192 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8193 {
8194   rtx pat;
8195   tree arg0 = CALL_EXPR_ARG (exp, 0);
8196   tree arg1 = CALL_EXPR_ARG (exp, 1);
8197   rtx op0 = expand_normal (arg0);
8198   rtx op1 = expand_normal (arg1);
8199   machine_mode tmode = insn_data[icode].operand[0].mode;
8200   machine_mode mode0 = insn_data[icode].operand[1].mode;
8201   machine_mode mode1 = insn_data[icode].operand[2].mode;
8202 
8203   if (VECTOR_MODE_P (mode0))
8204     op0 = safe_vector_operand (op0, mode0);
8205   if (VECTOR_MODE_P (mode1))
8206     op1 = safe_vector_operand (op1, mode1);
8207 
8208   if (optimize || !target
8209       || GET_MODE (target) != tmode
8210       || !insn_data[icode].operand[0].predicate (target, tmode))
8211     target = gen_reg_rtx (tmode);
8212 
8213   if (GET_MODE (op1) == SImode && mode1 == TImode)
8214     {
8215       rtx x = gen_reg_rtx (V4SImode);
8216       emit_insn (gen_sse2_loadd (x, op1));
8217       op1 = gen_lowpart (TImode, x);
8218     }
8219 
8220   if (!insn_data[icode].operand[1].predicate (op0, mode0))
8221     op0 = copy_to_mode_reg (mode0, op0);
8222   if (!insn_data[icode].operand[2].predicate (op1, mode1))
8223     op1 = copy_to_mode_reg (mode1, op1);
8224 
8225   pat = GEN_FCN (icode) (target, op0, op1);
8226   if (! pat)
8227     return 0;
8228 
8229   emit_insn (pat);
8230 
8231   return target;
8232 }
8233 
8234 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
8235 
8236 static rtx
8237 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8238 			       enum ix86_builtin_func_type m_type,
8239 			       enum rtx_code sub_code)
8240 {
8241   rtx pat;
8242   int i;
8243   int nargs;
8244   bool comparison_p = false;
8245   bool tf_p = false;
8246   bool last_arg_constant = false;
8247   int num_memory = 0;
8248   struct {
8249     rtx op;
8250     machine_mode mode;
8251   } args[4];
8252 
8253   machine_mode tmode = insn_data[icode].operand[0].mode;
8254 
8255   switch (m_type)
8256     {
8257     case MULTI_ARG_4_DF2_DI_I:
8258     case MULTI_ARG_4_DF2_DI_I1:
8259     case MULTI_ARG_4_SF2_SI_I:
8260     case MULTI_ARG_4_SF2_SI_I1:
8261       nargs = 4;
8262       last_arg_constant = true;
8263       break;
8264 
8265     case MULTI_ARG_3_SF:
8266     case MULTI_ARG_3_DF:
8267     case MULTI_ARG_3_SF2:
8268     case MULTI_ARG_3_DF2:
8269     case MULTI_ARG_3_DI:
8270     case MULTI_ARG_3_SI:
8271     case MULTI_ARG_3_SI_DI:
8272     case MULTI_ARG_3_HI:
8273     case MULTI_ARG_3_HI_SI:
8274     case MULTI_ARG_3_QI:
8275     case MULTI_ARG_3_DI2:
8276     case MULTI_ARG_3_SI2:
8277     case MULTI_ARG_3_HI2:
8278     case MULTI_ARG_3_QI2:
8279       nargs = 3;
8280       break;
8281 
8282     case MULTI_ARG_2_SF:
8283     case MULTI_ARG_2_DF:
8284     case MULTI_ARG_2_DI:
8285     case MULTI_ARG_2_SI:
8286     case MULTI_ARG_2_HI:
8287     case MULTI_ARG_2_QI:
8288       nargs = 2;
8289       break;
8290 
8291     case MULTI_ARG_2_DI_IMM:
8292     case MULTI_ARG_2_SI_IMM:
8293     case MULTI_ARG_2_HI_IMM:
8294     case MULTI_ARG_2_QI_IMM:
8295       nargs = 2;
8296       last_arg_constant = true;
8297       break;
8298 
8299     case MULTI_ARG_1_SF:
8300     case MULTI_ARG_1_DF:
8301     case MULTI_ARG_1_SF2:
8302     case MULTI_ARG_1_DF2:
8303     case MULTI_ARG_1_DI:
8304     case MULTI_ARG_1_SI:
8305     case MULTI_ARG_1_HI:
8306     case MULTI_ARG_1_QI:
8307     case MULTI_ARG_1_SI_DI:
8308     case MULTI_ARG_1_HI_DI:
8309     case MULTI_ARG_1_HI_SI:
8310     case MULTI_ARG_1_QI_DI:
8311     case MULTI_ARG_1_QI_SI:
8312     case MULTI_ARG_1_QI_HI:
8313       nargs = 1;
8314       break;
8315 
8316     case MULTI_ARG_2_DI_CMP:
8317     case MULTI_ARG_2_SI_CMP:
8318     case MULTI_ARG_2_HI_CMP:
8319     case MULTI_ARG_2_QI_CMP:
8320       nargs = 2;
8321       comparison_p = true;
8322       break;
8323 
8324     case MULTI_ARG_2_SF_TF:
8325     case MULTI_ARG_2_DF_TF:
8326     case MULTI_ARG_2_DI_TF:
8327     case MULTI_ARG_2_SI_TF:
8328     case MULTI_ARG_2_HI_TF:
8329     case MULTI_ARG_2_QI_TF:
8330       nargs = 2;
8331       tf_p = true;
8332       break;
8333 
8334     default:
8335       gcc_unreachable ();
8336     }
8337 
8338   if (optimize || !target
8339       || GET_MODE (target) != tmode
8340       || !insn_data[icode].operand[0].predicate (target, tmode))
8341     target = gen_reg_rtx (tmode);
8342   else if (memory_operand (target, tmode))
8343     num_memory++;
8344 
8345   gcc_assert (nargs <= 4);
8346 
8347   for (i = 0; i < nargs; i++)
8348     {
8349       tree arg = CALL_EXPR_ARG (exp, i);
8350       rtx op = expand_normal (arg);
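      /* For the comparison forms, operand 1 of the insn is the comparison
         rtx itself, so the input operands start one slot later.  */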
8351       int adjust = (comparison_p) ? 1 : 0;
8352       machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8353 
8354       if (last_arg_constant && i == nargs - 1)
8355 	{
8356 	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8357 	    {
8358 	      enum insn_code new_icode = icode;
8359 	      switch (icode)
8360 		{
8361 		case CODE_FOR_xop_vpermil2v2df3:
8362 		case CODE_FOR_xop_vpermil2v4sf3:
8363 		case CODE_FOR_xop_vpermil2v4df3:
8364 		case CODE_FOR_xop_vpermil2v8sf3:
8365 		  error ("the last argument must be a 2-bit immediate");
8366 		  return gen_reg_rtx (tmode);
8367 		case CODE_FOR_xop_rotlv2di3:
8368 		  new_icode = CODE_FOR_rotlv2di3;
8369 		  goto xop_rotl;
8370 		case CODE_FOR_xop_rotlv4si3:
8371 		  new_icode = CODE_FOR_rotlv4si3;
8372 		  goto xop_rotl;
8373 		case CODE_FOR_xop_rotlv8hi3:
8374 		  new_icode = CODE_FOR_rotlv8hi3;
8375 		  goto xop_rotl;
8376 		case CODE_FOR_xop_rotlv16qi3:
8377 		  new_icode = CODE_FOR_rotlv16qi3;
8378 		xop_rotl:
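		  /* A constant rotate count is simply reduced modulo the
		     element width so it satisfies the predicate; a
		     non-constant count is handled by switching to the
		     generic rotate pattern, which takes it as a register
		     operand.  */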
8379 		  if (CONST_INT_P (op))
8380 		    {
8381 		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8382 		      op = GEN_INT (INTVAL (op) & mask);
8383 		      gcc_checking_assert
8384 			(insn_data[icode].operand[i + 1].predicate (op, mode));
8385 		    }
8386 		  else
8387 		    {
8388 		      gcc_checking_assert
8389 			(nargs == 2
8390 			 && insn_data[new_icode].operand[0].mode == tmode
8391 			 && insn_data[new_icode].operand[1].mode == tmode
8392 			 && insn_data[new_icode].operand[2].mode == mode
8393 			 && insn_data[new_icode].operand[0].predicate
8394 			    == insn_data[icode].operand[0].predicate
8395 			 && insn_data[new_icode].operand[1].predicate
8396 			    == insn_data[icode].operand[1].predicate);
8397 		      icode = new_icode;
8398 		      goto non_constant;
8399 		    }
8400 		  break;
8401 		default:
8402 		  gcc_unreachable ();
8403 		}
8404 	    }
8405 	}
8406       else
8407 	{
8408 	non_constant:
8409 	  if (VECTOR_MODE_P (mode))
8410 	    op = safe_vector_operand (op, mode);
8411 
8412 	  /* If we aren't optimizing, only allow one memory operand to be
8413 	     generated.  */
8414 	  if (memory_operand (op, mode))
8415 	    num_memory++;
8416 
8417 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8418 
8419 	  if (optimize
8420 	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8421 	      || num_memory > 1)
8422 	    op = force_reg (mode, op);
8423 	}
8424 
8425       args[i].op = op;
8426       args[i].mode = mode;
8427     }
8428 
8429   switch (nargs)
8430     {
8431     case 1:
8432       pat = GEN_FCN (icode) (target, args[0].op);
8433       break;
8434 
8435     case 2:
8436       if (tf_p)
8437 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8438 			       GEN_INT ((int)sub_code));
8439       else if (! comparison_p)
8440 	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8441       else
8442 	{
8443 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8444 				       args[0].op,
8445 				       args[1].op);
8446 
8447 	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8448 	}
8449       break;
8450 
8451     case 3:
8452       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8453       break;
8454 
8455     case 4:
8456       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8457       break;
8458 
8459     default:
8460       gcc_unreachable ();
8461     }
8462 
8463   if (! pat)
8464     return 0;
8465 
8466   emit_insn (pat);
8467   return target;
8468 }
8469 
8470 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8471    insns with vec_merge.  */
8472 
8473 static rtx
8474 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8475 				    rtx target)
8476 {
8477   rtx pat;
8478   tree arg0 = CALL_EXPR_ARG (exp, 0);
8479   rtx op1, op0 = expand_normal (arg0);
8480   machine_mode tmode = insn_data[icode].operand[0].mode;
8481   machine_mode mode0 = insn_data[icode].operand[1].mode;
8482 
8483   if (optimize || !target
8484       || GET_MODE (target) != tmode
8485       || !insn_data[icode].operand[0].predicate (target, tmode))
8486     target = gen_reg_rtx (tmode);
8487 
8488   if (VECTOR_MODE_P (mode0))
8489     op0 = safe_vector_operand (op0, mode0);
8490 
8491   if ((optimize && !register_operand (op0, mode0))
8492       || !insn_data[icode].operand[1].predicate (op0, mode0))
8493     op0 = copy_to_mode_reg (mode0, op0);
8494 
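  /* The vec_merge patterns take the source twice: once as the vector whose
     low element is operated on and once as the vector supplying the
     remaining elements, so reuse op0 for both.  */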
8495   op1 = op0;
8496   if (!insn_data[icode].operand[2].predicate (op1, mode0))
8497     op1 = copy_to_mode_reg (mode0, op1);
8498 
8499   pat = GEN_FCN (icode) (target, op0, op1);
8500   if (! pat)
8501     return 0;
8502   emit_insn (pat);
8503   return target;
8504 }
8505 
8506 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
8507 
8508 static rtx
8509 ix86_expand_sse_compare (const struct builtin_description *d,
8510 			 tree exp, rtx target, bool swap)
8511 {
8512   rtx pat;
8513   tree arg0 = CALL_EXPR_ARG (exp, 0);
8514   tree arg1 = CALL_EXPR_ARG (exp, 1);
8515   rtx op0 = expand_normal (arg0);
8516   rtx op1 = expand_normal (arg1);
8517   rtx op2;
8518   machine_mode tmode = insn_data[d->icode].operand[0].mode;
8519   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8520   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8521   enum rtx_code comparison = d->comparison;
8522 
8523   if (VECTOR_MODE_P (mode0))
8524     op0 = safe_vector_operand (op0, mode0);
8525   if (VECTOR_MODE_P (mode1))
8526     op1 = safe_vector_operand (op1, mode1);
8527 
8528   /* Swap operands if we have a comparison that isn't available in
8529      hardware.  */
8530   if (swap)
8531     std::swap (op0, op1);
8532 
8533   if (optimize || !target
8534       || GET_MODE (target) != tmode
8535       || !insn_data[d->icode].operand[0].predicate (target, tmode))
8536     target = gen_reg_rtx (tmode);
8537 
8538   if ((optimize && !register_operand (op0, mode0))
8539       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8540     op0 = copy_to_mode_reg (mode0, op0);
8541   if ((optimize && !register_operand (op1, mode1))
8542       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8543     op1 = copy_to_mode_reg (mode1, op1);
8544 
8545   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8546   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8547   if (! pat)
8548     return 0;
8549   emit_insn (pat);
8550   return target;
8551 }
8552 
8553 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
8554 
8555 static rtx
8556 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8557 		      rtx target)
8558 {
8559   rtx pat;
8560   tree arg0 = CALL_EXPR_ARG (exp, 0);
8561   tree arg1 = CALL_EXPR_ARG (exp, 1);
8562   rtx op0 = expand_normal (arg0);
8563   rtx op1 = expand_normal (arg1);
8564   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8565   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8566   enum rtx_code comparison = d->comparison;
8567 
8568   if (VECTOR_MODE_P (mode0))
8569     op0 = safe_vector_operand (op0, mode0);
8570   if (VECTOR_MODE_P (mode1))
8571     op1 = safe_vector_operand (op1, mode1);
8572 
8573   /* Swap operands if we have a comparison that isn't available in
8574      hardware.  */
8575   if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8576     std::swap (op0, op1);
8577 
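  /* The comi result is a single flag bit: zero an SImode pseudo and then
     store the flags comparison into its low QImode part, so the upper bits
     of the returned value are known to be zero.  */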
8578   target = gen_reg_rtx (SImode);
8579   emit_move_insn (target, const0_rtx);
8580   target = gen_rtx_SUBREG (QImode, target, 0);
8581 
8582   if ((optimize && !register_operand (op0, mode0))
8583       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8584     op0 = copy_to_mode_reg (mode0, op0);
8585   if ((optimize && !register_operand (op1, mode1))
8586       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8587     op1 = copy_to_mode_reg (mode1, op1);
8588 
8589   pat = GEN_FCN (d->icode) (op0, op1);
8590   if (! pat)
8591     return 0;
8592   emit_insn (pat);
8593   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8594 			  gen_rtx_fmt_ee (comparison, QImode,
8595 					  SET_DEST (pat),
8596 					  const0_rtx)));
8597 
8598   return SUBREG_REG (target);
8599 }
8600 
8601 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
8602 
8603 static rtx
8604 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8605 		       rtx target)
8606 {
8607   rtx pat;
8608   tree arg0 = CALL_EXPR_ARG (exp, 0);
8609   rtx op1, op0 = expand_normal (arg0);
8610   machine_mode tmode = insn_data[d->icode].operand[0].mode;
8611   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8612 
8613   if (optimize || target == 0
8614       || GET_MODE (target) != tmode
8615       || !insn_data[d->icode].operand[0].predicate (target, tmode))
8616     target = gen_reg_rtx (tmode);
8617 
8618   if (VECTOR_MODE_P (mode0))
8619     op0 = safe_vector_operand (op0, mode0);
8620 
8621   if ((optimize && !register_operand (op0, mode0))
8622       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8623     op0 = copy_to_mode_reg (mode0, op0);
8624 
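  /* For the round builtins, d->comparison is not an rtx comparison code;
     it carries the rounding-mode immediate, which is passed straight
     through as the insn's final operand.  */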
8625   op1 = GEN_INT (d->comparison);
8626 
8627   pat = GEN_FCN (d->icode) (target, op0, op1);
8628   if (! pat)
8629     return 0;
8630   emit_insn (pat);
8631   return target;
8632 }
8633 
8634 static rtx
8635 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8636 				     tree exp, rtx target)
8637 {
8638   rtx pat;
8639   tree arg0 = CALL_EXPR_ARG (exp, 0);
8640   tree arg1 = CALL_EXPR_ARG (exp, 1);
8641   rtx op0 = expand_normal (arg0);
8642   rtx op1 = expand_normal (arg1);
8643   rtx op2;
8644   machine_mode tmode = insn_data[d->icode].operand[0].mode;
8645   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8646   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8647 
8648   if (optimize || target == 0
8649       || GET_MODE (target) != tmode
8650       || !insn_data[d->icode].operand[0].predicate (target, tmode))
8651     target = gen_reg_rtx (tmode);
8652 
8653   op0 = safe_vector_operand (op0, mode0);
8654   op1 = safe_vector_operand (op1, mode1);
8655 
8656   if ((optimize && !register_operand (op0, mode0))
8657       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8658     op0 = copy_to_mode_reg (mode0, op0);
8659   if ((optimize && !register_operand (op1, mode1))
8660       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8661     op1 = copy_to_mode_reg (mode1, op1);
8662 
8663   op2 = GEN_INT (d->comparison);
8664 
8665   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8666   if (! pat)
8667     return 0;
8668   emit_insn (pat);
8669   return target;
8670 }
8671 
8672 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
8673 
8674 static rtx
8675 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8676 		       rtx target)
8677 {
8678   rtx pat;
8679   tree arg0 = CALL_EXPR_ARG (exp, 0);
8680   tree arg1 = CALL_EXPR_ARG (exp, 1);
8681   rtx op0 = expand_normal (arg0);
8682   rtx op1 = expand_normal (arg1);
8683   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8684   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8685   enum rtx_code comparison = d->comparison;
8686 
8687   if (VECTOR_MODE_P (mode0))
8688     op0 = safe_vector_operand (op0, mode0);
8689   if (VECTOR_MODE_P (mode1))
8690     op1 = safe_vector_operand (op1, mode1);
8691 
8692   target = gen_reg_rtx (SImode);
8693   emit_move_insn (target, const0_rtx);
8694   target = gen_rtx_SUBREG (QImode, target, 0);
8695 
8696   if ((optimize && !register_operand (op0, mode0))
8697       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8698     op0 = copy_to_mode_reg (mode0, op0);
8699   if ((optimize && !register_operand (op1, mode1))
8700       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8701     op1 = copy_to_mode_reg (mode1, op1);
8702 
8703   pat = GEN_FCN (d->icode) (op0, op1);
8704   if (! pat)
8705     return 0;
8706   emit_insn (pat);
8707   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8708 			  gen_rtx_fmt_ee (comparison, QImode,
8709 					  SET_DEST (pat),
8710 					  const0_rtx)));
8711 
8712   return SUBREG_REG (target);
8713 }
8714 
8715 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
8716 
8717 static rtx
8718 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8719 			  tree exp, rtx target)
8720 {
8721   rtx pat;
8722   tree arg0 = CALL_EXPR_ARG (exp, 0);
8723   tree arg1 = CALL_EXPR_ARG (exp, 1);
8724   tree arg2 = CALL_EXPR_ARG (exp, 2);
8725   tree arg3 = CALL_EXPR_ARG (exp, 3);
8726   tree arg4 = CALL_EXPR_ARG (exp, 4);
8727   rtx scratch0, scratch1;
8728   rtx op0 = expand_normal (arg0);
8729   rtx op1 = expand_normal (arg1);
8730   rtx op2 = expand_normal (arg2);
8731   rtx op3 = expand_normal (arg3);
8732   rtx op4 = expand_normal (arg4);
8733   machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8734 
8735   tmode0 = insn_data[d->icode].operand[0].mode;
8736   tmode1 = insn_data[d->icode].operand[1].mode;
8737   modev2 = insn_data[d->icode].operand[2].mode;
8738   modei3 = insn_data[d->icode].operand[3].mode;
8739   modev4 = insn_data[d->icode].operand[4].mode;
8740   modei5 = insn_data[d->icode].operand[5].mode;
8741   modeimm = insn_data[d->icode].operand[6].mode;
8742 
8743   if (VECTOR_MODE_P (modev2))
8744     op0 = safe_vector_operand (op0, modev2);
8745   if (VECTOR_MODE_P (modev4))
8746     op2 = safe_vector_operand (op2, modev4);
8747 
8748   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8749     op0 = copy_to_mode_reg (modev2, op0);
8750   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8751     op1 = copy_to_mode_reg (modei3, op1);
8752   if ((optimize && !register_operand (op2, modev4))
8753       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8754     op2 = copy_to_mode_reg (modev4, op2);
8755   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8756     op3 = copy_to_mode_reg (modei5, op3);
8757 
8758   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8759     {
8760       error ("the fifth argument must be an 8-bit immediate");
8761       return const0_rtx;
8762     }
8763 
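  /* The ...I form keeps the insn's first result (operand 0), the ...M form
     keeps its vector mask result (operand 1), and the flag variants only
     examine EFLAGS; whichever outputs the caller does not want are directed
     into scratch registers.  */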
8764   if (d->code == IX86_BUILTIN_PCMPESTRI128)
8765     {
8766       if (optimize || !target
8767 	  || GET_MODE (target) != tmode0
8768 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8769 	target = gen_reg_rtx (tmode0);
8770 
8771       scratch1 = gen_reg_rtx (tmode1);
8772 
8773       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8774     }
8775   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8776     {
8777       if (optimize || !target
8778 	  || GET_MODE (target) != tmode1
8779 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8780 	target = gen_reg_rtx (tmode1);
8781 
8782       scratch0 = gen_reg_rtx (tmode0);
8783 
8784       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8785     }
8786   else
8787     {
8788       gcc_assert (d->flag);
8789 
8790       scratch0 = gen_reg_rtx (tmode0);
8791       scratch1 = gen_reg_rtx (tmode1);
8792 
8793       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8794     }
8795 
8796   if (! pat)
8797     return 0;
8798 
8799   emit_insn (pat);
8800 
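  /* For the flag-returning variants, d->flag encodes the CC mode in which
     to read FLAGS_REG; materialize the EQ test of that flag against zero
     in the low byte of a zeroed SImode pseudo.  */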
8801   if (d->flag)
8802     {
8803       target = gen_reg_rtx (SImode);
8804       emit_move_insn (target, const0_rtx);
8805       target = gen_rtx_SUBREG (QImode, target, 0);
8806 
8807       emit_insn
8808 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8809 		      gen_rtx_fmt_ee (EQ, QImode,
8810 				      gen_rtx_REG ((machine_mode) d->flag,
8811 						   FLAGS_REG),
8812 				      const0_rtx)));
8813       return SUBREG_REG (target);
8814     }
8815   else
8816     return target;
8817 }
8818 
8819 
8820 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
8821 
8822 static rtx
8823 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8824 			  tree exp, rtx target)
8825 {
8826   rtx pat;
8827   tree arg0 = CALL_EXPR_ARG (exp, 0);
8828   tree arg1 = CALL_EXPR_ARG (exp, 1);
8829   tree arg2 = CALL_EXPR_ARG (exp, 2);
8830   rtx scratch0, scratch1;
8831   rtx op0 = expand_normal (arg0);
8832   rtx op1 = expand_normal (arg1);
8833   rtx op2 = expand_normal (arg2);
8834   machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8835 
8836   tmode0 = insn_data[d->icode].operand[0].mode;
8837   tmode1 = insn_data[d->icode].operand[1].mode;
8838   modev2 = insn_data[d->icode].operand[2].mode;
8839   modev3 = insn_data[d->icode].operand[3].mode;
8840   modeimm = insn_data[d->icode].operand[4].mode;
8841 
8842   if (VECTOR_MODE_P (modev2))
8843     op0 = safe_vector_operand (op0, modev2);
8844   if (VECTOR_MODE_P (modev3))
8845     op1 = safe_vector_operand (op1, modev3);
8846 
8847   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8848     op0 = copy_to_mode_reg (modev2, op0);
8849   if ((optimize && !register_operand (op1, modev3))
8850       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8851     op1 = copy_to_mode_reg (modev3, op1);
8852 
8853   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8854     {
8855       error ("the third argument must be an 8-bit immediate");
8856       return const0_rtx;
8857     }
8858 
8859   if (d->code == IX86_BUILTIN_PCMPISTRI128)
8860     {
8861       if (optimize || !target
8862 	  || GET_MODE (target) != tmode0
8863 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8864 	target = gen_reg_rtx (tmode0);
8865 
8866       scratch1 = gen_reg_rtx (tmode1);
8867 
8868       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8869     }
8870   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8871     {
8872       if (optimize || !target
8873 	  || GET_MODE (target) != tmode1
8874 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8875 	target = gen_reg_rtx (tmode1);
8876 
8877       scratch0 = gen_reg_rtx (tmode0);
8878 
8879       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8880     }
8881   else
8882     {
8883       gcc_assert (d->flag);
8884 
8885       scratch0 = gen_reg_rtx (tmode0);
8886       scratch1 = gen_reg_rtx (tmode1);
8887 
8888       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8889     }
8890 
8891   if (! pat)
8892     return 0;
8893 
8894   emit_insn (pat);
8895 
8896   if (d->flag)
8897     {
8898       target = gen_reg_rtx (SImode);
8899       emit_move_insn (target, const0_rtx);
8900       target = gen_rtx_SUBREG (QImode, target, 0);
8901 
8902       emit_insn
8903 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8904 		      gen_rtx_fmt_ee (EQ, QImode,
8905 				      gen_rtx_REG ((machine_mode) d->flag,
8906 						   FLAGS_REG),
8907 				      const0_rtx)));
8908       return SUBREG_REG (target);
8909     }
8910   else
8911     return target;
8912 }
8913 
8914 /* Fix up modeless constants to fit the required mode.  */
8915 
8916 static rtx
8917 fixup_modeless_constant (rtx x, machine_mode mode)
8918 {
8919   if (GET_MODE (x) == VOIDmode)
8920     x = convert_to_mode (mode, x, 1);
8921   return x;
8922 }
8923 
8924 /* Subroutine of ix86_expand_builtin to take care of insns with
8925    a variable number of operands.  */
8926 
8927 static rtx
8928 ix86_expand_args_builtin (const struct builtin_description *d,
8929 			  tree exp, rtx target)
8930 {
8931   rtx pat, real_target;
8932   unsigned int i, nargs;
8933   unsigned int nargs_constant = 0;
8934   unsigned int mask_pos = 0;
8935   int num_memory = 0;
8936   struct
8937     {
8938       rtx op;
8939       machine_mode mode;
8940     } args[6];
8941   bool second_arg_count = false;
8942   enum insn_code icode = d->icode;
8943   const struct insn_data_d *insn_p = &insn_data[icode];
8944   machine_mode tmode = insn_p->operand[0].mode;
8945   machine_mode rmode = VOIDmode;
8946   bool swap = false;
8947   enum rtx_code comparison = d->comparison;
8948 
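  /* The function type determines how many arguments the insn takes (nargs),
     how many of the trailing arguments must be immediates (nargs_constant),
     where those immediates sit relative to a trailing mask operand
     (mask_pos), whether the insn computes its result in a different mode
     (rmode), and whether the second argument is a shift count; shapes with
     dedicated expanders are dispatched to them immediately.  */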
8949   switch ((enum ix86_builtin_func_type) d->flag)
8950     {
8951     case V2DF_FTYPE_V2DF_ROUND:
8952     case V4DF_FTYPE_V4DF_ROUND:
8953     case V8DF_FTYPE_V8DF_ROUND:
8954     case V4SF_FTYPE_V4SF_ROUND:
8955     case V8SF_FTYPE_V8SF_ROUND:
8956     case V16SF_FTYPE_V16SF_ROUND:
8957     case V4SI_FTYPE_V4SF_ROUND:
8958     case V8SI_FTYPE_V8SF_ROUND:
8959     case V16SI_FTYPE_V16SF_ROUND:
8960       return ix86_expand_sse_round (d, exp, target);
8961     case V4SI_FTYPE_V2DF_V2DF_ROUND:
8962     case V8SI_FTYPE_V4DF_V4DF_ROUND:
8963     case V16SI_FTYPE_V8DF_V8DF_ROUND:
8964       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8965     case INT_FTYPE_V8SF_V8SF_PTEST:
8966     case INT_FTYPE_V4DI_V4DI_PTEST:
8967     case INT_FTYPE_V4DF_V4DF_PTEST:
8968     case INT_FTYPE_V4SF_V4SF_PTEST:
8969     case INT_FTYPE_V2DI_V2DI_PTEST:
8970     case INT_FTYPE_V2DF_V2DF_PTEST:
8971       return ix86_expand_sse_ptest (d, exp, target);
8972     case FLOAT128_FTYPE_FLOAT128:
8973     case FLOAT_FTYPE_FLOAT:
8974     case INT_FTYPE_INT:
8975     case UINT_FTYPE_UINT:
8976     case UINT16_FTYPE_UINT16:
8977     case UINT64_FTYPE_INT:
8978     case UINT64_FTYPE_UINT64:
8979     case INT64_FTYPE_INT64:
8980     case INT64_FTYPE_V4SF:
8981     case INT64_FTYPE_V2DF:
8982     case INT_FTYPE_V16QI:
8983     case INT_FTYPE_V8QI:
8984     case INT_FTYPE_V8SF:
8985     case INT_FTYPE_V4DF:
8986     case INT_FTYPE_V4SF:
8987     case INT_FTYPE_V2DF:
8988     case INT_FTYPE_V32QI:
8989     case V16QI_FTYPE_V16QI:
8990     case V8SI_FTYPE_V8SF:
8991     case V8SI_FTYPE_V4SI:
8992     case V8HI_FTYPE_V8HI:
8993     case V8HI_FTYPE_V16QI:
8994     case V8QI_FTYPE_V8QI:
8995     case V8SF_FTYPE_V8SF:
8996     case V8SF_FTYPE_V8SI:
8997     case V8SF_FTYPE_V4SF:
8998     case V8SF_FTYPE_V8HI:
8999     case V4SI_FTYPE_V4SI:
9000     case V4SI_FTYPE_V16QI:
9001     case V4SI_FTYPE_V4SF:
9002     case V4SI_FTYPE_V8SI:
9003     case V4SI_FTYPE_V8HI:
9004     case V4SI_FTYPE_V4DF:
9005     case V4SI_FTYPE_V2DF:
9006     case V4HI_FTYPE_V4HI:
9007     case V4DF_FTYPE_V4DF:
9008     case V4DF_FTYPE_V4SI:
9009     case V4DF_FTYPE_V4SF:
9010     case V4DF_FTYPE_V2DF:
9011     case V4SF_FTYPE_V4SF:
9012     case V4SF_FTYPE_V4SI:
9013     case V4SF_FTYPE_V8SF:
9014     case V4SF_FTYPE_V4DF:
9015     case V4SF_FTYPE_V8HI:
9016     case V4SF_FTYPE_V2DF:
9017     case V2DI_FTYPE_V2DI:
9018     case V2DI_FTYPE_V16QI:
9019     case V2DI_FTYPE_V8HI:
9020     case V2DI_FTYPE_V4SI:
9021     case V2DF_FTYPE_V2DF:
9022     case V2DF_FTYPE_V4SI:
9023     case V2DF_FTYPE_V4DF:
9024     case V2DF_FTYPE_V4SF:
9025     case V2DF_FTYPE_V2SI:
9026     case V2SI_FTYPE_V2SI:
9027     case V2SI_FTYPE_V4SF:
9028     case V2SI_FTYPE_V2SF:
9029     case V2SI_FTYPE_V2DF:
9030     case V2SF_FTYPE_V2SF:
9031     case V2SF_FTYPE_V2SI:
9032     case V32QI_FTYPE_V32QI:
9033     case V32QI_FTYPE_V16QI:
9034     case V16HI_FTYPE_V16HI:
9035     case V16HI_FTYPE_V8HI:
9036     case V8SI_FTYPE_V8SI:
9037     case V16HI_FTYPE_V16QI:
9038     case V8SI_FTYPE_V16QI:
9039     case V4DI_FTYPE_V16QI:
9040     case V8SI_FTYPE_V8HI:
9041     case V4DI_FTYPE_V8HI:
9042     case V4DI_FTYPE_V4SI:
9043     case V4DI_FTYPE_V2DI:
9044     case UQI_FTYPE_UQI:
9045     case UHI_FTYPE_UHI:
9046     case USI_FTYPE_USI:
9047     case USI_FTYPE_UQI:
9048     case USI_FTYPE_UHI:
9049     case UDI_FTYPE_UDI:
9050     case UHI_FTYPE_V16QI:
9051     case USI_FTYPE_V32QI:
9052     case UDI_FTYPE_V64QI:
9053     case V16QI_FTYPE_UHI:
9054     case V32QI_FTYPE_USI:
9055     case V64QI_FTYPE_UDI:
9056     case V8HI_FTYPE_UQI:
9057     case V16HI_FTYPE_UHI:
9058     case V32HI_FTYPE_USI:
9059     case V4SI_FTYPE_UQI:
9060     case V8SI_FTYPE_UQI:
9061     case V4SI_FTYPE_UHI:
9062     case V8SI_FTYPE_UHI:
9063     case UQI_FTYPE_V8HI:
9064     case UHI_FTYPE_V16HI:
9065     case USI_FTYPE_V32HI:
9066     case UQI_FTYPE_V4SI:
9067     case UQI_FTYPE_V8SI:
9068     case UHI_FTYPE_V16SI:
9069     case UQI_FTYPE_V2DI:
9070     case UQI_FTYPE_V4DI:
9071     case UQI_FTYPE_V8DI:
9072     case V16SI_FTYPE_UHI:
9073     case V2DI_FTYPE_UQI:
9074     case V4DI_FTYPE_UQI:
9075     case V16SI_FTYPE_INT:
9076     case V16SF_FTYPE_V8SF:
9077     case V16SI_FTYPE_V8SI:
9078     case V16SF_FTYPE_V4SF:
9079     case V16SI_FTYPE_V4SI:
9080     case V16SI_FTYPE_V16SF:
9081     case V16SI_FTYPE_V16SI:
9082     case V64QI_FTYPE_V64QI:
9083     case V32HI_FTYPE_V32HI:
9084     case V16SF_FTYPE_V16SF:
9085     case V8DI_FTYPE_UQI:
9086     case V8DI_FTYPE_V8DI:
9087     case V8DF_FTYPE_V4DF:
9088     case V8DF_FTYPE_V2DF:
9089     case V8DF_FTYPE_V8DF:
9090     case V4DI_FTYPE_V4DI:
9091     case V16HI_FTYPE_V16SF:
9092     case V8HI_FTYPE_V8SF:
9093     case V8HI_FTYPE_V4SF:
9094       nargs = 1;
9095       break;
9096     case V4SF_FTYPE_V4SF_VEC_MERGE:
9097     case V2DF_FTYPE_V2DF_VEC_MERGE:
9098       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9099     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9100     case V16QI_FTYPE_V16QI_V16QI:
9101     case V16QI_FTYPE_V8HI_V8HI:
9102     case V16SF_FTYPE_V16SF_V16SF:
9103     case V8QI_FTYPE_V8QI_V8QI:
9104     case V8QI_FTYPE_V4HI_V4HI:
9105     case V8HI_FTYPE_V8HI_V8HI:
9106     case V8HI_FTYPE_V16QI_V16QI:
9107     case V8HI_FTYPE_V4SI_V4SI:
9108     case V8SF_FTYPE_V8SF_V8SF:
9109     case V8SF_FTYPE_V8SF_V8SI:
9110     case V8DF_FTYPE_V8DF_V8DF:
9111     case V4SI_FTYPE_V4SI_V4SI:
9112     case V4SI_FTYPE_V8HI_V8HI:
9113     case V4SI_FTYPE_V2DF_V2DF:
9114     case V4HI_FTYPE_V4HI_V4HI:
9115     case V4HI_FTYPE_V8QI_V8QI:
9116     case V4HI_FTYPE_V2SI_V2SI:
9117     case V4DF_FTYPE_V4DF_V4DF:
9118     case V4DF_FTYPE_V4DF_V4DI:
9119     case V4SF_FTYPE_V4SF_V4SF:
9120     case V4SF_FTYPE_V4SF_V4SI:
9121     case V4SF_FTYPE_V4SF_V2SI:
9122     case V4SF_FTYPE_V4SF_V2DF:
9123     case V4SF_FTYPE_V4SF_UINT:
9124     case V4SF_FTYPE_V4SF_DI:
9125     case V4SF_FTYPE_V4SF_SI:
9126     case V2DI_FTYPE_V2DI_V2DI:
9127     case V2DI_FTYPE_V16QI_V16QI:
9128     case V2DI_FTYPE_V4SI_V4SI:
9129     case V2DI_FTYPE_V2DI_V16QI:
9130     case V2SI_FTYPE_V2SI_V2SI:
9131     case V2SI_FTYPE_V4HI_V4HI:
9132     case V2SI_FTYPE_V2SF_V2SF:
9133     case V2DF_FTYPE_V2DF_V2DF:
9134     case V2DF_FTYPE_V2DF_V4SF:
9135     case V2DF_FTYPE_V2DF_V2DI:
9136     case V2DF_FTYPE_V2DF_DI:
9137     case V2DF_FTYPE_V2DF_SI:
9138     case V2DF_FTYPE_V2DF_UINT:
9139     case V2SF_FTYPE_V2SF_V2SF:
9140     case V1DI_FTYPE_V1DI_V1DI:
9141     case V1DI_FTYPE_V8QI_V8QI:
9142     case V1DI_FTYPE_V2SI_V2SI:
9143     case V32QI_FTYPE_V16HI_V16HI:
9144     case V16HI_FTYPE_V8SI_V8SI:
9145     case V64QI_FTYPE_V64QI_V64QI:
9146     case V32QI_FTYPE_V32QI_V32QI:
9147     case V16HI_FTYPE_V32QI_V32QI:
9148     case V16HI_FTYPE_V16HI_V16HI:
9149     case V8SI_FTYPE_V4DF_V4DF:
9150     case V8SI_FTYPE_V8SI_V8SI:
9151     case V8SI_FTYPE_V16HI_V16HI:
9152     case V4DI_FTYPE_V4DI_V4DI:
9153     case V4DI_FTYPE_V8SI_V8SI:
9154     case V8DI_FTYPE_V64QI_V64QI:
9155       if (comparison == UNKNOWN)
9156 	return ix86_expand_binop_builtin (icode, exp, target);
9157       nargs = 2;
9158       break;
9159     case V4SF_FTYPE_V4SF_V4SF_SWAP:
9160     case V2DF_FTYPE_V2DF_V2DF_SWAP:
9161       gcc_assert (comparison != UNKNOWN);
9162       nargs = 2;
9163       swap = true;
9164       break;
9165     case V16HI_FTYPE_V16HI_V8HI_COUNT:
9166     case V16HI_FTYPE_V16HI_SI_COUNT:
9167     case V8SI_FTYPE_V8SI_V4SI_COUNT:
9168     case V8SI_FTYPE_V8SI_SI_COUNT:
9169     case V4DI_FTYPE_V4DI_V2DI_COUNT:
9170     case V4DI_FTYPE_V4DI_INT_COUNT:
9171     case V8HI_FTYPE_V8HI_V8HI_COUNT:
9172     case V8HI_FTYPE_V8HI_SI_COUNT:
9173     case V4SI_FTYPE_V4SI_V4SI_COUNT:
9174     case V4SI_FTYPE_V4SI_SI_COUNT:
9175     case V4HI_FTYPE_V4HI_V4HI_COUNT:
9176     case V4HI_FTYPE_V4HI_SI_COUNT:
9177     case V2DI_FTYPE_V2DI_V2DI_COUNT:
9178     case V2DI_FTYPE_V2DI_SI_COUNT:
9179     case V2SI_FTYPE_V2SI_V2SI_COUNT:
9180     case V2SI_FTYPE_V2SI_SI_COUNT:
9181     case V1DI_FTYPE_V1DI_V1DI_COUNT:
9182     case V1DI_FTYPE_V1DI_SI_COUNT:
9183       nargs = 2;
9184       second_arg_count = true;
9185       break;
9186     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9187     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9188     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9189     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9190     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9191     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9192     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9193     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9194     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9195     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9196     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9197     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9198     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9199     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9200     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9201     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9202     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9203     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9204       nargs = 4;
9205       second_arg_count = true;
9206       break;
9207     case UINT64_FTYPE_UINT64_UINT64:
9208     case UINT_FTYPE_UINT_UINT:
9209     case UINT_FTYPE_UINT_USHORT:
9210     case UINT_FTYPE_UINT_UCHAR:
9211     case UINT16_FTYPE_UINT16_INT:
9212     case UINT8_FTYPE_UINT8_INT:
9213     case UQI_FTYPE_UQI_UQI:
9214     case UHI_FTYPE_UHI_UHI:
9215     case USI_FTYPE_USI_USI:
9216     case UDI_FTYPE_UDI_UDI:
9217     case V16SI_FTYPE_V8DF_V8DF:
9218     case V32HI_FTYPE_V16SF_V16SF:
9219     case V16HI_FTYPE_V8SF_V8SF:
9220     case V8HI_FTYPE_V4SF_V4SF:
9221     case V16HI_FTYPE_V16SF_UHI:
9222     case V8HI_FTYPE_V8SF_UQI:
9223     case V8HI_FTYPE_V4SF_UQI:
9224       nargs = 2;
9225       break;
9226     case V2DI_FTYPE_V2DI_INT_CONVERT:
9227       nargs = 2;
9228       rmode = V1TImode;
9229       nargs_constant = 1;
9230       break;
9231     case V4DI_FTYPE_V4DI_INT_CONVERT:
9232       nargs = 2;
9233       rmode = V2TImode;
9234       nargs_constant = 1;
9235       break;
9236     case V8DI_FTYPE_V8DI_INT_CONVERT:
9237       nargs = 2;
9238       rmode = V4TImode;
9239       nargs_constant = 1;
9240       break;
9241     case V8HI_FTYPE_V8HI_INT:
9242     case V8HI_FTYPE_V8SF_INT:
9243     case V16HI_FTYPE_V16SF_INT:
9244     case V8HI_FTYPE_V4SF_INT:
9245     case V8SF_FTYPE_V8SF_INT:
9246     case V4SF_FTYPE_V16SF_INT:
9247     case V16SF_FTYPE_V16SF_INT:
9248     case V4SI_FTYPE_V4SI_INT:
9249     case V4SI_FTYPE_V8SI_INT:
9250     case V4HI_FTYPE_V4HI_INT:
9251     case V4DF_FTYPE_V4DF_INT:
9252     case V4DF_FTYPE_V8DF_INT:
9253     case V4SF_FTYPE_V4SF_INT:
9254     case V4SF_FTYPE_V8SF_INT:
9255     case V2DI_FTYPE_V2DI_INT:
9256     case V2DF_FTYPE_V2DF_INT:
9257     case V2DF_FTYPE_V4DF_INT:
9258     case V16HI_FTYPE_V16HI_INT:
9259     case V8SI_FTYPE_V8SI_INT:
9260     case V16SI_FTYPE_V16SI_INT:
9261     case V4SI_FTYPE_V16SI_INT:
9262     case V4DI_FTYPE_V4DI_INT:
9263     case V2DI_FTYPE_V4DI_INT:
9264     case V4DI_FTYPE_V8DI_INT:
9265     case UQI_FTYPE_UQI_UQI_CONST:
9266     case UHI_FTYPE_UHI_UQI:
9267     case USI_FTYPE_USI_UQI:
9268     case UDI_FTYPE_UDI_UQI:
9269       nargs = 2;
9270       nargs_constant = 1;
9271       break;
9272     case V16QI_FTYPE_V16QI_V16QI_V16QI:
9273     case V8SF_FTYPE_V8SF_V8SF_V8SF:
9274     case V4DF_FTYPE_V4DF_V4DF_V4DF:
9275     case V4SF_FTYPE_V4SF_V4SF_V4SF:
9276     case V2DF_FTYPE_V2DF_V2DF_V2DF:
9277     case V32QI_FTYPE_V32QI_V32QI_V32QI:
9278     case UHI_FTYPE_V16SI_V16SI_UHI:
9279     case UQI_FTYPE_V8DI_V8DI_UQI:
9280     case V16HI_FTYPE_V16SI_V16HI_UHI:
9281     case V16QI_FTYPE_V16SI_V16QI_UHI:
9282     case V16QI_FTYPE_V8DI_V16QI_UQI:
9283     case V16SF_FTYPE_V16SF_V16SF_UHI:
9284     case V16SF_FTYPE_V4SF_V16SF_UHI:
9285     case V16SI_FTYPE_SI_V16SI_UHI:
9286     case V16SI_FTYPE_V16HI_V16SI_UHI:
9287     case V16SI_FTYPE_V16QI_V16SI_UHI:
9288     case V8SF_FTYPE_V4SF_V8SF_UQI:
9289     case V4DF_FTYPE_V2DF_V4DF_UQI:
9290     case V8SI_FTYPE_V4SI_V8SI_UQI:
9291     case V8SI_FTYPE_SI_V8SI_UQI:
9292     case V4SI_FTYPE_V4SI_V4SI_UQI:
9293     case V4SI_FTYPE_SI_V4SI_UQI:
9294     case V4DI_FTYPE_V2DI_V4DI_UQI:
9295     case V4DI_FTYPE_DI_V4DI_UQI:
9296     case V2DI_FTYPE_V2DI_V2DI_UQI:
9297     case V2DI_FTYPE_DI_V2DI_UQI:
9298     case V64QI_FTYPE_V64QI_V64QI_UDI:
9299     case V64QI_FTYPE_V16QI_V64QI_UDI:
9300     case V64QI_FTYPE_QI_V64QI_UDI:
9301     case V32QI_FTYPE_V32QI_V32QI_USI:
9302     case V32QI_FTYPE_V16QI_V32QI_USI:
9303     case V32QI_FTYPE_QI_V32QI_USI:
9304     case V16QI_FTYPE_V16QI_V16QI_UHI:
9305     case V16QI_FTYPE_QI_V16QI_UHI:
9306     case V32HI_FTYPE_V8HI_V32HI_USI:
9307     case V32HI_FTYPE_HI_V32HI_USI:
9308     case V16HI_FTYPE_V8HI_V16HI_UHI:
9309     case V16HI_FTYPE_HI_V16HI_UHI:
9310     case V8HI_FTYPE_V8HI_V8HI_UQI:
9311     case V8HI_FTYPE_HI_V8HI_UQI:
9312     case V8SF_FTYPE_V8HI_V8SF_UQI:
9313     case V4SF_FTYPE_V8HI_V4SF_UQI:
9314     case V8SI_FTYPE_V8SF_V8SI_UQI:
9315     case V4SI_FTYPE_V4SF_V4SI_UQI:
9316     case V4DI_FTYPE_V4SF_V4DI_UQI:
9317     case V2DI_FTYPE_V4SF_V2DI_UQI:
9318     case V4SF_FTYPE_V4DI_V4SF_UQI:
9319     case V4SF_FTYPE_V2DI_V4SF_UQI:
9320     case V4DF_FTYPE_V4DI_V4DF_UQI:
9321     case V2DF_FTYPE_V2DI_V2DF_UQI:
9322     case V16QI_FTYPE_V8HI_V16QI_UQI:
9323     case V16QI_FTYPE_V16HI_V16QI_UHI:
9324     case V16QI_FTYPE_V4SI_V16QI_UQI:
9325     case V16QI_FTYPE_V8SI_V16QI_UQI:
9326     case V8HI_FTYPE_V4SI_V8HI_UQI:
9327     case V8HI_FTYPE_V8SI_V8HI_UQI:
9328     case V16QI_FTYPE_V2DI_V16QI_UQI:
9329     case V16QI_FTYPE_V4DI_V16QI_UQI:
9330     case V8HI_FTYPE_V2DI_V8HI_UQI:
9331     case V8HI_FTYPE_V4DI_V8HI_UQI:
9332     case V4SI_FTYPE_V2DI_V4SI_UQI:
9333     case V4SI_FTYPE_V4DI_V4SI_UQI:
9334     case V32QI_FTYPE_V32HI_V32QI_USI:
9335     case UHI_FTYPE_V16QI_V16QI_UHI:
9336     case USI_FTYPE_V32QI_V32QI_USI:
9337     case UDI_FTYPE_V64QI_V64QI_UDI:
9338     case UQI_FTYPE_V8HI_V8HI_UQI:
9339     case UHI_FTYPE_V16HI_V16HI_UHI:
9340     case USI_FTYPE_V32HI_V32HI_USI:
9341     case UQI_FTYPE_V4SI_V4SI_UQI:
9342     case UQI_FTYPE_V8SI_V8SI_UQI:
9343     case UQI_FTYPE_V2DI_V2DI_UQI:
9344     case UQI_FTYPE_V4DI_V4DI_UQI:
9345     case V4SF_FTYPE_V2DF_V4SF_UQI:
9346     case V4SF_FTYPE_V4DF_V4SF_UQI:
9347     case V16SI_FTYPE_V16SI_V16SI_UHI:
9348     case V16SI_FTYPE_V4SI_V16SI_UHI:
9349     case V2DI_FTYPE_V4SI_V2DI_UQI:
9350     case V2DI_FTYPE_V8HI_V2DI_UQI:
9351     case V2DI_FTYPE_V16QI_V2DI_UQI:
9352     case V4DI_FTYPE_V4DI_V4DI_UQI:
9353     case V4DI_FTYPE_V4SI_V4DI_UQI:
9354     case V4DI_FTYPE_V8HI_V4DI_UQI:
9355     case V4DI_FTYPE_V16QI_V4DI_UQI:
9356     case V4DI_FTYPE_V4DF_V4DI_UQI:
9357     case V2DI_FTYPE_V2DF_V2DI_UQI:
9358     case V4SI_FTYPE_V4DF_V4SI_UQI:
9359     case V4SI_FTYPE_V2DF_V4SI_UQI:
9360     case V4SI_FTYPE_V8HI_V4SI_UQI:
9361     case V4SI_FTYPE_V16QI_V4SI_UQI:
9362     case V4DI_FTYPE_V4DI_V4DI_V4DI:
9363     case V8DF_FTYPE_V2DF_V8DF_UQI:
9364     case V8DF_FTYPE_V4DF_V8DF_UQI:
9365     case V8DF_FTYPE_V8DF_V8DF_UQI:
9366     case V8SF_FTYPE_V8SF_V8SF_UQI:
9367     case V8SF_FTYPE_V8SI_V8SF_UQI:
9368     case V4DF_FTYPE_V4DF_V4DF_UQI:
9369     case V4SF_FTYPE_V4SF_V4SF_UQI:
9370     case V2DF_FTYPE_V2DF_V2DF_UQI:
9371     case V2DF_FTYPE_V4SF_V2DF_UQI:
9372     case V2DF_FTYPE_V4SI_V2DF_UQI:
9373     case V4SF_FTYPE_V4SI_V4SF_UQI:
9374     case V4DF_FTYPE_V4SF_V4DF_UQI:
9375     case V4DF_FTYPE_V4SI_V4DF_UQI:
9376     case V8SI_FTYPE_V8SI_V8SI_UQI:
9377     case V8SI_FTYPE_V8HI_V8SI_UQI:
9378     case V8SI_FTYPE_V16QI_V8SI_UQI:
9379     case V8DF_FTYPE_V8SI_V8DF_UQI:
9380     case V8DI_FTYPE_DI_V8DI_UQI:
9381     case V16SF_FTYPE_V8SF_V16SF_UHI:
9382     case V16SI_FTYPE_V8SI_V16SI_UHI:
9383     case V16HI_FTYPE_V16HI_V16HI_UHI:
9384     case V8HI_FTYPE_V16QI_V8HI_UQI:
9385     case V16HI_FTYPE_V16QI_V16HI_UHI:
9386     case V32HI_FTYPE_V32HI_V32HI_USI:
9387     case V32HI_FTYPE_V32QI_V32HI_USI:
9388     case V8DI_FTYPE_V16QI_V8DI_UQI:
9389     case V8DI_FTYPE_V2DI_V8DI_UQI:
9390     case V8DI_FTYPE_V4DI_V8DI_UQI:
9391     case V8DI_FTYPE_V8DI_V8DI_UQI:
9392     case V8DI_FTYPE_V8HI_V8DI_UQI:
9393     case V8DI_FTYPE_V8SI_V8DI_UQI:
9394     case V8HI_FTYPE_V8DI_V8HI_UQI:
9395     case V8SI_FTYPE_V8DI_V8SI_UQI:
9396     case V4SI_FTYPE_V4SI_V4SI_V4SI:
9397     case V16SI_FTYPE_V16SI_V16SI_V16SI:
9398     case V8DI_FTYPE_V8DI_V8DI_V8DI:
9399     case V32HI_FTYPE_V32HI_V32HI_V32HI:
9400     case V2DI_FTYPE_V2DI_V2DI_V2DI:
9401     case V16HI_FTYPE_V16HI_V16HI_V16HI:
9402     case V8SI_FTYPE_V8SI_V8SI_V8SI:
9403     case V8HI_FTYPE_V8HI_V8HI_V8HI:
9404     case V32HI_FTYPE_V16SF_V16SF_USI:
9405     case V16HI_FTYPE_V8SF_V8SF_UHI:
9406     case V8HI_FTYPE_V4SF_V4SF_UQI:
9407     case V16HI_FTYPE_V16SF_V16HI_UHI:
9408     case V8HI_FTYPE_V8SF_V8HI_UQI:
9409     case V8HI_FTYPE_V4SF_V8HI_UQI:
9410     case V16SF_FTYPE_V16SF_V32HI_V32HI:
9411     case V8SF_FTYPE_V8SF_V16HI_V16HI:
9412     case V4SF_FTYPE_V4SF_V8HI_V8HI:
9413       nargs = 3;
9414       break;
9415     case V32QI_FTYPE_V32QI_V32QI_INT:
9416     case V16HI_FTYPE_V16HI_V16HI_INT:
9417     case V16QI_FTYPE_V16QI_V16QI_INT:
9418     case V4DI_FTYPE_V4DI_V4DI_INT:
9419     case V8HI_FTYPE_V8HI_V8HI_INT:
9420     case V8SI_FTYPE_V8SI_V8SI_INT:
9421     case V8SI_FTYPE_V8SI_V4SI_INT:
9422     case V8SF_FTYPE_V8SF_V8SF_INT:
9423     case V8SF_FTYPE_V8SF_V4SF_INT:
9424     case V4SI_FTYPE_V4SI_V4SI_INT:
9425     case V4DF_FTYPE_V4DF_V4DF_INT:
9426     case V16SF_FTYPE_V16SF_V16SF_INT:
9427     case V16SF_FTYPE_V16SF_V4SF_INT:
9428     case V16SI_FTYPE_V16SI_V4SI_INT:
9429     case V4DF_FTYPE_V4DF_V2DF_INT:
9430     case V4SF_FTYPE_V4SF_V4SF_INT:
9431     case V2DI_FTYPE_V2DI_V2DI_INT:
9432     case V4DI_FTYPE_V4DI_V2DI_INT:
9433     case V2DF_FTYPE_V2DF_V2DF_INT:
9434     case UQI_FTYPE_V8DI_V8UDI_INT:
9435     case UQI_FTYPE_V8DF_V8DF_INT:
9436     case UQI_FTYPE_V2DF_V2DF_INT:
9437     case UQI_FTYPE_V4SF_V4SF_INT:
9438     case UHI_FTYPE_V16SI_V16SI_INT:
9439     case UHI_FTYPE_V16SF_V16SF_INT:
9440     case V64QI_FTYPE_V64QI_V64QI_INT:
9441     case V32HI_FTYPE_V32HI_V32HI_INT:
9442     case V16SI_FTYPE_V16SI_V16SI_INT:
9443     case V8DI_FTYPE_V8DI_V8DI_INT:
9444       nargs = 3;
9445       nargs_constant = 1;
9446       break;
9447     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9448       nargs = 3;
9449       rmode = V4DImode;
9450       nargs_constant = 1;
9451       break;
9452     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9453       nargs = 3;
9454       rmode = V2DImode;
9455       nargs_constant = 1;
9456       break;
9457     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9458       nargs = 3;
9459       rmode = DImode;
9460       nargs_constant = 1;
9461       break;
9462     case V2DI_FTYPE_V2DI_UINT_UINT:
9463       nargs = 3;
9464       nargs_constant = 2;
9465       break;
9466     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9467       nargs = 3;
9468       rmode = V8DImode;
9469       nargs_constant = 1;
9470       break;
9471     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9472       nargs = 5;
9473       rmode = V8DImode;
9474       mask_pos = 2;
9475       nargs_constant = 1;
9476       break;
9477     case QI_FTYPE_V8DF_INT_UQI:
9478     case QI_FTYPE_V4DF_INT_UQI:
9479     case QI_FTYPE_V2DF_INT_UQI:
9480     case HI_FTYPE_V16SF_INT_UHI:
9481     case QI_FTYPE_V8SF_INT_UQI:
9482     case QI_FTYPE_V4SF_INT_UQI:
9483     case V4SI_FTYPE_V4SI_V4SI_UHI:
9484     case V8SI_FTYPE_V8SI_V8SI_UHI:
9485       nargs = 3;
9486       mask_pos = 1;
9487       nargs_constant = 1;
9488       break;
9489     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9490       nargs = 5;
9491       rmode = V4DImode;
9492       mask_pos = 2;
9493       nargs_constant = 1;
9494       break;
9495     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9496       nargs = 5;
9497       rmode = V2DImode;
9498       mask_pos = 2;
9499       nargs_constant = 1;
9500       break;
9501     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9502     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9503     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9504     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9505     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9506     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9507     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9508     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9509     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9510     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9511     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9512     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9513     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9514     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9515     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9516     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9517     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9518     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9519     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9520     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9521     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9522     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9523     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9524     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9525     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9526     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9527     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9528     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9529     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9530     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9531     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9532     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9533     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9534     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9535     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9536     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9537     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9538     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9539     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9540     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9541     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9542     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9543     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9544     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9545     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9546     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9547     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9548     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9549     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9550     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9551     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9552     case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9553     case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9554     case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9555       nargs = 4;
9556       break;
9557     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9558     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9559     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9560     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9561     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9562       nargs = 4;
9563       nargs_constant = 1;
9564       break;
9565     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9566     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9567     case QI_FTYPE_V4DF_V4DF_INT_UQI:
9568     case QI_FTYPE_V8SF_V8SF_INT_UQI:
9569     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9570     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9571     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9572     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9573     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9574     case USI_FTYPE_V32QI_V32QI_INT_USI:
9575     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9576     case USI_FTYPE_V32HI_V32HI_INT_USI:
9577     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9578     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9579       nargs = 4;
9580       mask_pos = 1;
9581       nargs_constant = 1;
9582       break;
9583     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9584       nargs = 4;
9585       nargs_constant = 2;
9586       break;
9587     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9588     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9589     case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9590     case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9591     case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9592       nargs = 4;
9593       break;
9594     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9595     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9596       mask_pos = 1;
9597       nargs = 4;
9598       nargs_constant = 1;
9599       break;
9600     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9601     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9602     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9603     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9604     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9605     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9606     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9607     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9608     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9609     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9610     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9611     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9612     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9613     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9614     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9615     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9616     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9617     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9618     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9619     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9620     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9621     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9622     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9623     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9624     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9625     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9626     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9627     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9628     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9629     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9630       nargs = 4;
9631       mask_pos = 2;
9632       nargs_constant = 1;
9633       break;
9634     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9635     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9636     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9637     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9638     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9639     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9640     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9641     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9642     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9643     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9644     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9645     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9646     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9647     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9648     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9649     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9650     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9651     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9652     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9653     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9654     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9655     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9656     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9657     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9658     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9659     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9660     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9661       nargs = 5;
9662       mask_pos = 2;
9663       nargs_constant = 1;
9664       break;
9665     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9666     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9667     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9668     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9669     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9670     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9671     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9672     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9673     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9674     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9675       nargs = 5;
9676       mask_pos = 1;
9677       nargs_constant = 1;
9678       break;
9679     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9680     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9681     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9682     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9683     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9684     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9685     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9686     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9687     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9688     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9689     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9690     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9691       nargs = 5;
9692       mask_pos = 1;
9693       nargs_constant = 2;
9694       break;
9695 
9696     default:
9697       gcc_unreachable ();
9698     }
9699 
9700   gcc_assert (nargs <= ARRAY_SIZE (args));
9701 
9702   if (comparison != UNKNOWN)
9703     {
9704       gcc_assert (nargs == 2);
9705       return ix86_expand_sse_compare (d, exp, target, swap);
9706     }
9707 
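  /* When rmode is set and differs from the insn's result mode, emit the
     insn into a fresh tmode register and return its rmode lowpart rather
     than using the caller-supplied target directly.  */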
9708   if (rmode == VOIDmode || rmode == tmode)
9709     {
9710       if (optimize
9711 	  || target == 0
9712 	  || GET_MODE (target) != tmode
9713 	  || !insn_p->operand[0].predicate (target, tmode))
9714 	target = gen_reg_rtx (tmode);
9715       else if (memory_operand (target, tmode))
9716 	num_memory++;
9717       real_target = target;
9718     }
9719   else
9720     {
9721       real_target = gen_reg_rtx (tmode);
9722       target = lowpart_subreg (rmode, real_target, tmode);
9723     }
9724 
9725   for (i = 0; i < nargs; i++)
9726     {
9727       tree arg = CALL_EXPR_ARG (exp, i);
9728       rtx op = expand_normal (arg);
9729       machine_mode mode = insn_p->operand[i + 1].mode;
9730       bool match = insn_p->operand[i + 1].predicate (op, mode);
9731 
9732       if (second_arg_count && i == 1)
9733 	{
9734 	  /* SIMD shift insns take either an 8-bit immediate or a
9735 	     register as the count, but the builtin functions take an
9736 	     int.  If the count does not match, put it in a register.
9737 	     The instructions use a 64-bit count; if op is only
9738 	     32-bit, zero-extend it, since negative shift counts are
9739 	     undefined behavior and zero-extension is more
9740 	     efficient.  */
9741 	  if (!match)
9742 	    {
9743 	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
9744 		op = convert_modes (mode, GET_MODE (op), op, 1);
9745 	      else
9746 		op = lowpart_subreg (mode, op, GET_MODE (op));
9747 	      if (!insn_p->operand[i + 1].predicate (op, mode))
9748 		op = copy_to_reg (op);
9749 	    }
9750 	}
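      /* This argument position is required to be an immediate.  If the
         operand does not already satisfy the insn predicate, diagnose the
         expected immediate width below rather than forcing a register.  */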
9751       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9752 	       || (!mask_pos && (nargs - i) <= nargs_constant))
9753 	{
9754 	  if (!match)
9755 	    switch (icode)
9756 	      {
9757 	      case CODE_FOR_avx_vinsertf128v4di:
9758 	      case CODE_FOR_avx_vextractf128v4di:
9759 		error ("the last argument must be a 1-bit immediate");
9760 		return const0_rtx;
9761 
9762 	      case CODE_FOR_avx512f_cmpv8di3_mask:
9763 	      case CODE_FOR_avx512f_cmpv16si3_mask:
9764 	      case CODE_FOR_avx512f_ucmpv8di3_mask:
9765 	      case CODE_FOR_avx512f_ucmpv16si3_mask:
9766 	      case CODE_FOR_avx512vl_cmpv4di3_mask:
9767 	      case CODE_FOR_avx512vl_cmpv8si3_mask:
9768 	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
9769 	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
9770 	      case CODE_FOR_avx512vl_cmpv2di3_mask:
9771 	      case CODE_FOR_avx512vl_cmpv4si3_mask:
9772 	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
9773 	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
9774 		error ("the last argument must be a 3-bit immediate");
9775 		return const0_rtx;
9776 
9777 	      case CODE_FOR_sse4_1_roundsd:
9778 	      case CODE_FOR_sse4_1_roundss:
9779 
9780 	      case CODE_FOR_sse4_1_roundpd:
9781 	      case CODE_FOR_sse4_1_roundps:
9782 	      case CODE_FOR_avx_roundpd256:
9783 	      case CODE_FOR_avx_roundps256:
9784 
9785 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9786 	      case CODE_FOR_sse4_1_roundps_sfix:
9787 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9788 	      case CODE_FOR_avx_roundps_sfix256:
9789 
9790 	      case CODE_FOR_sse4_1_blendps:
9791 	      case CODE_FOR_avx_blendpd256:
9792 	      case CODE_FOR_avx_vpermilv4df:
9793 	      case CODE_FOR_avx_vpermilv4df_mask:
9794 	      case CODE_FOR_avx512f_getmantv8df_mask:
9795 	      case CODE_FOR_avx512f_getmantv16sf_mask:
9796 	      case CODE_FOR_avx512vl_getmantv8sf_mask:
9797 	      case CODE_FOR_avx512vl_getmantv4df_mask:
9798 	      case CODE_FOR_avx512vl_getmantv4sf_mask:
9799 	      case CODE_FOR_avx512vl_getmantv2df_mask:
9800 	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
9801 	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9802 	      case CODE_FOR_avx512dq_rangepv4df_mask:
9803 	      case CODE_FOR_avx512dq_rangepv8sf_mask:
9804 	      case CODE_FOR_avx512dq_rangepv2df_mask:
9805 	      case CODE_FOR_avx512dq_rangepv4sf_mask:
9806 	      case CODE_FOR_avx_shufpd256_mask:
9807 		error ("the last argument must be a 4-bit immediate");
9808 		return const0_rtx;
9809 
9810 	      case CODE_FOR_sha1rnds4:
9811 	      case CODE_FOR_sse4_1_blendpd:
9812 	      case CODE_FOR_avx_vpermilv2df:
9813 	      case CODE_FOR_avx_vpermilv2df_mask:
9814 	      case CODE_FOR_xop_vpermil2v2df3:
9815 	      case CODE_FOR_xop_vpermil2v4sf3:
9816 	      case CODE_FOR_xop_vpermil2v4df3:
9817 	      case CODE_FOR_xop_vpermil2v8sf3:
9818 	      case CODE_FOR_avx512f_vinsertf32x4_mask:
9819 	      case CODE_FOR_avx512f_vinserti32x4_mask:
9820 	      case CODE_FOR_avx512f_vextractf32x4_mask:
9821 	      case CODE_FOR_avx512f_vextracti32x4_mask:
9822 	      case CODE_FOR_sse2_shufpd:
9823 	      case CODE_FOR_sse2_shufpd_mask:
9824 	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
9825 	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
9826 	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
9827 	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
9828 		error ("the last argument must be a 2-bit immediate");
9829 		return const0_rtx;
9830 
9831 	      case CODE_FOR_avx_vextractf128v4df:
9832 	      case CODE_FOR_avx_vextractf128v8sf:
9833 	      case CODE_FOR_avx_vextractf128v8si:
9834 	      case CODE_FOR_avx_vinsertf128v4df:
9835 	      case CODE_FOR_avx_vinsertf128v8sf:
9836 	      case CODE_FOR_avx_vinsertf128v8si:
9837 	      case CODE_FOR_avx512f_vinsertf64x4_mask:
9838 	      case CODE_FOR_avx512f_vinserti64x4_mask:
9839 	      case CODE_FOR_avx512f_vextractf64x4_mask:
9840 	      case CODE_FOR_avx512f_vextracti64x4_mask:
9841 	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
9842 	      case CODE_FOR_avx512dq_vinserti32x8_mask:
9843 	      case CODE_FOR_avx512vl_vinsertv4df:
9844 	      case CODE_FOR_avx512vl_vinsertv4di:
9845 	      case CODE_FOR_avx512vl_vinsertv8sf:
9846 	      case CODE_FOR_avx512vl_vinsertv8si:
9847 		error ("the last argument must be a 1-bit immediate");
9848 		return const0_rtx;
9849 
9850 	      case CODE_FOR_avx_vmcmpv2df3:
9851 	      case CODE_FOR_avx_vmcmpv4sf3:
9852 	      case CODE_FOR_avx_cmpv2df3:
9853 	      case CODE_FOR_avx_cmpv4sf3:
9854 	      case CODE_FOR_avx_cmpv4df3:
9855 	      case CODE_FOR_avx_cmpv8sf3:
9856 	      case CODE_FOR_avx512f_cmpv8df3_mask:
9857 	      case CODE_FOR_avx512f_cmpv16sf3_mask:
9858 	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
9859 	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9860 		error ("the last argument must be a 5-bit immediate");
9861 		return const0_rtx;
9862 
9863 	      default:
9864 		switch (nargs_constant)
9865 		  {
9866 		  case 2:
9867 		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9868 			|| (!mask_pos && (nargs - i) == nargs_constant))
9869 		      {
9870 			error ("the next to last argument must be an 8-bit immediate");
9871 			break;
9872 		      }
9873 		    /* FALLTHRU */
9874 		  case 1:
9875 		    error ("the last argument must be an 8-bit immediate");
9876 		    break;
9877 		  default:
9878 		    gcc_unreachable ();
9879 		  }
9880 		return const0_rtx;
9881 	      }
9882 	}
9883       else
9884 	{
9885 	  if (VECTOR_MODE_P (mode))
9886 	    op = safe_vector_operand (op, mode);
9887 
9888 	  /* If we aren't optimizing, only allow one memory operand to
9889 	     be generated.  */
9890 	  if (memory_operand (op, mode))
9891 	    num_memory++;
9892 
9893 	  op = fixup_modeless_constant (op, mode);
9894 
9895 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9896 	    {
9897 	      if (optimize || !match || num_memory > 1)
9898 		op = copy_to_mode_reg (mode, op);
9899 	    }
9900 	  else
9901 	    {
9902 	      op = copy_to_reg (op);
9903 	      op = lowpart_subreg (mode, op, GET_MODE (op));
9904 	    }
9905 	}
9906 
9907       args[i].op = op;
9908       args[i].mode = mode;
9909     }
9910 
9911   switch (nargs)
9912     {
9913     case 1:
9914       pat = GEN_FCN (icode) (real_target, args[0].op);
9915       break;
9916     case 2:
9917       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9918       break;
9919     case 3:
9920       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9921 			     args[2].op);
9922       break;
9923     case 4:
9924       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9925 			     args[2].op, args[3].op);
9926       break;
9927     case 5:
9928       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9929 			     args[2].op, args[3].op, args[4].op);
9930       break;
9931     case 6:
9932       pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9933 			     args[2].op, args[3].op, args[4].op,
9934 			     args[5].op);
9935       break;
9936     default:
9937       gcc_unreachable ();
9938     }
9939 
9940   if (! pat)
9941     return 0;
9942 
9943   emit_insn (pat);
9944   return target;
9945 }
9946 
9947 /* Transform a pattern of the following layout:
9948      (set A
9949        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)
9950      )
9951    into:
9952      (set A B)  */
9953 
9954 static rtx
9955 ix86_erase_embedded_rounding (rtx pat)
9956 {
9957   if (GET_CODE (pat) == INSN)
9958     pat = PATTERN (pat);
9959 
9960   gcc_assert (GET_CODE (pat) == SET);
9961   rtx src = SET_SRC (pat);
9962   gcc_assert (XVECLEN (src, 0) == 2);
9963   rtx p0 = XVECEXP (src, 0, 0);
9964   gcc_assert (GET_CODE (src) == UNSPEC
9965 	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9966   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9967   return res;
9968 }
9969 
9970 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9971    with rounding.  */
9972 static rtx
9973 ix86_expand_sse_comi_round (const struct builtin_description *d,
9974 			    tree exp, rtx target)
9975 {
9976   rtx pat, set_dst;
9977   tree arg0 = CALL_EXPR_ARG (exp, 0);
9978   tree arg1 = CALL_EXPR_ARG (exp, 1);
9979   tree arg2 = CALL_EXPR_ARG (exp, 2);
9980   tree arg3 = CALL_EXPR_ARG (exp, 3);
9981   rtx op0 = expand_normal (arg0);
9982   rtx op1 = expand_normal (arg1);
9983   rtx op2 = expand_normal (arg2);
9984   rtx op3 = expand_normal (arg3);
9985   enum insn_code icode = d->icode;
9986   const struct insn_data_d *insn_p = &insn_data[icode];
9987   machine_mode mode0 = insn_p->operand[0].mode;
9988   machine_mode mode1 = insn_p->operand[1].mode;
9989 
9990   /* See avxintrin.h for values.  */
9991   static const enum rtx_code comparisons[32] =
9992     {
9993       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9994       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9995       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9996       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
9997     };
9998   static const bool ordereds[32] =
9999     {
10000       true,  true,  true,  false, false, false, false, true,
10001       false, false, false, true,  true,  true,  true,  false,
10002       true,  true,  true,  false, false, false, false, true,
10003       false, false, false, true,  true,  true,  true,  false
10004     };
10005   static const bool non_signalings[32] =
10006     {
10007       true,  false, false, true,  true,  false, false, true,
10008       true,  false, false, true,  true,  false, false, true,
10009       false, true,  true,  false, false, true,  true,  false,
10010       false, true,  true,  false, false, true,  true,  false
10011     };
10012 
10013   if (!CONST_INT_P (op2))
10014     {
10015       error ("the third argument must be a comparison constant");
10016       return const0_rtx;
10017     }
10018   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10019     {
10020       error ("incorrect comparison mode");
10021       return const0_rtx;
10022     }
10023 
10024   if (!insn_p->operand[2].predicate (op3, SImode))
10025     {
10026       error ("incorrect rounding operand");
10027       return const0_rtx;
10028     }
10029 
10030   if (VECTOR_MODE_P (mode0))
10031     op0 = safe_vector_operand (op0, mode0);
10032   if (VECTOR_MODE_P (mode1))
10033     op1 = safe_vector_operand (op1, mode1);
10034 
10035   enum rtx_code comparison = comparisons[INTVAL (op2)];
10036   bool ordered = ordereds[INTVAL (op2)];
10037   bool non_signaling = non_signalings[INTVAL (op2)];
10038   rtx const_val = const0_rtx;
10039 
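  /* Map the comparison predicate onto a flags test: choose which CC mode
     (a subset of CCFPmode) to check and whether an extra unordered check
     is needed for EQ/NE with possible NaN operands.  */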
10040   bool check_unordered = false;
10041   machine_mode mode = CCFPmode;
10042   switch (comparison)
10043     {
10044     case ORDERED:
10045       if (!ordered)
10046 	{
10047 	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
10048 	  if (!non_signaling)
10049 	    ordered = true;
10050 	  mode = CCSmode;
10051 	}
10052       else
10053 	{
10054 	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
10055 	  if (non_signaling)
10056 	    ordered = false;
10057 	  mode = CCPmode;
10058 	}
10059       comparison = NE;
10060       break;
10061     case UNORDERED:
10062       if (ordered)
10063 	{
10064 	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
10065 	  if (non_signaling)
10066 	    ordered = false;
10067 	  mode = CCSmode;
10068 	}
10069       else
10070 	{
10071 	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
10072 	  if (!non_signaling)
10073 	    ordered = true;
10074 	  mode = CCPmode;
10075 	}
10076       comparison = EQ;
10077       break;
10078 
10079     case LE:	/* -> GE  */
10080     case LT:	/* -> GT  */
10081     case UNGE:	/* -> UNLE  */
10082     case UNGT:	/* -> UNLT  */
10083       std::swap (op0, op1);
10084       comparison = swap_condition (comparison);
10085       /* FALLTHRU */
10086     case GT:
10087     case GE:
10088     case UNEQ:
10089     case UNLT:
10090     case UNLE:
10091     case LTGT:
10092       /* These are supported by CCFPmode.  NB: Use ordered/signaling
10093 	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
10094 	 with NAN operands.  */
10095       if (ordered == non_signaling)
10096 	ordered = !ordered;
10097       break;
10098     case EQ:
10099       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
10100 	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
10101       check_unordered = true;
10102       mode = CCZmode;
10103       break;
10104     case NE:
10105       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
10106 	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
10107       gcc_assert (!ordered);
10108       check_unordered = true;
10109       mode = CCZmode;
10110       const_val = const1_rtx;
10111       break;
10112     default:
10113       gcc_unreachable ();
10114     }
10115 
10116   target = gen_reg_rtx (SImode);
10117   emit_move_insn (target, const_val);
10118   target = gen_rtx_SUBREG (QImode, target, 0);
10119 
10120   if ((optimize && !register_operand (op0, mode0))
10121       || !insn_p->operand[0].predicate (op0, mode0))
10122     op0 = copy_to_mode_reg (mode0, op0);
10123   if ((optimize && !register_operand (op1, mode1))
10124       || !insn_p->operand[1].predicate (op1, mode1))
10125     op1 = copy_to_mode_reg (mode1, op1);
10126 
10127   /*
10128      1. COMI: ordered and signaling.
10129      2. UCOMI: unordered and non-signaling.
10130    */
10131   if (non_signaling)
10132     icode = (icode == CODE_FOR_sse_comi_round
10133 	     ? CODE_FOR_sse_ucomi_round
10134 	     : CODE_FOR_sse2_ucomi_round);
10135 
10136   pat = GEN_FCN (icode) (op0, op1, op3);
10137   if (! pat)
10138     return 0;
10139 
10140   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
10141   if (INTVAL (op3) == NO_ROUND)
10142     {
10143       pat = ix86_erase_embedded_rounding (pat);
10144       if (! pat)
10145 	return 0;
10146 
10147       set_dst = SET_DEST (pat);
10148     }
10149   else
10150     {
10151       gcc_assert (GET_CODE (pat) == SET);
10152       set_dst = SET_DEST (pat);
10153     }
10154 
10155   emit_insn (pat);
10156 
10157   rtx_code_label *label = NULL;
10158 
10159   /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
10160      with NAN operands.  */
10161   if (check_unordered)
10162     {
10163       gcc_assert (comparison == EQ || comparison == NE);
10164 
10165       rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10166       label = gen_label_rtx ();
10167       rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10168       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10169 				  gen_rtx_LABEL_REF (VOIDmode, label),
10170 				  pc_rtx);
10171       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10172     }
10173 
10174   /* NB: Set CCFPmode and check a different CCmode which is in subset
10175      of CCFPmode.  */
10176   if (GET_MODE (set_dst) != mode)
10177     {
10178       gcc_assert (mode == CCAmode || mode == CCCmode
10179 		  || mode == CCOmode || mode == CCPmode
10180 		  || mode == CCSmode || mode == CCZmode);
10181       set_dst = gen_rtx_REG (mode, FLAGS_REG);
10182     }
10183 
10184   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10185 			  gen_rtx_fmt_ee (comparison, QImode,
10186 					  set_dst,
10187 					  const0_rtx)));
10188 
10189   if (label)
10190     emit_label (label);
10191 
10192   return SUBREG_REG (target);
10193 }
10194 
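/* Subroutine of ix86_expand_builtin to take care of insns with rounding:
   the last argument is the embedded rounding/SAE immediate; when it is
   NO_ROUND the rounding unspec is stripped again via
   ix86_erase_embedded_rounding.  */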
10195 static rtx
10196 ix86_expand_round_builtin (const struct builtin_description *d,
10197 			   tree exp, rtx target)
10198 {
10199   rtx pat;
10200   unsigned int i, nargs;
10201   struct
10202     {
10203       rtx op;
10204       machine_mode mode;
10205     } args[6];
10206   enum insn_code icode = d->icode;
10207   const struct insn_data_d *insn_p = &insn_data[icode];
10208   machine_mode tmode = insn_p->operand[0].mode;
10209   unsigned int nargs_constant = 0;
10210   unsigned int redundant_embed_rnd = 0;
10211 
10212   switch ((enum ix86_builtin_func_type) d->flag)
10213     {
10214     case UINT64_FTYPE_V2DF_INT:
10215     case UINT64_FTYPE_V4SF_INT:
10216     case UINT_FTYPE_V2DF_INT:
10217     case UINT_FTYPE_V4SF_INT:
10218     case INT64_FTYPE_V2DF_INT:
10219     case INT64_FTYPE_V4SF_INT:
10220     case INT_FTYPE_V2DF_INT:
10221     case INT_FTYPE_V4SF_INT:
10222       nargs = 2;
10223       break;
10224     case V4SF_FTYPE_V4SF_UINT_INT:
10225     case V4SF_FTYPE_V4SF_UINT64_INT:
10226     case V2DF_FTYPE_V2DF_UINT64_INT:
10227     case V4SF_FTYPE_V4SF_INT_INT:
10228     case V4SF_FTYPE_V4SF_INT64_INT:
10229     case V2DF_FTYPE_V2DF_INT64_INT:
10230     case V4SF_FTYPE_V4SF_V4SF_INT:
10231     case V2DF_FTYPE_V2DF_V2DF_INT:
10232     case V4SF_FTYPE_V4SF_V2DF_INT:
10233     case V2DF_FTYPE_V2DF_V4SF_INT:
10234       nargs = 3;
10235       break;
10236     case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10237     case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10238     case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10239     case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10240     case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10241     case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10242     case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10243     case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10244     case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10245     case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10246     case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10247     case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10248     case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10249     case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10250       nargs = 4;
10251       break;
10252     case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10253     case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10254       nargs_constant = 2;
10255       nargs = 4;
10256       break;
10257     case INT_FTYPE_V4SF_V4SF_INT_INT:
10258     case INT_FTYPE_V2DF_V2DF_INT_INT:
10259       return ix86_expand_sse_comi_round (d, exp, target);
10260     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10261     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10262     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10263     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10264     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10265     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10266     case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10267     case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10268       nargs = 5;
10269       break;
10270     case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10271     case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10272       nargs_constant = 4;
10273       nargs = 5;
10274       break;
10275     case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10276     case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10277     case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10278     case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10279       nargs_constant = 3;
10280       nargs = 5;
10281       break;
10282     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10283     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10284     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10285     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10286     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10287     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10288       nargs = 6;
10289       nargs_constant = 4;
10290       break;
10291     case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10292     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10293     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10294     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10295       nargs = 6;
10296       nargs_constant = 3;
10297       break;
10298     default:
10299       gcc_unreachable ();
10300     }
10301   gcc_assert (nargs <= ARRAY_SIZE (args));
10302 
10303   if (optimize
10304       || target == 0
10305       || GET_MODE (target) != tmode
10306       || !insn_p->operand[0].predicate (target, tmode))
10307     target = gen_reg_rtx (tmode);
10308 
10309   for (i = 0; i < nargs; i++)
10310     {
10311       tree arg = CALL_EXPR_ARG (exp, i);
10312       rtx op = expand_normal (arg);
10313       machine_mode mode = insn_p->operand[i + 1].mode;
10314       bool match = insn_p->operand[i + 1].predicate (op, mode);
10315 
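      /* Argument nargs - nargs_constant is the narrow immediate operand;
	 the last argument (i == nargs - 1) is the rounding/SAE control.  */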
10316       if (i == nargs - nargs_constant)
10317 	{
10318 	  if (!match)
10319 	    {
10320 	      switch (icode)
10321 		{
10322 		case CODE_FOR_avx512f_getmantv8df_mask_round:
10323 		case CODE_FOR_avx512f_getmantv16sf_mask_round:
10324 		case CODE_FOR_avx512f_vgetmantv2df_round:
10325 		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10326 		case CODE_FOR_avx512f_vgetmantv4sf_round:
10327 		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10328 		  error ("the immediate argument must be a 4-bit immediate");
10329 		  return const0_rtx;
10330 		case CODE_FOR_avx512f_cmpv8df3_mask_round:
10331 		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10332 		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10333 		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10334 		  error ("the immediate argument must be a 5-bit immediate");
10335 		  return const0_rtx;
10336 		default:
10337 		  error ("the immediate argument must be an 8-bit immediate");
10338 		  return const0_rtx;
10339 		}
10340 	    }
10341 	}
10342       else if (i == nargs - 1)
10343 	{
10344 	  if (!insn_p->operand[nargs].predicate (op, SImode))
10345 	    {
10346 	      error ("incorrect rounding operand");
10347 	      return const0_rtx;
10348 	    }
10349 
10350 	  /* If there is no rounding, use the normal version of the pattern.  */
10351 	  if (INTVAL (op) == NO_ROUND)
10352 	    redundant_embed_rnd = 1;
10353 	}
10354       else
10355 	{
10356 	  if (VECTOR_MODE_P (mode))
10357 	    op = safe_vector_operand (op, mode);
10358 
10359 	  op = fixup_modeless_constant (op, mode);
10360 
10361 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10362 	    {
10363 	      if (optimize || !match)
10364 		op = copy_to_mode_reg (mode, op);
10365 	    }
10366 	  else
10367 	    {
10368 	      op = copy_to_reg (op);
10369 	      op = lowpart_subreg (mode, op, GET_MODE (op));
10370 	    }
10371 	}
10372 
10373       args[i].op = op;
10374       args[i].mode = mode;
10375     }
10376 
10377   switch (nargs)
10378     {
10379     case 1:
10380       pat = GEN_FCN (icode) (target, args[0].op);
10381       break;
10382     case 2:
10383       pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10384       break;
10385     case 3:
10386       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10387 			     args[2].op);
10388       break;
10389     case 4:
10390       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10391 			     args[2].op, args[3].op);
10392       break;
10393     case 5:
10394       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10395 			     args[2].op, args[3].op, args[4].op);
10396       break;
10397     case 6:
10398       pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10399 			     args[2].op, args[3].op, args[4].op,
10400 			     args[5].op);
10401       break;
10402     default:
10403       gcc_unreachable ();
10404     }
10405 
10406   if (!pat)
10407     return 0;
10408 
10409   if (redundant_embed_rnd)
10410     pat = ix86_erase_embedded_rounding (pat);
10411 
10412   emit_insn (pat);
10413   return target;
10414 }
10415 
10416 /* Subroutine of ix86_expand_builtin to take care of special insns
10417    with variable number of operands.  */
10418 
10419 static rtx
10420 ix86_expand_special_args_builtin (const struct builtin_description *d,
10421 				  tree exp, rtx target)
10422 {
10423   tree arg;
10424   rtx pat, op;
10425   unsigned int i, nargs, arg_adjust, memory;
10426   bool aligned_mem = false;
10427   struct
10428     {
10429       rtx op;
10430       machine_mode mode;
10431     } args[3];
10432   enum insn_code icode = d->icode;
10433   bool last_arg_constant = false;
10434   const struct insn_data_d *insn_p = &insn_data[icode];
10435   machine_mode tmode = insn_p->operand[0].mode;
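  /* Load builtins produce their result in TARGET; store builtins treat
     the first argument as the destination and the expansion returns 0.  */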
10436   enum { load, store } klass;
10437 
10438   switch ((enum ix86_builtin_func_type) d->flag)
10439     {
10440     case VOID_FTYPE_VOID:
10441       emit_insn (GEN_FCN (icode) (target));
10442       return 0;
10443     case VOID_FTYPE_UINT64:
10444     case VOID_FTYPE_UNSIGNED:
10445       nargs = 0;
10446       klass = store;
10447       memory = 0;
10448       break;
10449 
10450     case INT_FTYPE_VOID:
10451     case USHORT_FTYPE_VOID:
10452     case UINT64_FTYPE_VOID:
10453     case UINT_FTYPE_VOID:
10454     case UNSIGNED_FTYPE_VOID:
10455       nargs = 0;
10456       klass = load;
10457       memory = 0;
10458       break;
10459     case UINT64_FTYPE_PUNSIGNED:
10460     case V2DI_FTYPE_PV2DI:
10461     case V4DI_FTYPE_PV4DI:
10462     case V32QI_FTYPE_PCCHAR:
10463     case V16QI_FTYPE_PCCHAR:
10464     case V8SF_FTYPE_PCV4SF:
10465     case V8SF_FTYPE_PCFLOAT:
10466     case V4SF_FTYPE_PCFLOAT:
10467     case V4DF_FTYPE_PCV2DF:
10468     case V4DF_FTYPE_PCDOUBLE:
10469     case V2DF_FTYPE_PCDOUBLE:
10470     case VOID_FTYPE_PVOID:
10471     case V8DI_FTYPE_PV8DI:
10472       nargs = 1;
10473       klass = load;
10474       memory = 0;
10475       switch (icode)
10476 	{
10477 	case CODE_FOR_sse4_1_movntdqa:
10478 	case CODE_FOR_avx2_movntdqa:
10479 	case CODE_FOR_avx512f_movntdqa:
10480 	  aligned_mem = true;
10481 	  break;
10482 	default:
10483 	  break;
10484 	}
10485       break;
10486     case VOID_FTYPE_PV2SF_V4SF:
10487     case VOID_FTYPE_PV8DI_V8DI:
10488     case VOID_FTYPE_PV4DI_V4DI:
10489     case VOID_FTYPE_PV2DI_V2DI:
10490     case VOID_FTYPE_PCHAR_V32QI:
10491     case VOID_FTYPE_PCHAR_V16QI:
10492     case VOID_FTYPE_PFLOAT_V16SF:
10493     case VOID_FTYPE_PFLOAT_V8SF:
10494     case VOID_FTYPE_PFLOAT_V4SF:
10495     case VOID_FTYPE_PDOUBLE_V8DF:
10496     case VOID_FTYPE_PDOUBLE_V4DF:
10497     case VOID_FTYPE_PDOUBLE_V2DF:
10498     case VOID_FTYPE_PLONGLONG_LONGLONG:
10499     case VOID_FTYPE_PULONGLONG_ULONGLONG:
10500     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10501     case VOID_FTYPE_PINT_INT:
10502       nargs = 1;
10503       klass = store;
10504       /* Reserve memory operand for target.  */
10505       memory = ARRAY_SIZE (args);
10506       switch (icode)
10507 	{
10508 	/* These builtins and instructions require the memory
10509 	   to be properly aligned.  */
10510 	case CODE_FOR_avx_movntv4di:
10511 	case CODE_FOR_sse2_movntv2di:
10512 	case CODE_FOR_avx_movntv8sf:
10513 	case CODE_FOR_sse_movntv4sf:
10514 	case CODE_FOR_sse4a_vmmovntv4sf:
10515 	case CODE_FOR_avx_movntv4df:
10516 	case CODE_FOR_sse2_movntv2df:
10517 	case CODE_FOR_sse4a_vmmovntv2df:
10518 	case CODE_FOR_sse2_movntidi:
10519 	case CODE_FOR_sse_movntq:
10520 	case CODE_FOR_sse2_movntisi:
10521 	case CODE_FOR_avx512f_movntv16sf:
10522 	case CODE_FOR_avx512f_movntv8df:
10523 	case CODE_FOR_avx512f_movntv8di:
10524 	  aligned_mem = true;
10525 	  break;
10526 	default:
10527 	  break;
10528 	}
10529       break;
10530     case VOID_FTYPE_PVOID_PCVOID:
10531       nargs = 1;
10532       klass = store;
10533       memory = 0;
10534       break;
10535 
10536     case V4SF_FTYPE_V4SF_PCV2SF:
10537     case V2DF_FTYPE_V2DF_PCDOUBLE:
10538       nargs = 2;
10539       klass = load;
10540       memory = 1;
10541       break;
10542     case V8SF_FTYPE_PCV8SF_V8SI:
10543     case V4DF_FTYPE_PCV4DF_V4DI:
10544     case V4SF_FTYPE_PCV4SF_V4SI:
10545     case V2DF_FTYPE_PCV2DF_V2DI:
10546     case V8SI_FTYPE_PCV8SI_V8SI:
10547     case V4DI_FTYPE_PCV4DI_V4DI:
10548     case V4SI_FTYPE_PCV4SI_V4SI:
10549     case V2DI_FTYPE_PCV2DI_V2DI:
10550     case VOID_FTYPE_INT_INT64:
10551       nargs = 2;
10552       klass = load;
10553       memory = 0;
10554       break;
10555     case VOID_FTYPE_PV8DF_V8DF_UQI:
10556     case VOID_FTYPE_PV4DF_V4DF_UQI:
10557     case VOID_FTYPE_PV2DF_V2DF_UQI:
10558     case VOID_FTYPE_PV16SF_V16SF_UHI:
10559     case VOID_FTYPE_PV8SF_V8SF_UQI:
10560     case VOID_FTYPE_PV4SF_V4SF_UQI:
10561     case VOID_FTYPE_PV8DI_V8DI_UQI:
10562     case VOID_FTYPE_PV4DI_V4DI_UQI:
10563     case VOID_FTYPE_PV2DI_V2DI_UQI:
10564     case VOID_FTYPE_PV16SI_V16SI_UHI:
10565     case VOID_FTYPE_PV8SI_V8SI_UQI:
10566     case VOID_FTYPE_PV4SI_V4SI_UQI:
10567     case VOID_FTYPE_PV64QI_V64QI_UDI:
10568     case VOID_FTYPE_PV32HI_V32HI_USI:
10569     case VOID_FTYPE_PV32QI_V32QI_USI:
10570     case VOID_FTYPE_PV16QI_V16QI_UHI:
10571     case VOID_FTYPE_PV16HI_V16HI_UHI:
10572     case VOID_FTYPE_PV8HI_V8HI_UQI:
10573       switch (icode)
10574 	{
10575 	/* These builtins and instructions require the memory
10576 	   to be properly aligned.  */
10577 	case CODE_FOR_avx512f_storev16sf_mask:
10578 	case CODE_FOR_avx512f_storev16si_mask:
10579 	case CODE_FOR_avx512f_storev8df_mask:
10580 	case CODE_FOR_avx512f_storev8di_mask:
10581 	case CODE_FOR_avx512vl_storev8sf_mask:
10582 	case CODE_FOR_avx512vl_storev8si_mask:
10583 	case CODE_FOR_avx512vl_storev4df_mask:
10584 	case CODE_FOR_avx512vl_storev4di_mask:
10585 	case CODE_FOR_avx512vl_storev4sf_mask:
10586 	case CODE_FOR_avx512vl_storev4si_mask:
10587 	case CODE_FOR_avx512vl_storev2df_mask:
10588 	case CODE_FOR_avx512vl_storev2di_mask:
10589 	  aligned_mem = true;
10590 	  break;
10591 	default:
10592 	  break;
10593 	}
10594       /* FALLTHRU */
10595     case VOID_FTYPE_PV8SF_V8SI_V8SF:
10596     case VOID_FTYPE_PV4DF_V4DI_V4DF:
10597     case VOID_FTYPE_PV4SF_V4SI_V4SF:
10598     case VOID_FTYPE_PV2DF_V2DI_V2DF:
10599     case VOID_FTYPE_PV8SI_V8SI_V8SI:
10600     case VOID_FTYPE_PV4DI_V4DI_V4DI:
10601     case VOID_FTYPE_PV4SI_V4SI_V4SI:
10602     case VOID_FTYPE_PV2DI_V2DI_V2DI:
10603     case VOID_FTYPE_PV8SI_V8DI_UQI:
10604     case VOID_FTYPE_PV8HI_V8DI_UQI:
10605     case VOID_FTYPE_PV16HI_V16SI_UHI:
10606     case VOID_FTYPE_PV16QI_V8DI_UQI:
10607     case VOID_FTYPE_PV16QI_V16SI_UHI:
10608     case VOID_FTYPE_PV4SI_V4DI_UQI:
10609     case VOID_FTYPE_PV4SI_V2DI_UQI:
10610     case VOID_FTYPE_PV8HI_V4DI_UQI:
10611     case VOID_FTYPE_PV8HI_V2DI_UQI:
10612     case VOID_FTYPE_PV8HI_V8SI_UQI:
10613     case VOID_FTYPE_PV8HI_V4SI_UQI:
10614     case VOID_FTYPE_PV16QI_V4DI_UQI:
10615     case VOID_FTYPE_PV16QI_V2DI_UQI:
10616     case VOID_FTYPE_PV16QI_V8SI_UQI:
10617     case VOID_FTYPE_PV16QI_V4SI_UQI:
10618     case VOID_FTYPE_PCHAR_V64QI_UDI:
10619     case VOID_FTYPE_PCHAR_V32QI_USI:
10620     case VOID_FTYPE_PCHAR_V16QI_UHI:
10621     case VOID_FTYPE_PSHORT_V32HI_USI:
10622     case VOID_FTYPE_PSHORT_V16HI_UHI:
10623     case VOID_FTYPE_PSHORT_V8HI_UQI:
10624     case VOID_FTYPE_PINT_V16SI_UHI:
10625     case VOID_FTYPE_PINT_V8SI_UQI:
10626     case VOID_FTYPE_PINT_V4SI_UQI:
10627     case VOID_FTYPE_PINT64_V8DI_UQI:
10628     case VOID_FTYPE_PINT64_V4DI_UQI:
10629     case VOID_FTYPE_PINT64_V2DI_UQI:
10630     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10631     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10632     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10633     case VOID_FTYPE_PFLOAT_V16SF_UHI:
10634     case VOID_FTYPE_PFLOAT_V8SF_UQI:
10635     case VOID_FTYPE_PFLOAT_V4SF_UQI:
10636     case VOID_FTYPE_PV32QI_V32HI_USI:
10637     case VOID_FTYPE_PV16QI_V16HI_UHI:
10638     case VOID_FTYPE_PV8QI_V8HI_UQI:
10639       nargs = 2;
10640       klass = store;
10641       /* Reserve memory operand for target.  */
10642       memory = ARRAY_SIZE (args);
10643       break;
10644     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10645     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10646     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10647     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10648     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10649     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10650     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10651     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10652     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10653     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10654     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10655     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10656     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10657     case V32HI_FTYPE_PCV32HI_V32HI_USI:
10658     case V32QI_FTYPE_PCV32QI_V32QI_USI:
10659     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10660     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10661     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10662       switch (icode)
10663 	{
10664 	/* These builtins and instructions require the memory
10665 	   to be properly aligned.  */
10666 	case CODE_FOR_avx512f_loadv16sf_mask:
10667 	case CODE_FOR_avx512f_loadv16si_mask:
10668 	case CODE_FOR_avx512f_loadv8df_mask:
10669 	case CODE_FOR_avx512f_loadv8di_mask:
10670 	case CODE_FOR_avx512vl_loadv8sf_mask:
10671 	case CODE_FOR_avx512vl_loadv8si_mask:
10672 	case CODE_FOR_avx512vl_loadv4df_mask:
10673 	case CODE_FOR_avx512vl_loadv4di_mask:
10674 	case CODE_FOR_avx512vl_loadv4sf_mask:
10675 	case CODE_FOR_avx512vl_loadv4si_mask:
10676 	case CODE_FOR_avx512vl_loadv2df_mask:
10677 	case CODE_FOR_avx512vl_loadv2di_mask:
10678 	case CODE_FOR_avx512bw_loadv64qi_mask:
10679 	case CODE_FOR_avx512vl_loadv32qi_mask:
10680 	case CODE_FOR_avx512vl_loadv16qi_mask:
10681 	case CODE_FOR_avx512bw_loadv32hi_mask:
10682 	case CODE_FOR_avx512vl_loadv16hi_mask:
10683 	case CODE_FOR_avx512vl_loadv8hi_mask:
10684 	  aligned_mem = true;
10685 	  break;
10686 	default:
10687 	  break;
10688 	}
10689       /* FALLTHRU */
10690     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10691     case V32QI_FTYPE_PCCHAR_V32QI_USI:
10692     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10693     case V32HI_FTYPE_PCSHORT_V32HI_USI:
10694     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10695     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10696     case V16SI_FTYPE_PCINT_V16SI_UHI:
10697     case V8SI_FTYPE_PCINT_V8SI_UQI:
10698     case V4SI_FTYPE_PCINT_V4SI_UQI:
10699     case V8DI_FTYPE_PCINT64_V8DI_UQI:
10700     case V4DI_FTYPE_PCINT64_V4DI_UQI:
10701     case V2DI_FTYPE_PCINT64_V2DI_UQI:
10702     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10703     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10704     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10705     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10706     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10707     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10708       nargs = 3;
10709       klass = load;
10710       memory = 0;
10711       break;
10712     case VOID_FTYPE_UINT_UINT_UINT:
10713     case VOID_FTYPE_UINT64_UINT_UINT:
10714     case UCHAR_FTYPE_UINT_UINT_UINT:
10715     case UCHAR_FTYPE_UINT64_UINT_UINT:
10716       nargs = 3;
10717       klass = load;
10718       memory = ARRAY_SIZE (args);
10719       last_arg_constant = true;
10720       break;
10721     default:
10722       gcc_unreachable ();
10723     }
10724 
10725   gcc_assert (nargs <= ARRAY_SIZE (args));
10726 
10727   if (klass == store)
10728     {
10729       arg = CALL_EXPR_ARG (exp, 0);
10730       op = expand_normal (arg);
10731       gcc_assert (target == 0);
10732       if (memory)
10733 	{
10734 	  op = ix86_zero_extend_to_Pmode (op);
10735 	  target = gen_rtx_MEM (tmode, op);
10736 	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10737 	     on it.  Try to improve it using get_pointer_alignment,
10738 	     and if the special builtin is one that requires strict
10739 	     mode alignment, also from its GET_MODE_ALIGNMENT.
10740 	     Failure to do so could lead to ix86_legitimate_combined_insn
10741 	     rejecting all changes to such insns.  */
10742 	  unsigned int align = get_pointer_alignment (arg);
10743 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10744 	    align = GET_MODE_ALIGNMENT (tmode);
10745 	  if (MEM_ALIGN (target) < align)
10746 	    set_mem_align (target, align);
10747 	}
10748       else
10749 	target = force_reg (tmode, op);
10750       arg_adjust = 1;
10751     }
10752   else
10753     {
10754       arg_adjust = 0;
10755       if (optimize
10756 	  || target == 0
10757 	  || !register_operand (target, tmode)
10758 	  || GET_MODE (target) != tmode)
10759 	target = gen_reg_rtx (tmode);
10760     }
10761 
10762   for (i = 0; i < nargs; i++)
10763     {
10764       machine_mode mode = insn_p->operand[i + 1].mode;
10765       bool match;
10766 
10767       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10768       op = expand_normal (arg);
10769       match = insn_p->operand[i + 1].predicate (op, mode);
10770 
10771       if (last_arg_constant && (i + 1) == nargs)
10772 	{
10773 	  if (!match)
10774 	    {
10775 	      if (icode == CODE_FOR_lwp_lwpvalsi3
10776 		  || icode == CODE_FOR_lwp_lwpinssi3
10777 		  || icode == CODE_FOR_lwp_lwpvaldi3
10778 		  || icode == CODE_FOR_lwp_lwpinsdi3)
10779 		error ("the last argument must be a 32-bit immediate");
10780 	      else
10781 		error ("the last argument must be an 8-bit immediate");
10782 	      return const0_rtx;
10783 	    }
10784 	}
10785       else
10786 	{
10787 	  if (i == memory)
10788 	    {
10789 	      /* This must be the memory operand.  */
10790 	      op = ix86_zero_extend_to_Pmode (op);
10791 	      op = gen_rtx_MEM (mode, op);
10792 	      /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10793 		 on it.  Try to improve it using get_pointer_alignment,
10794 		 and if the special builtin is one that requires strict
10795 		 mode alignment, also from its GET_MODE_ALIGNMENT.
10796 		 Failure to do so could lead to ix86_legitimate_combined_insn
10797 		 rejecting all changes to such insns.  */
10798 	      unsigned int align = get_pointer_alignment (arg);
10799 	      if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10800 		align = GET_MODE_ALIGNMENT (mode);
10801 	      if (MEM_ALIGN (op) < align)
10802 		set_mem_align (op, align);
10803 	    }
10804 	  else
10805 	    {
10806 	      /* This must be a register.  */
10807 	      if (VECTOR_MODE_P (mode))
10808 		op = safe_vector_operand (op, mode);
10809 
10810 	      op = fixup_modeless_constant (op, mode);
10811 
10812 	      if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10813 		op = copy_to_mode_reg (mode, op);
10814 	      else
10815 	        {
10816 	          op = copy_to_reg (op);
10817 	          op = lowpart_subreg (mode, op, GET_MODE (op));
10818 	        }
10819 	    }
10820 	}
10821 
10822       args[i].op = op;
10823       args[i].mode = mode;
10824     }
10825 
10826   switch (nargs)
10827     {
10828     case 0:
10829       pat = GEN_FCN (icode) (target);
10830       break;
10831     case 1:
10832       pat = GEN_FCN (icode) (target, args[0].op);
10833       break;
10834     case 2:
10835       pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10836       break;
10837     case 3:
10838       pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10839       break;
10840     default:
10841       gcc_unreachable ();
10842     }
10843 
10844   if (! pat)
10845     return 0;
10846   emit_insn (pat);
10847   return klass == store ? 0 : target;
10848 }
10849 
10850 /* Return the integer constant in ARG.  Constrain it to be in the range
10851    of the subparts of VEC_TYPE; issue an error if not.  */
10852 
10853 static int
10854 get_element_number (tree vec_type, tree arg)
10855 {
10856   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10857 
10858   if (!tree_fits_uhwi_p (arg)
10859       || (elt = tree_to_uhwi (arg), elt > max))
10860     {
10861       error ("selector must be an integer constant in the range "
10862 	     "[0, %wi]", max);
10863       return 0;
10864     }
10865 
10866   return elt;
10867 }
10868 
10869 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10870    ix86_expand_vector_init.  We DO have language-level syntax for this, in
10871    the form of  (type){ init-list }.  Except that since we can't place emms
10872    instructions from inside the compiler, we can't allow the use of MMX
10873    registers unless the user explicitly asks for it.  So we do *not* define
10874    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
10875    we have builtins invoked by mmintrin.h that give us license to emit
10876    these sorts of instructions.  */
10877 
10878 static rtx
10879 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10880 {
10881   machine_mode tmode = TYPE_MODE (type);
10882   machine_mode inner_mode = GET_MODE_INNER (tmode);
10883   int i, n_elt = GET_MODE_NUNITS (tmode);
10884   rtvec v = rtvec_alloc (n_elt);
10885 
10886   gcc_assert (VECTOR_MODE_P (tmode));
10887   gcc_assert (call_expr_nargs (exp) == n_elt);
10888 
10889   for (i = 0; i < n_elt; ++i)
10890     {
10891       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10892       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10893     }
10894 
10895   if (!target || !register_operand (target, tmode))
10896     target = gen_reg_rtx (tmode);
10897 
10898   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10899   return target;
10900 }
10901 
10902 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10903    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
10904    had a language-level syntax for referencing vector elements.  */
10905 
10906 static rtx
10907 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10908 {
10909   machine_mode tmode, mode0;
10910   tree arg0, arg1;
10911   int elt;
10912   rtx op0;
10913 
10914   arg0 = CALL_EXPR_ARG (exp, 0);
10915   arg1 = CALL_EXPR_ARG (exp, 1);
10916 
10917   op0 = expand_normal (arg0);
10918   elt = get_element_number (TREE_TYPE (arg0), arg1);
10919 
10920   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10921   mode0 = TYPE_MODE (TREE_TYPE (arg0));
10922   gcc_assert (VECTOR_MODE_P (mode0));
10923 
10924   op0 = force_reg (mode0, op0);
10925 
10926   if (optimize || !target || !register_operand (target, tmode))
10927     target = gen_reg_rtx (tmode);
10928 
10929   ix86_expand_vector_extract (true, target, op0, elt);
10930 
10931   return target;
10932 }
10933 
10934 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
10935    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
10936    a language-level syntax for referencing vector elements.  */
10937 
10938 static rtx
10939 ix86_expand_vec_set_builtin (tree exp)
10940 {
10941   machine_mode tmode, mode1;
10942   tree arg0, arg1, arg2;
10943   int elt;
10944   rtx op0, op1, target;
10945 
10946   arg0 = CALL_EXPR_ARG (exp, 0);
10947   arg1 = CALL_EXPR_ARG (exp, 1);
10948   arg2 = CALL_EXPR_ARG (exp, 2);
10949 
10950   tmode = TYPE_MODE (TREE_TYPE (arg0));
10951   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10952   gcc_assert (VECTOR_MODE_P (tmode));
10953 
10954   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10955   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10956   elt = get_element_number (TREE_TYPE (arg0), arg2);
10957 
10958   if (GET_MODE (op1) != mode1)
10959     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10960 
10961   op0 = force_reg (tmode, op0);
10962   op1 = force_reg (mode1, op1);
10963 
10964   /* OP0 is the source of these builtin functions and shouldn't be
10965      modified.  Create a copy, use it and return it as target.  */
10966   target = gen_reg_rtx (tmode);
10967   emit_move_insn (target, op0);
10968   ix86_expand_vector_set (true, target, op1, elt);
10969 
10970   return target;
10971 }
10972 
10973 /* Expand an expression EXP that calls a built-in function,
10974    with result going to TARGET if that's convenient
10975    (and in mode MODE if that's convenient).
10976    SUBTARGET may be used as the target for computing one of EXP's operands.
10977    IGNORE is nonzero if the value is to be ignored.  */
10978 
10979 rtx
10980 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10981 		     machine_mode mode, int ignore)
10982 {
10983   size_t i;
10984   enum insn_code icode, icode2;
10985   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10986   tree arg0, arg1, arg2, arg3, arg4;
10987   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10988   machine_mode mode0, mode1, mode2, mode3, mode4;
10989   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
10990 
10991   /* For CPU builtins that can be folded, fold first and expand the fold.  */
10992   switch (fcode)
10993     {
10994     case IX86_BUILTIN_CPU_INIT:
10995       {
10996 	/* Make it call __cpu_indicator_init in libgcc. */
10997 	tree call_expr, fndecl, type;
10998         type = build_function_type_list (integer_type_node, NULL_TREE);
10999 	fndecl = build_fn_decl ("__cpu_indicator_init", type);
11000 	call_expr = build_call_expr (fndecl, 0);
11001 	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11002       }
11003     case IX86_BUILTIN_CPU_IS:
11004     case IX86_BUILTIN_CPU_SUPPORTS:
11005       {
11006 	tree arg0 = CALL_EXPR_ARG (exp, 0);
11007 	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11008 	gcc_assert (fold_expr != NULL_TREE);
11009 	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11010       }
11011     }
11012 
11013   HOST_WIDE_INT isa = ix86_isa_flags;
11014   HOST_WIDE_INT isa2 = ix86_isa_flags2;
11015   HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11016   HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11017   /* The general case is we require all the ISAs specified in bisa{,2}
11018      to be enabled.
11019      The exceptions are:
11020      OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11021      OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11022      OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11023      where for each such pair it is sufficient if either of the ISAs is
11024      enabled; any other options ORed in with the pair must be enabled too.
11025      OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
11026   if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11027        == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11028       && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11029     isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11030   if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11031        == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11032       && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11033     isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11034   if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11035        == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11036       && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11037     isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11038   if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
11039     {
11040       bisa &= ~OPTION_MASK_ISA_MMX;
11041       bisa |= OPTION_MASK_ISA_SSE2;
11042     }
11043   if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11044     {
11045       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11046       if (TARGET_ABI_X32)
11047 	bisa |= OPTION_MASK_ABI_X32;
11048       else
11049 	bisa |= OPTION_MASK_ABI_64;
11050       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11051 				       (enum fpmath_unit) 0,
11052 				       (enum prefer_vector_width) 0,
11053 				       false, add_abi_p);
11054       if (!opts)
11055 	error ("%qE needs unknown isa option", fndecl);
11056       else
11057 	{
11058 	  gcc_assert (opts != NULL);
11059 	  error ("%qE needs isa option %s", fndecl, opts);
11060 	  free (opts);
11061 	}
11062       return expand_call (exp, target, ignore);
11063     }
11064 
11065   switch (fcode)
11066     {
11067     case IX86_BUILTIN_MASKMOVQ:
11068     case IX86_BUILTIN_MASKMOVDQU:
11069       icode = (fcode == IX86_BUILTIN_MASKMOVQ
11070 	       ? CODE_FOR_mmx_maskmovq
11071 	       : CODE_FOR_sse2_maskmovdqu);
11072       /* Note the arg order is different from the operand order.  */
11073       arg1 = CALL_EXPR_ARG (exp, 0);
11074       arg2 = CALL_EXPR_ARG (exp, 1);
11075       arg0 = CALL_EXPR_ARG (exp, 2);
11076       op0 = expand_normal (arg0);
11077       op1 = expand_normal (arg1);
11078       op2 = expand_normal (arg2);
11079       mode0 = insn_data[icode].operand[0].mode;
11080       mode1 = insn_data[icode].operand[1].mode;
11081       mode2 = insn_data[icode].operand[2].mode;
11082 
11083       op0 = ix86_zero_extend_to_Pmode (op0);
11084       op0 = gen_rtx_MEM (mode1, op0);
11085 
11086       if (!insn_data[icode].operand[0].predicate (op0, mode0))
11087 	op0 = copy_to_mode_reg (mode0, op0);
11088       if (!insn_data[icode].operand[1].predicate (op1, mode1))
11089 	op1 = copy_to_mode_reg (mode1, op1);
11090       if (!insn_data[icode].operand[2].predicate (op2, mode2))
11091 	op2 = copy_to_mode_reg (mode2, op2);
11092       pat = GEN_FCN (icode) (op0, op1, op2);
11093       if (! pat)
11094 	return 0;
11095       emit_insn (pat);
11096       return 0;
11097 
11098     case IX86_BUILTIN_LDMXCSR:
11099       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11100       target = assign_386_stack_local (SImode, SLOT_TEMP);
11101       emit_move_insn (target, op0);
11102       emit_insn (gen_sse_ldmxcsr (target));
11103       return 0;
11104 
11105     case IX86_BUILTIN_STMXCSR:
11106       target = assign_386_stack_local (SImode, SLOT_TEMP);
11107       emit_insn (gen_sse_stmxcsr (target));
11108       return copy_to_mode_reg (SImode, target);
11109 
11110     case IX86_BUILTIN_CLFLUSH:
11111 	arg0 = CALL_EXPR_ARG (exp, 0);
11112 	op0 = expand_normal (arg0);
11113 	icode = CODE_FOR_sse2_clflush;
11114 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11115 	  op0 = ix86_zero_extend_to_Pmode (op0);
11116 
11117 	emit_insn (gen_sse2_clflush (op0));
11118 	return 0;
11119 
11120     case IX86_BUILTIN_CLWB:
11121 	arg0 = CALL_EXPR_ARG (exp, 0);
11122 	op0 = expand_normal (arg0);
11123 	icode = CODE_FOR_clwb;
11124 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11125 	  op0 = ix86_zero_extend_to_Pmode (op0);
11126 
11127 	emit_insn (gen_clwb (op0));
11128 	return 0;
11129 
11130     case IX86_BUILTIN_CLFLUSHOPT:
11131 	arg0 = CALL_EXPR_ARG (exp, 0);
11132 	op0 = expand_normal (arg0);
11133 	icode = CODE_FOR_clflushopt;
11134 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11135 	  op0 = ix86_zero_extend_to_Pmode (op0);
11136 
11137 	emit_insn (gen_clflushopt (op0));
11138 	return 0;
11139 
11140     case IX86_BUILTIN_MONITOR:
11141     case IX86_BUILTIN_MONITORX:
11142       arg0 = CALL_EXPR_ARG (exp, 0);
11143       arg1 = CALL_EXPR_ARG (exp, 1);
11144       arg2 = CALL_EXPR_ARG (exp, 2);
11145       op0 = expand_normal (arg0);
11146       op1 = expand_normal (arg1);
11147       op2 = expand_normal (arg2);
11148       if (!REG_P (op0))
11149 	op0 = ix86_zero_extend_to_Pmode (op0);
11150       if (!REG_P (op1))
11151 	op1 = copy_to_mode_reg (SImode, op1);
11152       if (!REG_P (op2))
11153 	op2 = copy_to_mode_reg (SImode, op2);
11154 
11155       emit_insn (fcode == IX86_BUILTIN_MONITOR
11156 		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11157 		 : gen_monitorx (Pmode, op0, op1, op2));
11158       return 0;
11159 
11160     case IX86_BUILTIN_MWAIT:
11161       arg0 = CALL_EXPR_ARG (exp, 0);
11162       arg1 = CALL_EXPR_ARG (exp, 1);
11163       op0 = expand_normal (arg0);
11164       op1 = expand_normal (arg1);
11165       if (!REG_P (op0))
11166 	op0 = copy_to_mode_reg (SImode, op0);
11167       if (!REG_P (op1))
11168 	op1 = copy_to_mode_reg (SImode, op1);
11169       emit_insn (gen_sse3_mwait (op0, op1));
11170       return 0;
11171 
11172     case IX86_BUILTIN_MWAITX:
11173       arg0 = CALL_EXPR_ARG (exp, 0);
11174       arg1 = CALL_EXPR_ARG (exp, 1);
11175       arg2 = CALL_EXPR_ARG (exp, 2);
11176       op0 = expand_normal (arg0);
11177       op1 = expand_normal (arg1);
11178       op2 = expand_normal (arg2);
11179       if (!REG_P (op0))
11180 	op0 = copy_to_mode_reg (SImode, op0);
11181       if (!REG_P (op1))
11182 	op1 = copy_to_mode_reg (SImode, op1);
11183       if (!REG_P (op2))
11184 	op2 = copy_to_mode_reg (SImode, op2);
11185       emit_insn (gen_mwaitx (op0, op1, op2));
11186       return 0;
11187 
11188     case IX86_BUILTIN_UMONITOR:
11189       arg0 = CALL_EXPR_ARG (exp, 0);
11190       op0 = expand_normal (arg0);
11191 
11192       op0 = ix86_zero_extend_to_Pmode (op0);
11193       emit_insn (gen_umonitor (Pmode, op0));
11194       return 0;
11195 
11196     case IX86_BUILTIN_UMWAIT:
11197     case IX86_BUILTIN_TPAUSE:
11198       arg0 = CALL_EXPR_ARG (exp, 0);
11199       arg1 = CALL_EXPR_ARG (exp, 1);
11200       op0 = expand_normal (arg0);
11201       op1 = expand_normal (arg1);
11202 
11203       if (!REG_P (op0))
11204 	op0 = copy_to_mode_reg (SImode, op0);
11205 
11206       op1 = force_reg (DImode, op1);
11207 
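      /* In 64-bit mode the 64-bit second argument is split into two 32-bit
	 halves for the umwait/tpause rex64 patterns.  */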
11208       if (TARGET_64BIT)
11209 	{
11210 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11211 				     NULL, 1, OPTAB_DIRECT);
11212 	  switch (fcode)
11213 	    {
11214 	    case IX86_BUILTIN_UMWAIT:
11215 	      icode = CODE_FOR_umwait_rex64;
11216 	      break;
11217 	    case IX86_BUILTIN_TPAUSE:
11218 	      icode = CODE_FOR_tpause_rex64;
11219 	      break;
11220 	    default:
11221 	      gcc_unreachable ();
11222 	    }
11223 
11224 	  op2 = gen_lowpart (SImode, op2);
11225 	  op1 = gen_lowpart (SImode, op1);
11226 	  pat = GEN_FCN (icode) (op0, op1, op2);
11227 	}
11228       else
11229 	{
11230 	  switch (fcode)
11231 	    {
11232 	    case IX86_BUILTIN_UMWAIT:
11233 	      icode = CODE_FOR_umwait;
11234 	      break;
11235 	    case IX86_BUILTIN_TPAUSE:
11236 	      icode = CODE_FOR_tpause;
11237 	      break;
11238 	    default:
11239 	      gcc_unreachable ();
11240 	    }
11241 	  pat = GEN_FCN (icode) (op0, op1);
11242 	}
11243 
11244       if (!pat)
11245 	return 0;
11246 
11247       emit_insn (pat);
11248 
11249       if (target == 0
11250 	  || !register_operand (target, QImode))
11251 	target = gen_reg_rtx (QImode);
11252 
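      /* The status of umwait/tpause is reported in the carry flag; read it
	 back through CCCmode and return it as a QImode value.  */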
11253       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11254 			const0_rtx);
11255       emit_insn (gen_rtx_SET (target, pat));
11256 
11257       return target;
11258 
11259     case IX86_BUILTIN_CLZERO:
11260       arg0 = CALL_EXPR_ARG (exp, 0);
11261       op0 = expand_normal (arg0);
11262       if (!REG_P (op0))
11263 	op0 = ix86_zero_extend_to_Pmode (op0);
11264       emit_insn (gen_clzero (Pmode, op0));
11265       return 0;
11266 
11267     case IX86_BUILTIN_CLDEMOTE:
11268       arg0 = CALL_EXPR_ARG (exp, 0);
11269       op0 = expand_normal (arg0);
11270       icode = CODE_FOR_cldemote;
11271       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11272 	op0 = ix86_zero_extend_to_Pmode (op0);
11273 
11274       emit_insn (gen_cldemote (op0));
11275       return 0;
11276 
11277     case IX86_BUILTIN_VEC_INIT_V2SI:
11278     case IX86_BUILTIN_VEC_INIT_V4HI:
11279     case IX86_BUILTIN_VEC_INIT_V8QI:
11280       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11281 
11282     case IX86_BUILTIN_VEC_EXT_V2DF:
11283     case IX86_BUILTIN_VEC_EXT_V2DI:
11284     case IX86_BUILTIN_VEC_EXT_V4SF:
11285     case IX86_BUILTIN_VEC_EXT_V4SI:
11286     case IX86_BUILTIN_VEC_EXT_V8HI:
11287     case IX86_BUILTIN_VEC_EXT_V2SI:
11288     case IX86_BUILTIN_VEC_EXT_V4HI:
11289     case IX86_BUILTIN_VEC_EXT_V16QI:
11290       return ix86_expand_vec_ext_builtin (exp, target);
11291 
11292     case IX86_BUILTIN_VEC_SET_V2DI:
11293     case IX86_BUILTIN_VEC_SET_V4SF:
11294     case IX86_BUILTIN_VEC_SET_V4SI:
11295     case IX86_BUILTIN_VEC_SET_V8HI:
11296     case IX86_BUILTIN_VEC_SET_V4HI:
11297     case IX86_BUILTIN_VEC_SET_V16QI:
11298       return ix86_expand_vec_set_builtin (exp);
11299 
11300     case IX86_BUILTIN_NANQ:
11301     case IX86_BUILTIN_NANSQ:
11302       return expand_call (exp, target, ignore);
11303 
11304     case IX86_BUILTIN_RDPID:
11305 
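      /* RDPID produces a word_mode value; in 64-bit mode truncate it to
	 SImode, since the builtin returns an unsigned int.  */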
11306       op0 = gen_reg_rtx (word_mode);
11307 
11308       if (TARGET_64BIT)
11309 	{
11310 	  insn = gen_rdpid_rex64 (op0);
11311 	  op0 = convert_to_mode (SImode, op0, 1);
11312 	}
11313       else
11314 	insn = gen_rdpid (op0);
11315 
11316       emit_insn (insn);
11317 
11318       if (target == 0
11319 	  || !register_operand (target, SImode))
11320 	target = gen_reg_rtx (SImode);
11321 
11322       emit_move_insn (target, op0);
11323       return target;
11324 
11325     case IX86_BUILTIN_2INTERSECTD512:
11326     case IX86_BUILTIN_2INTERSECTQ512:
11327     case IX86_BUILTIN_2INTERSECTD256:
11328     case IX86_BUILTIN_2INTERSECTQ256:
11329     case IX86_BUILTIN_2INTERSECTD128:
11330     case IX86_BUILTIN_2INTERSECTQ128:
11331       arg0 = CALL_EXPR_ARG (exp, 0);
11332       arg1 = CALL_EXPR_ARG (exp, 1);
11333       arg2 = CALL_EXPR_ARG (exp, 2);
11334       arg3 = CALL_EXPR_ARG (exp, 3);
11335       op0 = expand_normal (arg0);
11336       op1 = expand_normal (arg1);
11337       op2 = expand_normal (arg2);
11338       op3 = expand_normal (arg3);
11339 
11340       if (!address_operand (op0, VOIDmode))
11341 	{
11342 	  op0 = convert_memory_address (Pmode, op0);
11343 	  op0 = copy_addr_to_reg (op0);
11344 	}
11345       if (!address_operand (op1, VOIDmode))
11346 	{
11347 	  op1 = convert_memory_address (Pmode, op1);
11348 	  op1 = copy_addr_to_reg (op1);
11349 	}
11350 
11351       switch (fcode)
11352 	{
11353 	case IX86_BUILTIN_2INTERSECTD512:
11354 	  mode4 = P2HImode;
11355 	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11356 	  break;
11357 	case IX86_BUILTIN_2INTERSECTQ512:
11358 	  mode4 = P2QImode;
11359 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11360 	  break;
11361 	case IX86_BUILTIN_2INTERSECTD256:
11362 	  mode4 = P2QImode;
11363 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11364 	  break;
11365 	case IX86_BUILTIN_2INTERSECTQ256:
11366 	  mode4 = P2QImode;
11367 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11368 	  break;
11369 	case IX86_BUILTIN_2INTERSECTD128:
11370 	  mode4 = P2QImode;
11371 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11372 	  break;
11373 	case IX86_BUILTIN_2INTERSECTQ128:
11374 	  mode4 = P2QImode;
11375 	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11376 	  break;
11377 	default:
11378 	  gcc_unreachable ();
11379 	}
11380 
11381       mode2 = insn_data[icode].operand[1].mode;
11382       mode3 = insn_data[icode].operand[2].mode;
11383       if (!insn_data[icode].operand[1].predicate (op2, mode2))
11384 	op2 = copy_to_mode_reg (mode2, op2);
11385       if (!insn_data[icode].operand[2].predicate (op3, mode3))
11386 	op3 = copy_to_mode_reg (mode3, op3);
11387 
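      /* The 2intersect patterns produce a pair of mask registers (P2QImode
	 or P2HImode); store the low and high halves through the two
	 pointer arguments.  */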
11388       op4 = gen_reg_rtx (mode4);
11389       emit_insn (GEN_FCN (icode) (op4, op2, op3));
11390       mode0 = mode4 == P2HImode ? HImode : QImode;
11391       emit_move_insn (gen_rtx_MEM (mode0, op0),
11392 		      gen_lowpart (mode0, op4));
11393       emit_move_insn (gen_rtx_MEM (mode0, op1),
11394 		      gen_highpart (mode0, op4));
11395 
11396       return 0;
11397 
11398     case IX86_BUILTIN_RDPMC:
11399     case IX86_BUILTIN_RDTSC:
11400     case IX86_BUILTIN_RDTSCP:
11401     case IX86_BUILTIN_XGETBV:
11402 
11403       op0 = gen_reg_rtx (DImode);
11404       op1 = gen_reg_rtx (DImode);
11405 
11406       if (fcode == IX86_BUILTIN_RDPMC)
11407 	{
11408 	  arg0 = CALL_EXPR_ARG (exp, 0);
11409 	  op2 = expand_normal (arg0);
11410 	  if (!register_operand (op2, SImode))
11411 	    op2 = copy_to_mode_reg (SImode, op2);
11412 
11413 	  insn = (TARGET_64BIT
11414 		  ? gen_rdpmc_rex64 (op0, op1, op2)
11415 		  : gen_rdpmc (op0, op2));
11416 	  emit_insn (insn);
11417 	}
11418       else if (fcode == IX86_BUILTIN_XGETBV)
11419 	{
11420 	  arg0 = CALL_EXPR_ARG (exp, 0);
11421 	  op2 = expand_normal (arg0);
11422 	  if (!register_operand (op2, SImode))
11423 	    op2 = copy_to_mode_reg (SImode, op2);
11424 
11425 	  insn = (TARGET_64BIT
11426 		  ? gen_xgetbv_rex64 (op0, op1, op2)
11427 		  : gen_xgetbv (op0, op2));
11428 	  emit_insn (insn);
11429 	}
11430       else if (fcode == IX86_BUILTIN_RDTSC)
11431 	{
11432 	  insn = (TARGET_64BIT
11433 		  ? gen_rdtsc_rex64 (op0, op1)
11434 		  : gen_rdtsc (op0));
11435 	  emit_insn (insn);
11436 	}
11437       else
11438 	{
11439 	  op2 = gen_reg_rtx (SImode);
11440 
11441 	  insn = (TARGET_64BIT
11442 		  ? gen_rdtscp_rex64 (op0, op1, op2)
11443 		  : gen_rdtscp (op0, op2));
11444 	  emit_insn (insn);
11445 
11446 	  arg0 = CALL_EXPR_ARG (exp, 0);
11447 	  op4 = expand_normal (arg0);
11448 	  if (!address_operand (op4, VOIDmode))
11449 	    {
11450 	      op4 = convert_memory_address (Pmode, op4);
11451 	      op4 = copy_addr_to_reg (op4);
11452 	    }
11453 	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11454 	}
11455 
11456       if (target == 0
11457 	  || !register_operand (target, DImode))
11458         target = gen_reg_rtx (DImode);
11459 
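      /* In 64-bit mode the result comes back as two 32-bit halves in OP0
	 (low) and OP1 (high); combine them into a single DImode value.  */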
11460       if (TARGET_64BIT)
11461 	{
11462 	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11463 				     op1, 1, OPTAB_DIRECT);
11464 	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
11465 				     op0, 1, OPTAB_DIRECT);
11466 	}
11467 
11468       emit_move_insn (target, op0);
11469       return target;
11470 
11471     case IX86_BUILTIN_ENQCMD:
11472     case IX86_BUILTIN_ENQCMDS:
11473     case IX86_BUILTIN_MOVDIR64B:
11474 
11475       arg0 = CALL_EXPR_ARG (exp, 0);
11476       arg1 = CALL_EXPR_ARG (exp, 1);
11477       op0 = expand_normal (arg0);
11478       op1 = expand_normal (arg1);
11479 
11480       op0 = ix86_zero_extend_to_Pmode (op0);
11481       if (!address_operand (op1, VOIDmode))
11482 	{
11483 	  op1 = convert_memory_address (Pmode, op1);
11484 	  op1 = copy_addr_to_reg (op1);
11485 	}
11486       op1 = gen_rtx_MEM (XImode, op1);
11487 
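      /* MOVDIR64B has no result.  For ENQCMD/ENQCMDS the zero flag set
         by the enqueue pattern is copied into a QImode 0/1 value and
         returned.  */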
11488       if (fcode == IX86_BUILTIN_MOVDIR64B)
11489 	{
11490 	  emit_insn (gen_movdir64b (Pmode, op0, op1));
11491 	  return 0;
11492 	}
11493       else
11494 	{
11495 	  rtx pat;
11496 
11497 	  target = gen_reg_rtx (SImode);
11498 	  emit_move_insn (target, const0_rtx);
11499 	  target = gen_rtx_SUBREG (QImode, target, 0);
11500 
11501 	  if (fcode == IX86_BUILTIN_ENQCMD)
11502 	    pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11503 	  else
11504 	    pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11505 
11506 	  emit_insn (pat);
11507 
11508 	  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11509 				  gen_rtx_fmt_ee (EQ, QImode,
11510 						  SET_DEST (pat),
11511 						  const0_rtx)));
11512 
11513 	  return SUBREG_REG (target);
11514 	}
11515 
11516     case IX86_BUILTIN_FXSAVE:
11517     case IX86_BUILTIN_FXRSTOR:
11518     case IX86_BUILTIN_FXSAVE64:
11519     case IX86_BUILTIN_FXRSTOR64:
11520     case IX86_BUILTIN_FNSTENV:
11521     case IX86_BUILTIN_FLDENV:
11522       mode0 = BLKmode;
11523       switch (fcode)
11524 	{
11525 	case IX86_BUILTIN_FXSAVE:
11526 	  icode = CODE_FOR_fxsave;
11527 	  break;
11528 	case IX86_BUILTIN_FXRSTOR:
11529 	  icode = CODE_FOR_fxrstor;
11530 	  break;
11531 	case IX86_BUILTIN_FXSAVE64:
11532 	  icode = CODE_FOR_fxsave64;
11533 	  break;
11534 	case IX86_BUILTIN_FXRSTOR64:
11535 	  icode = CODE_FOR_fxrstor64;
11536 	  break;
11537 	case IX86_BUILTIN_FNSTENV:
11538 	  icode = CODE_FOR_fnstenv;
11539 	  break;
11540 	case IX86_BUILTIN_FLDENV:
11541 	  icode = CODE_FOR_fldenv;
11542 	  break;
11543 	default:
11544 	  gcc_unreachable ();
11545 	}
11546 
11547       arg0 = CALL_EXPR_ARG (exp, 0);
11548       op0 = expand_normal (arg0);
11549 
11550       if (!address_operand (op0, VOIDmode))
11551 	{
11552 	  op0 = convert_memory_address (Pmode, op0);
11553 	  op0 = copy_addr_to_reg (op0);
11554 	}
11555       op0 = gen_rtx_MEM (mode0, op0);
11556 
11557       pat = GEN_FCN (icode) (op0);
11558       if (pat)
11559 	emit_insn (pat);
11560       return 0;
11561 
11562     case IX86_BUILTIN_XSETBV:
11563       arg0 = CALL_EXPR_ARG (exp, 0);
11564       arg1 = CALL_EXPR_ARG (exp, 1);
11565       op0 = expand_normal (arg0);
11566       op1 = expand_normal (arg1);
11567 
11568       if (!REG_P (op0))
11569 	op0 = copy_to_mode_reg (SImode, op0);
11570 
11571       op1 = force_reg (DImode, op1);
11572 
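      /* The 64-bit xsetbv pattern takes the new XCR value split into two
         SImode halves (EDX:EAX); the 32-bit pattern takes the whole
         DImode value.  The XSAVE family below does the same split.  */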
11573       if (TARGET_64BIT)
11574 	{
11575 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11576 				     NULL, 1, OPTAB_DIRECT);
11577 
11578 	  icode = CODE_FOR_xsetbv_rex64;
11579 
11580 	  op2 = gen_lowpart (SImode, op2);
11581 	  op1 = gen_lowpart (SImode, op1);
11582 	  pat = GEN_FCN (icode) (op0, op1, op2);
11583 	}
11584       else
11585 	{
11586 	  icode = CODE_FOR_xsetbv;
11587 
11588 	  pat = GEN_FCN (icode) (op0, op1);
11589 	}
11590       if (pat)
11591 	emit_insn (pat);
11592       return 0;
11593 
11594     case IX86_BUILTIN_XSAVE:
11595     case IX86_BUILTIN_XRSTOR:
11596     case IX86_BUILTIN_XSAVE64:
11597     case IX86_BUILTIN_XRSTOR64:
11598     case IX86_BUILTIN_XSAVEOPT:
11599     case IX86_BUILTIN_XSAVEOPT64:
11600     case IX86_BUILTIN_XSAVES:
11601     case IX86_BUILTIN_XRSTORS:
11602     case IX86_BUILTIN_XSAVES64:
11603     case IX86_BUILTIN_XRSTORS64:
11604     case IX86_BUILTIN_XSAVEC:
11605     case IX86_BUILTIN_XSAVEC64:
11606       arg0 = CALL_EXPR_ARG (exp, 0);
11607       arg1 = CALL_EXPR_ARG (exp, 1);
11608       op0 = expand_normal (arg0);
11609       op1 = expand_normal (arg1);
11610 
11611       if (!address_operand (op0, VOIDmode))
11612 	{
11613 	  op0 = convert_memory_address (Pmode, op0);
11614 	  op0 = copy_addr_to_reg (op0);
11615 	}
11616       op0 = gen_rtx_MEM (BLKmode, op0);
11617 
11618       op1 = force_reg (DImode, op1);
11619 
11620       if (TARGET_64BIT)
11621 	{
11622 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11623 				     NULL, 1, OPTAB_DIRECT);
11624 	  switch (fcode)
11625 	    {
11626 	    case IX86_BUILTIN_XSAVE:
11627 	      icode = CODE_FOR_xsave_rex64;
11628 	      break;
11629 	    case IX86_BUILTIN_XRSTOR:
11630 	      icode = CODE_FOR_xrstor_rex64;
11631 	      break;
11632 	    case IX86_BUILTIN_XSAVE64:
11633 	      icode = CODE_FOR_xsave64;
11634 	      break;
11635 	    case IX86_BUILTIN_XRSTOR64:
11636 	      icode = CODE_FOR_xrstor64;
11637 	      break;
11638 	    case IX86_BUILTIN_XSAVEOPT:
11639 	      icode = CODE_FOR_xsaveopt_rex64;
11640 	      break;
11641 	    case IX86_BUILTIN_XSAVEOPT64:
11642 	      icode = CODE_FOR_xsaveopt64;
11643 	      break;
11644 	    case IX86_BUILTIN_XSAVES:
11645 	      icode = CODE_FOR_xsaves_rex64;
11646 	      break;
11647 	    case IX86_BUILTIN_XRSTORS:
11648 	      icode = CODE_FOR_xrstors_rex64;
11649 	      break;
11650 	    case IX86_BUILTIN_XSAVES64:
11651 	      icode = CODE_FOR_xsaves64;
11652 	      break;
11653 	    case IX86_BUILTIN_XRSTORS64:
11654 	      icode = CODE_FOR_xrstors64;
11655 	      break;
11656 	    case IX86_BUILTIN_XSAVEC:
11657 	      icode = CODE_FOR_xsavec_rex64;
11658 	      break;
11659 	    case IX86_BUILTIN_XSAVEC64:
11660 	      icode = CODE_FOR_xsavec64;
11661 	      break;
11662 	    default:
11663 	      gcc_unreachable ();
11664 	    }
11665 
11666 	  op2 = gen_lowpart (SImode, op2);
11667 	  op1 = gen_lowpart (SImode, op1);
11668 	  pat = GEN_FCN (icode) (op0, op1, op2);
11669 	}
11670       else
11671 	{
11672 	  switch (fcode)
11673 	    {
11674 	    case IX86_BUILTIN_XSAVE:
11675 	      icode = CODE_FOR_xsave;
11676 	      break;
11677 	    case IX86_BUILTIN_XRSTOR:
11678 	      icode = CODE_FOR_xrstor;
11679 	      break;
11680 	    case IX86_BUILTIN_XSAVEOPT:
11681 	      icode = CODE_FOR_xsaveopt;
11682 	      break;
11683 	    case IX86_BUILTIN_XSAVES:
11684 	      icode = CODE_FOR_xsaves;
11685 	      break;
11686 	    case IX86_BUILTIN_XRSTORS:
11687 	      icode = CODE_FOR_xrstors;
11688 	      break;
11689 	    case IX86_BUILTIN_XSAVEC:
11690 	      icode = CODE_FOR_xsavec;
11691 	      break;
11692 	    default:
11693 	      gcc_unreachable ();
11694 	    }
11695 	  pat = GEN_FCN (icode) (op0, op1);
11696 	}
11697 
11698       if (pat)
11699 	emit_insn (pat);
11700       return 0;
11701 
11702     case IX86_BUILTIN_LLWPCB:
11703       arg0 = CALL_EXPR_ARG (exp, 0);
11704       op0 = expand_normal (arg0);
11705       icode = CODE_FOR_lwp_llwpcb;
11706       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11707 	op0 = ix86_zero_extend_to_Pmode (op0);
11708       emit_insn (gen_lwp_llwpcb (op0));
11709       return 0;
11710 
11711     case IX86_BUILTIN_SLWPCB:
11712       icode = CODE_FOR_lwp_slwpcb;
11713       if (!target
11714 	  || !insn_data[icode].operand[0].predicate (target, Pmode))
11715 	target = gen_reg_rtx (Pmode);
11716       emit_insn (gen_lwp_slwpcb (target));
11717       return target;
11718 
11719     case IX86_BUILTIN_BEXTRI32:
11720     case IX86_BUILTIN_BEXTRI64:
11721       arg0 = CALL_EXPR_ARG (exp, 0);
11722       arg1 = CALL_EXPR_ARG (exp, 1);
11723       op0 = expand_normal (arg0);
11724       op1 = expand_normal (arg1);
11725       icode = (fcode == IX86_BUILTIN_BEXTRI32
11726 	  ? CODE_FOR_tbm_bextri_si
11727 	  : CODE_FOR_tbm_bextri_di);
11728       if (!CONST_INT_P (op1))
11729         {
11730           error ("last argument must be an immediate");
11731           return const0_rtx;
11732         }
11733       else
11734         {
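          /* The immediate packs the field length in bits 15:8 and the
             starting bit index in bits 7:0; split it into the two
             operands the bextri pattern expects.  */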
11735           unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11736           unsigned char lsb_index = INTVAL (op1) & 0xFF;
11737           op1 = GEN_INT (length);
11738           op2 = GEN_INT (lsb_index);
11739 
11740 	  mode1 = insn_data[icode].operand[1].mode;
11741 	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
11742 	    op0 = copy_to_mode_reg (mode1, op0);
11743 
11744 	  mode0 = insn_data[icode].operand[0].mode;
11745 	  if (target == 0
11746 	      || !register_operand (target, mode0))
11747 	    target = gen_reg_rtx (mode0);
11748 
11749           pat = GEN_FCN (icode) (target, op0, op1, op2);
11750           if (pat)
11751             emit_insn (pat);
11752           return target;
11753         }
11754 
11755     case IX86_BUILTIN_RDRAND16_STEP:
11756       icode = CODE_FOR_rdrandhi_1;
11757       mode0 = HImode;
11758       goto rdrand_step;
11759 
11760     case IX86_BUILTIN_RDRAND32_STEP:
11761       icode = CODE_FOR_rdrandsi_1;
11762       mode0 = SImode;
11763       goto rdrand_step;
11764 
11765     case IX86_BUILTIN_RDRAND64_STEP:
11766       icode = CODE_FOR_rdranddi_1;
11767       mode0 = DImode;
11768 
11769 rdrand_step:
11770       arg0 = CALL_EXPR_ARG (exp, 0);
11771       op1 = expand_normal (arg0);
11772       if (!address_operand (op1, VOIDmode))
11773 	{
11774 	  op1 = convert_memory_address (Pmode, op1);
11775 	  op1 = copy_addr_to_reg (op1);
11776 	}
11777 
11778       op0 = gen_reg_rtx (mode0);
11779       emit_insn (GEN_FCN (icode) (op0));
11780 
11781       emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11782 
11783       op1 = gen_reg_rtx (SImode);
11784       emit_move_insn (op1, CONST1_RTX (SImode));
11785 
11786       /* Emit SImode conditional move.  */
11787       if (mode0 == HImode)
11788 	{
11789 	  if (TARGET_ZERO_EXTEND_WITH_AND
11790 	      && optimize_function_for_speed_p (cfun))
11791 	    {
11792 	      op2 = force_reg (SImode, const0_rtx);
11793 
11794 	      emit_insn (gen_movstricthi
11795 			 (gen_lowpart (HImode, op2), op0));
11796 	    }
11797 	  else
11798 	    {
11799 	      op2 = gen_reg_rtx (SImode);
11800 
11801 	      emit_insn (gen_zero_extendhisi2 (op2, op0));
11802 	    }
11803 	}
11804       else if (mode0 == SImode)
11805 	op2 = op0;
11806       else
11807 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
11808 
11809       if (target == 0
11810 	  || !register_operand (target, SImode))
11811 	target = gen_reg_rtx (SImode);
11812 
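      /* RDRAND sets the carry flag on success; select 1 in that case,
         otherwise the value just stored, which the instruction zeroes
         on failure, so the builtin returns the usual 1/0 step value.  */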
11813       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11814 			 const0_rtx);
11815       emit_insn (gen_rtx_SET (target,
11816 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11817       return target;
11818 
11819     case IX86_BUILTIN_RDSEED16_STEP:
11820       icode = CODE_FOR_rdseedhi_1;
11821       mode0 = HImode;
11822       goto rdseed_step;
11823 
11824     case IX86_BUILTIN_RDSEED32_STEP:
11825       icode = CODE_FOR_rdseedsi_1;
11826       mode0 = SImode;
11827       goto rdseed_step;
11828 
11829     case IX86_BUILTIN_RDSEED64_STEP:
11830       icode = CODE_FOR_rdseeddi_1;
11831       mode0 = DImode;
11832 
11833 rdseed_step:
11834       arg0 = CALL_EXPR_ARG (exp, 0);
11835       op1 = expand_normal (arg0);
11836       if (!address_operand (op1, VOIDmode))
11837 	{
11838 	  op1 = convert_memory_address (Pmode, op1);
11839 	  op1 = copy_addr_to_reg (op1);
11840 	}
11841 
11842       op0 = gen_reg_rtx (mode0);
11843       emit_insn (GEN_FCN (icode) (op0));
11844 
11845       emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11846 
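      /* For RDSEED the carry flag (set on success) is materialized as a
         QImode 0/1 value and then zero-extended into the SImode return
         value.  */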
11847       op2 = gen_reg_rtx (QImode);
11848 
11849       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11850                          const0_rtx);
11851       emit_insn (gen_rtx_SET (op2, pat));
11852 
11853       if (target == 0
11854 	  || !register_operand (target, SImode))
11855         target = gen_reg_rtx (SImode);
11856 
11857       emit_insn (gen_zero_extendqisi2 (target, op2));
11858       return target;
11859 
11860     case IX86_BUILTIN_SBB32:
11861       icode = CODE_FOR_subborrowsi;
11862       icode2 = CODE_FOR_subborrowsi_0;
11863       mode0 = SImode;
11864       mode1 = DImode;
11865       mode2 = CCmode;
11866       goto handlecarry;
11867 
11868     case IX86_BUILTIN_SBB64:
11869       icode = CODE_FOR_subborrowdi;
11870       icode2 = CODE_FOR_subborrowdi_0;
11871       mode0 = DImode;
11872       mode1 = TImode;
11873       mode2 = CCmode;
11874       goto handlecarry;
11875 
11876     case IX86_BUILTIN_ADDCARRYX32:
11877       icode = CODE_FOR_addcarrysi;
11878       icode2 = CODE_FOR_addcarrysi_0;
11879       mode0 = SImode;
11880       mode1 = DImode;
11881       mode2 = CCCmode;
11882       goto handlecarry;
11883 
11884     case IX86_BUILTIN_ADDCARRYX64:
11885       icode = CODE_FOR_addcarrydi;
11886       icode2 = CODE_FOR_addcarrydi_0;
11887       mode0 = DImode;
11888       mode1 = TImode;
11889       mode2 = CCCmode;
11890 
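      /* Common code for the add-with-carry/subtract-with-borrow builtins:
         load the incoming carry into CF (unless it is a literal zero),
         emit the carry-consuming add/sub pattern, store the wide result
         through the pointer argument and return the carry out.  */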
11891     handlecarry:
11892       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
11893       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
11894       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
11895       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
11896 
11897       op1 = expand_normal (arg0);
11898       if (!integer_zerop (arg0))
11899 	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11900 
11901       op2 = expand_normal (arg1);
11902       if (!register_operand (op2, mode0))
11903 	op2 = copy_to_mode_reg (mode0, op2);
11904 
11905       op3 = expand_normal (arg2);
11906       if (!register_operand (op3, mode0))
11907 	op3 = copy_to_mode_reg (mode0, op3);
11908 
11909       op4 = expand_normal (arg3);
11910       if (!address_operand (op4, VOIDmode))
11911 	{
11912 	  op4 = convert_memory_address (Pmode, op4);
11913 	  op4 = copy_addr_to_reg (op4);
11914 	}
11915 
11916       op0 = gen_reg_rtx (mode0);
11917       if (integer_zerop (arg0))
11918 	{
11919 	  /* If arg0 is 0, optimize right away into an add or sub
11920 	     instruction that sets CCCmode flags.  */
11921 	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
11922 	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11923 	}
11924       else
11925 	{
11926 	  /* Generate CF from input operand.  */
11927 	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11928 
11929 	  /* Generate instruction that consumes CF.  */
11930 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11931 	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11932 	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11933 	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11934 	}
11935 
11936       /* Return current CF value.  */
11937       if (target == 0)
11938         target = gen_reg_rtx (QImode);
11939 
11940       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11941       emit_insn (gen_rtx_SET (target, pat));
11942 
11943       /* Store the result.  */
11944       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11945 
11946       return target;
11947 
11948     case IX86_BUILTIN_READ_FLAGS:
11949       if (ignore)
11950 	return const0_rtx;
11951 
11952       emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11953 
11954       if (optimize
11955 	  || target == NULL_RTX
11956 	  || !nonimmediate_operand (target, word_mode)
11957 	  || GET_MODE (target) != word_mode)
11958 	target = gen_reg_rtx (word_mode);
11959 
11960       emit_insn (gen_pop (target));
11961       return target;
11962 
11963     case IX86_BUILTIN_WRITE_FLAGS:
11964 
11965       arg0 = CALL_EXPR_ARG (exp, 0);
11966       op0 = expand_normal (arg0);
11967       if (!general_no_elim_operand (op0, word_mode))
11968 	op0 = copy_to_mode_reg (word_mode, op0);
11969 
11970       emit_insn (gen_push (op0));
11971       emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11972       return 0;
11973 
11974     case IX86_BUILTIN_KTESTC8:
11975       icode = CODE_FOR_ktestqi;
11976       mode3 = CCCmode;
11977       goto kortest;
11978 
11979     case IX86_BUILTIN_KTESTZ8:
11980       icode = CODE_FOR_ktestqi;
11981       mode3 = CCZmode;
11982       goto kortest;
11983 
11984     case IX86_BUILTIN_KTESTC16:
11985       icode = CODE_FOR_ktesthi;
11986       mode3 = CCCmode;
11987       goto kortest;
11988 
11989     case IX86_BUILTIN_KTESTZ16:
11990       icode = CODE_FOR_ktesthi;
11991       mode3 = CCZmode;
11992       goto kortest;
11993 
11994     case IX86_BUILTIN_KTESTC32:
11995       icode = CODE_FOR_ktestsi;
11996       mode3 = CCCmode;
11997       goto kortest;
11998 
11999     case IX86_BUILTIN_KTESTZ32:
12000       icode = CODE_FOR_ktestsi;
12001       mode3 = CCZmode;
12002       goto kortest;
12003 
12004     case IX86_BUILTIN_KTESTC64:
12005       icode = CODE_FOR_ktestdi;
12006       mode3 = CCCmode;
12007       goto kortest;
12008 
12009     case IX86_BUILTIN_KTESTZ64:
12010       icode = CODE_FOR_ktestdi;
12011       mode3 = CCZmode;
12012       goto kortest;
12013 
12014     case IX86_BUILTIN_KORTESTC8:
12015       icode = CODE_FOR_kortestqi;
12016       mode3 = CCCmode;
12017       goto kortest;
12018 
12019     case IX86_BUILTIN_KORTESTZ8:
12020       icode = CODE_FOR_kortestqi;
12021       mode3 = CCZmode;
12022       goto kortest;
12023 
12024     case IX86_BUILTIN_KORTESTC16:
12025       icode = CODE_FOR_kortesthi;
12026       mode3 = CCCmode;
12027       goto kortest;
12028 
12029     case IX86_BUILTIN_KORTESTZ16:
12030       icode = CODE_FOR_kortesthi;
12031       mode3 = CCZmode;
12032       goto kortest;
12033 
12034     case IX86_BUILTIN_KORTESTC32:
12035       icode = CODE_FOR_kortestsi;
12036       mode3 = CCCmode;
12037       goto kortest;
12038 
12039     case IX86_BUILTIN_KORTESTZ32:
12040       icode = CODE_FOR_kortestsi;
12041       mode3 = CCZmode;
12042       goto kortest;
12043 
12044     case IX86_BUILTIN_KORTESTC64:
12045       icode = CODE_FOR_kortestdi;
12046       mode3 = CCCmode;
12047       goto kortest;
12048 
12049     case IX86_BUILTIN_KORTESTZ64:
12050       icode = CODE_FOR_kortestdi;
12051       mode3 = CCZmode;
12052 
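      /* Common code for the ktest/kortest builtins: narrow both mask
         operands to the operand modes of the chosen pattern, emit the
         flag-setting insn and read back the carry flag (CCCmode) or the
         zero flag (CCZmode) with a setcc.  */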
12053     kortest:
12054       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
12055       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
12056       op0 = expand_normal (arg0);
12057       op1 = expand_normal (arg1);
12058 
12059       mode0 = insn_data[icode].operand[0].mode;
12060       mode1 = insn_data[icode].operand[1].mode;
12061 
12062       if (GET_MODE (op0) != VOIDmode)
12063 	op0 = force_reg (GET_MODE (op0), op0);
12064 
12065       op0 = gen_lowpart (mode0, op0);
12066 
12067       if (!insn_data[icode].operand[0].predicate (op0, mode0))
12068 	op0 = copy_to_mode_reg (mode0, op0);
12069 
12070       if (GET_MODE (op1) != VOIDmode)
12071 	op1 = force_reg (GET_MODE (op1), op1);
12072 
12073       op1 = gen_lowpart (mode1, op1);
12074 
12075       if (!insn_data[icode].operand[1].predicate (op1, mode1))
12076 	op1 = copy_to_mode_reg (mode1, op1);
12077 
12078       target = gen_reg_rtx (QImode);
12079 
12080       /* Emit kortest.  */
12081       emit_insn (GEN_FCN (icode) (op0, op1));
12082       /* And use setcc to return result from flags.  */
12083       ix86_expand_setcc (target, EQ,
12084 			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12085       return target;
12086 
12087     case IX86_BUILTIN_GATHERSIV2DF:
12088       icode = CODE_FOR_avx2_gathersiv2df;
12089       goto gather_gen;
12090     case IX86_BUILTIN_GATHERSIV4DF:
12091       icode = CODE_FOR_avx2_gathersiv4df;
12092       goto gather_gen;
12093     case IX86_BUILTIN_GATHERDIV2DF:
12094       icode = CODE_FOR_avx2_gatherdiv2df;
12095       goto gather_gen;
12096     case IX86_BUILTIN_GATHERDIV4DF:
12097       icode = CODE_FOR_avx2_gatherdiv4df;
12098       goto gather_gen;
12099     case IX86_BUILTIN_GATHERSIV4SF:
12100       icode = CODE_FOR_avx2_gathersiv4sf;
12101       goto gather_gen;
12102     case IX86_BUILTIN_GATHERSIV8SF:
12103       icode = CODE_FOR_avx2_gathersiv8sf;
12104       goto gather_gen;
12105     case IX86_BUILTIN_GATHERDIV4SF:
12106       icode = CODE_FOR_avx2_gatherdiv4sf;
12107       goto gather_gen;
12108     case IX86_BUILTIN_GATHERDIV8SF:
12109       icode = CODE_FOR_avx2_gatherdiv8sf;
12110       goto gather_gen;
12111     case IX86_BUILTIN_GATHERSIV2DI:
12112       icode = CODE_FOR_avx2_gathersiv2di;
12113       goto gather_gen;
12114     case IX86_BUILTIN_GATHERSIV4DI:
12115       icode = CODE_FOR_avx2_gathersiv4di;
12116       goto gather_gen;
12117     case IX86_BUILTIN_GATHERDIV2DI:
12118       icode = CODE_FOR_avx2_gatherdiv2di;
12119       goto gather_gen;
12120     case IX86_BUILTIN_GATHERDIV4DI:
12121       icode = CODE_FOR_avx2_gatherdiv4di;
12122       goto gather_gen;
12123     case IX86_BUILTIN_GATHERSIV4SI:
12124       icode = CODE_FOR_avx2_gathersiv4si;
12125       goto gather_gen;
12126     case IX86_BUILTIN_GATHERSIV8SI:
12127       icode = CODE_FOR_avx2_gathersiv8si;
12128       goto gather_gen;
12129     case IX86_BUILTIN_GATHERDIV4SI:
12130       icode = CODE_FOR_avx2_gatherdiv4si;
12131       goto gather_gen;
12132     case IX86_BUILTIN_GATHERDIV8SI:
12133       icode = CODE_FOR_avx2_gatherdiv8si;
12134       goto gather_gen;
12135     case IX86_BUILTIN_GATHERALTSIV4DF:
12136       icode = CODE_FOR_avx2_gathersiv4df;
12137       goto gather_gen;
12138     case IX86_BUILTIN_GATHERALTDIV8SF:
12139       icode = CODE_FOR_avx2_gatherdiv8sf;
12140       goto gather_gen;
12141     case IX86_BUILTIN_GATHERALTSIV4DI:
12142       icode = CODE_FOR_avx2_gathersiv4di;
12143       goto gather_gen;
12144     case IX86_BUILTIN_GATHERALTDIV8SI:
12145       icode = CODE_FOR_avx2_gatherdiv8si;
12146       goto gather_gen;
12147     case IX86_BUILTIN_GATHER3SIV16SF:
12148       icode = CODE_FOR_avx512f_gathersiv16sf;
12149       goto gather_gen;
12150     case IX86_BUILTIN_GATHER3SIV8DF:
12151       icode = CODE_FOR_avx512f_gathersiv8df;
12152       goto gather_gen;
12153     case IX86_BUILTIN_GATHER3DIV16SF:
12154       icode = CODE_FOR_avx512f_gatherdiv16sf;
12155       goto gather_gen;
12156     case IX86_BUILTIN_GATHER3DIV8DF:
12157       icode = CODE_FOR_avx512f_gatherdiv8df;
12158       goto gather_gen;
12159     case IX86_BUILTIN_GATHER3SIV16SI:
12160       icode = CODE_FOR_avx512f_gathersiv16si;
12161       goto gather_gen;
12162     case IX86_BUILTIN_GATHER3SIV8DI:
12163       icode = CODE_FOR_avx512f_gathersiv8di;
12164       goto gather_gen;
12165     case IX86_BUILTIN_GATHER3DIV16SI:
12166       icode = CODE_FOR_avx512f_gatherdiv16si;
12167       goto gather_gen;
12168     case IX86_BUILTIN_GATHER3DIV8DI:
12169       icode = CODE_FOR_avx512f_gatherdiv8di;
12170       goto gather_gen;
12171     case IX86_BUILTIN_GATHER3ALTSIV8DF:
12172       icode = CODE_FOR_avx512f_gathersiv8df;
12173       goto gather_gen;
12174     case IX86_BUILTIN_GATHER3ALTDIV16SF:
12175       icode = CODE_FOR_avx512f_gatherdiv16sf;
12176       goto gather_gen;
12177     case IX86_BUILTIN_GATHER3ALTSIV8DI:
12178       icode = CODE_FOR_avx512f_gathersiv8di;
12179       goto gather_gen;
12180     case IX86_BUILTIN_GATHER3ALTDIV16SI:
12181       icode = CODE_FOR_avx512f_gatherdiv16si;
12182       goto gather_gen;
12183     case IX86_BUILTIN_GATHER3SIV2DF:
12184       icode = CODE_FOR_avx512vl_gathersiv2df;
12185       goto gather_gen;
12186     case IX86_BUILTIN_GATHER3SIV4DF:
12187       icode = CODE_FOR_avx512vl_gathersiv4df;
12188       goto gather_gen;
12189     case IX86_BUILTIN_GATHER3DIV2DF:
12190       icode = CODE_FOR_avx512vl_gatherdiv2df;
12191       goto gather_gen;
12192     case IX86_BUILTIN_GATHER3DIV4DF:
12193       icode = CODE_FOR_avx512vl_gatherdiv4df;
12194       goto gather_gen;
12195     case IX86_BUILTIN_GATHER3SIV4SF:
12196       icode = CODE_FOR_avx512vl_gathersiv4sf;
12197       goto gather_gen;
12198     case IX86_BUILTIN_GATHER3SIV8SF:
12199       icode = CODE_FOR_avx512vl_gathersiv8sf;
12200       goto gather_gen;
12201     case IX86_BUILTIN_GATHER3DIV4SF:
12202       icode = CODE_FOR_avx512vl_gatherdiv4sf;
12203       goto gather_gen;
12204     case IX86_BUILTIN_GATHER3DIV8SF:
12205       icode = CODE_FOR_avx512vl_gatherdiv8sf;
12206       goto gather_gen;
12207     case IX86_BUILTIN_GATHER3SIV2DI:
12208       icode = CODE_FOR_avx512vl_gathersiv2di;
12209       goto gather_gen;
12210     case IX86_BUILTIN_GATHER3SIV4DI:
12211       icode = CODE_FOR_avx512vl_gathersiv4di;
12212       goto gather_gen;
12213     case IX86_BUILTIN_GATHER3DIV2DI:
12214       icode = CODE_FOR_avx512vl_gatherdiv2di;
12215       goto gather_gen;
12216     case IX86_BUILTIN_GATHER3DIV4DI:
12217       icode = CODE_FOR_avx512vl_gatherdiv4di;
12218       goto gather_gen;
12219     case IX86_BUILTIN_GATHER3SIV4SI:
12220       icode = CODE_FOR_avx512vl_gathersiv4si;
12221       goto gather_gen;
12222     case IX86_BUILTIN_GATHER3SIV8SI:
12223       icode = CODE_FOR_avx512vl_gathersiv8si;
12224       goto gather_gen;
12225     case IX86_BUILTIN_GATHER3DIV4SI:
12226       icode = CODE_FOR_avx512vl_gatherdiv4si;
12227       goto gather_gen;
12228     case IX86_BUILTIN_GATHER3DIV8SI:
12229       icode = CODE_FOR_avx512vl_gatherdiv8si;
12230       goto gather_gen;
12231     case IX86_BUILTIN_GATHER3ALTSIV4DF:
12232       icode = CODE_FOR_avx512vl_gathersiv4df;
12233       goto gather_gen;
12234     case IX86_BUILTIN_GATHER3ALTDIV8SF:
12235       icode = CODE_FOR_avx512vl_gatherdiv8sf;
12236       goto gather_gen;
12237     case IX86_BUILTIN_GATHER3ALTSIV4DI:
12238       icode = CODE_FOR_avx512vl_gathersiv4di;
12239       goto gather_gen;
12240     case IX86_BUILTIN_GATHER3ALTDIV8SI:
12241       icode = CODE_FOR_avx512vl_gatherdiv8si;
12242       goto gather_gen;
12243     case IX86_BUILTIN_SCATTERSIV16SF:
12244       icode = CODE_FOR_avx512f_scattersiv16sf;
12245       goto scatter_gen;
12246     case IX86_BUILTIN_SCATTERSIV8DF:
12247       icode = CODE_FOR_avx512f_scattersiv8df;
12248       goto scatter_gen;
12249     case IX86_BUILTIN_SCATTERDIV16SF:
12250       icode = CODE_FOR_avx512f_scatterdiv16sf;
12251       goto scatter_gen;
12252     case IX86_BUILTIN_SCATTERDIV8DF:
12253       icode = CODE_FOR_avx512f_scatterdiv8df;
12254       goto scatter_gen;
12255     case IX86_BUILTIN_SCATTERSIV16SI:
12256       icode = CODE_FOR_avx512f_scattersiv16si;
12257       goto scatter_gen;
12258     case IX86_BUILTIN_SCATTERSIV8DI:
12259       icode = CODE_FOR_avx512f_scattersiv8di;
12260       goto scatter_gen;
12261     case IX86_BUILTIN_SCATTERDIV16SI:
12262       icode = CODE_FOR_avx512f_scatterdiv16si;
12263       goto scatter_gen;
12264     case IX86_BUILTIN_SCATTERDIV8DI:
12265       icode = CODE_FOR_avx512f_scatterdiv8di;
12266       goto scatter_gen;
12267     case IX86_BUILTIN_SCATTERSIV8SF:
12268       icode = CODE_FOR_avx512vl_scattersiv8sf;
12269       goto scatter_gen;
12270     case IX86_BUILTIN_SCATTERSIV4SF:
12271       icode = CODE_FOR_avx512vl_scattersiv4sf;
12272       goto scatter_gen;
12273     case IX86_BUILTIN_SCATTERSIV4DF:
12274       icode = CODE_FOR_avx512vl_scattersiv4df;
12275       goto scatter_gen;
12276     case IX86_BUILTIN_SCATTERSIV2DF:
12277       icode = CODE_FOR_avx512vl_scattersiv2df;
12278       goto scatter_gen;
12279     case IX86_BUILTIN_SCATTERDIV8SF:
12280       icode = CODE_FOR_avx512vl_scatterdiv8sf;
12281       goto scatter_gen;
12282     case IX86_BUILTIN_SCATTERDIV4SF:
12283       icode = CODE_FOR_avx512vl_scatterdiv4sf;
12284       goto scatter_gen;
12285     case IX86_BUILTIN_SCATTERDIV4DF:
12286       icode = CODE_FOR_avx512vl_scatterdiv4df;
12287       goto scatter_gen;
12288     case IX86_BUILTIN_SCATTERDIV2DF:
12289       icode = CODE_FOR_avx512vl_scatterdiv2df;
12290       goto scatter_gen;
12291     case IX86_BUILTIN_SCATTERSIV8SI:
12292       icode = CODE_FOR_avx512vl_scattersiv8si;
12293       goto scatter_gen;
12294     case IX86_BUILTIN_SCATTERSIV4SI:
12295       icode = CODE_FOR_avx512vl_scattersiv4si;
12296       goto scatter_gen;
12297     case IX86_BUILTIN_SCATTERSIV4DI:
12298       icode = CODE_FOR_avx512vl_scattersiv4di;
12299       goto scatter_gen;
12300     case IX86_BUILTIN_SCATTERSIV2DI:
12301       icode = CODE_FOR_avx512vl_scattersiv2di;
12302       goto scatter_gen;
12303     case IX86_BUILTIN_SCATTERDIV8SI:
12304       icode = CODE_FOR_avx512vl_scatterdiv8si;
12305       goto scatter_gen;
12306     case IX86_BUILTIN_SCATTERDIV4SI:
12307       icode = CODE_FOR_avx512vl_scatterdiv4si;
12308       goto scatter_gen;
12309     case IX86_BUILTIN_SCATTERDIV4DI:
12310       icode = CODE_FOR_avx512vl_scatterdiv4di;
12311       goto scatter_gen;
12312     case IX86_BUILTIN_SCATTERDIV2DI:
12313       icode = CODE_FOR_avx512vl_scatterdiv2di;
12314       goto scatter_gen;
12315     case IX86_BUILTIN_GATHERPFDPD:
12316       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12317       goto vec_prefetch_gen;
12318     case IX86_BUILTIN_SCATTERALTSIV8DF:
12319       icode = CODE_FOR_avx512f_scattersiv8df;
12320       goto scatter_gen;
12321     case IX86_BUILTIN_SCATTERALTDIV16SF:
12322       icode = CODE_FOR_avx512f_scatterdiv16sf;
12323       goto scatter_gen;
12324     case IX86_BUILTIN_SCATTERALTSIV8DI:
12325       icode = CODE_FOR_avx512f_scattersiv8di;
12326       goto scatter_gen;
12327     case IX86_BUILTIN_SCATTERALTDIV16SI:
12328       icode = CODE_FOR_avx512f_scatterdiv16si;
12329       goto scatter_gen;
12330     case IX86_BUILTIN_SCATTERALTSIV4DF:
12331       icode = CODE_FOR_avx512vl_scattersiv4df;
12332       goto scatter_gen;
12333     case IX86_BUILTIN_SCATTERALTDIV8SF:
12334       icode = CODE_FOR_avx512vl_scatterdiv8sf;
12335       goto scatter_gen;
12336     case IX86_BUILTIN_SCATTERALTSIV4DI:
12337       icode = CODE_FOR_avx512vl_scattersiv4di;
12338       goto scatter_gen;
12339     case IX86_BUILTIN_SCATTERALTDIV8SI:
12340       icode = CODE_FOR_avx512vl_scatterdiv8si;
12341       goto scatter_gen;
12342     case IX86_BUILTIN_SCATTERALTSIV2DF:
12343       icode = CODE_FOR_avx512vl_scattersiv2df;
12344       goto scatter_gen;
12345     case IX86_BUILTIN_SCATTERALTDIV4SF:
12346       icode = CODE_FOR_avx512vl_scatterdiv4sf;
12347       goto scatter_gen;
12348     case IX86_BUILTIN_SCATTERALTSIV2DI:
12349       icode = CODE_FOR_avx512vl_scattersiv2di;
12350       goto scatter_gen;
12351     case IX86_BUILTIN_SCATTERALTDIV4SI:
12352       icode = CODE_FOR_avx512vl_scatterdiv4si;
12353       goto scatter_gen;
12354     case IX86_BUILTIN_GATHERPFDPS:
12355       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12356       goto vec_prefetch_gen;
12357     case IX86_BUILTIN_GATHERPFQPD:
12358       icode = CODE_FOR_avx512pf_gatherpfv8didf;
12359       goto vec_prefetch_gen;
12360     case IX86_BUILTIN_GATHERPFQPS:
12361       icode = CODE_FOR_avx512pf_gatherpfv8disf;
12362       goto vec_prefetch_gen;
12363     case IX86_BUILTIN_SCATTERPFDPD:
12364       icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12365       goto vec_prefetch_gen;
12366     case IX86_BUILTIN_SCATTERPFDPS:
12367       icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12368       goto vec_prefetch_gen;
12369     case IX86_BUILTIN_SCATTERPFQPD:
12370       icode = CODE_FOR_avx512pf_scatterpfv8didf;
12371       goto vec_prefetch_gen;
12372     case IX86_BUILTIN_SCATTERPFQPS:
12373       icode = CODE_FOR_avx512pf_scatterpfv8disf;
12374       goto vec_prefetch_gen;
12375 
12376     gather_gen:
12377       rtx half;
12378       rtx (*gen) (rtx, rtx);
12379 
12380       arg0 = CALL_EXPR_ARG (exp, 0);
12381       arg1 = CALL_EXPR_ARG (exp, 1);
12382       arg2 = CALL_EXPR_ARG (exp, 2);
12383       arg3 = CALL_EXPR_ARG (exp, 3);
12384       arg4 = CALL_EXPR_ARG (exp, 4);
12385       op0 = expand_normal (arg0);
12386       op1 = expand_normal (arg1);
12387       op2 = expand_normal (arg2);
12388       op3 = expand_normal (arg3);
12389       op4 = expand_normal (arg4);
12390       /* Note the arg order is different from the operand order.  */
12391       mode0 = insn_data[icode].operand[1].mode;
12392       mode2 = insn_data[icode].operand[3].mode;
12393       mode3 = insn_data[icode].operand[4].mode;
12394       mode4 = insn_data[icode].operand[5].mode;
12395 
12396       if (target == NULL_RTX
12397 	  || GET_MODE (target) != insn_data[icode].operand[0].mode
12398 	  || !insn_data[icode].operand[0].predicate (target,
12399 						     GET_MODE (target)))
12400 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12401       else
12402 	subtarget = target;
12403 
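      /* For the "alt" gather variants the index vector and the
         destination have different element counts, so only the low half
         of the wider index, source or mask operand is used; the switch
         below extracts it.  */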
12404       switch (fcode)
12405 	{
12406 	case IX86_BUILTIN_GATHER3ALTSIV8DF:
12407 	case IX86_BUILTIN_GATHER3ALTSIV8DI:
12408 	  half = gen_reg_rtx (V8SImode);
12409 	  if (!nonimmediate_operand (op2, V16SImode))
12410 	    op2 = copy_to_mode_reg (V16SImode, op2);
12411 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
12412 	  op2 = half;
12413 	  break;
12414 	case IX86_BUILTIN_GATHER3ALTSIV4DF:
12415 	case IX86_BUILTIN_GATHER3ALTSIV4DI:
12416 	case IX86_BUILTIN_GATHERALTSIV4DF:
12417 	case IX86_BUILTIN_GATHERALTSIV4DI:
12418 	  half = gen_reg_rtx (V4SImode);
12419 	  if (!nonimmediate_operand (op2, V8SImode))
12420 	    op2 = copy_to_mode_reg (V8SImode, op2);
12421 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
12422 	  op2 = half;
12423 	  break;
12424 	case IX86_BUILTIN_GATHER3ALTDIV16SF:
12425 	case IX86_BUILTIN_GATHER3ALTDIV16SI:
12426 	  half = gen_reg_rtx (mode0);
12427 	  if (mode0 == V8SFmode)
12428 	    gen = gen_vec_extract_lo_v16sf;
12429 	  else
12430 	    gen = gen_vec_extract_lo_v16si;
12431 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
12432 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12433 	  emit_insn (gen (half, op0));
12434 	  op0 = half;
12435 	  op3 = lowpart_subreg (QImode, op3, HImode);
12436 	  break;
12437 	case IX86_BUILTIN_GATHER3ALTDIV8SF:
12438 	case IX86_BUILTIN_GATHER3ALTDIV8SI:
12439 	case IX86_BUILTIN_GATHERALTDIV8SF:
12440 	case IX86_BUILTIN_GATHERALTDIV8SI:
12441 	  half = gen_reg_rtx (mode0);
12442 	  if (mode0 == V4SFmode)
12443 	    gen = gen_vec_extract_lo_v8sf;
12444 	  else
12445 	    gen = gen_vec_extract_lo_v8si;
12446 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
12447 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12448 	  emit_insn (gen (half, op0));
12449 	  op0 = half;
12450 	  if (VECTOR_MODE_P (GET_MODE (op3)))
12451 	    {
12452 	      half = gen_reg_rtx (mode0);
12453 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
12454 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12455 	      emit_insn (gen (half, op3));
12456 	      op3 = half;
12457 	    }
12458 	  break;
12459 	default:
12460 	  break;
12461 	}
12462 
12463       /* Force the memory operand to use only a base register here.  But
12464 	 we don't want to do that for the memory operands of other builtin
12465 	 functions.  */
12466       op1 = ix86_zero_extend_to_Pmode (op1);
12467 
12468       if (!insn_data[icode].operand[1].predicate (op0, mode0))
12469 	op0 = copy_to_mode_reg (mode0, op0);
12470       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12471 	op1 = copy_to_mode_reg (Pmode, op1);
12472       if (!insn_data[icode].operand[3].predicate (op2, mode2))
12473 	op2 = copy_to_mode_reg (mode2, op2);
12474 
12475       op3 = fixup_modeless_constant (op3, mode3);
12476 
12477       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12478 	{
12479 	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
12480 	    op3 = copy_to_mode_reg (mode3, op3);
12481 	}
12482       else
12483 	{
12484 	  op3 = copy_to_reg (op3);
12485 	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12486 	}
12487       if (!insn_data[icode].operand[5].predicate (op4, mode4))
12488 	{
12489           error ("the last argument must be scale 1, 2, 4, 8");
12490           return const0_rtx;
12491 	}
12492 
12493       /* Optimize.  If mask is known to have all high bits set,
12494 	 replace op0 with pc_rtx to signal that the instruction
12495 	 overwrites the whole destination and doesn't use its
12496 	 previous contents.  */
12497       if (optimize)
12498 	{
12499 	  if (TREE_CODE (arg3) == INTEGER_CST)
12500 	    {
12501 	      if (integer_all_onesp (arg3))
12502 		op0 = pc_rtx;
12503 	    }
12504 	  else if (TREE_CODE (arg3) == VECTOR_CST)
12505 	    {
12506 	      unsigned int negative = 0;
12507 	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12508 		{
12509 		  tree cst = VECTOR_CST_ELT (arg3, i);
12510 		  if (TREE_CODE (cst) == INTEGER_CST
12511 		      && tree_int_cst_sign_bit (cst))
12512 		    negative++;
12513 		  else if (TREE_CODE (cst) == REAL_CST
12514 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12515 		    negative++;
12516 		}
12517 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12518 		op0 = pc_rtx;
12519 	    }
12520 	  else if (TREE_CODE (arg3) == SSA_NAME
12521 		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12522 	    {
12523 	      /* Recognize also when mask is like:
12524 		 __v2df src = _mm_setzero_pd ();
12525 		 __v2df mask = _mm_cmpeq_pd (src, src);
12526 		 or
12527 		 __v8sf src = _mm256_setzero_ps ();
12528 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12529 		 as that is a cheaper way to load all ones into
12530 		 a register than having to load a constant from
12531 		 memory.  */
12532 	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12533 	      if (is_gimple_call (def_stmt))
12534 		{
12535 		  tree fndecl = gimple_call_fndecl (def_stmt);
12536 		  if (fndecl
12537 		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12538 		    switch (DECL_MD_FUNCTION_CODE (fndecl))
12539 		      {
12540 		      case IX86_BUILTIN_CMPPD:
12541 		      case IX86_BUILTIN_CMPPS:
12542 		      case IX86_BUILTIN_CMPPD256:
12543 		      case IX86_BUILTIN_CMPPS256:
12544 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12545 			  break;
12546 			/* FALLTHRU */
12547 		      case IX86_BUILTIN_CMPEQPD:
12548 		      case IX86_BUILTIN_CMPEQPS:
12549 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12550 			    && initializer_zerop (gimple_call_arg (def_stmt,
12551 								   1)))
12552 			  op0 = pc_rtx;
12553 			break;
12554 		      default:
12555 			break;
12556 		      }
12557 		}
12558 	    }
12559 	}
12560 
12561       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12562       if (! pat)
12563 	return const0_rtx;
12564       emit_insn (pat);
12565 
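      /* Some gathers produce a result in a vector twice as wide as the
         builtin's return type; for those, extract the low half of
         subtarget into the real target.  */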
12566       switch (fcode)
12567 	{
12568 	case IX86_BUILTIN_GATHER3DIV16SF:
12569 	  if (target == NULL_RTX)
12570 	    target = gen_reg_rtx (V8SFmode);
12571 	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12572 	  break;
12573 	case IX86_BUILTIN_GATHER3DIV16SI:
12574 	  if (target == NULL_RTX)
12575 	    target = gen_reg_rtx (V8SImode);
12576 	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12577 	  break;
12578 	case IX86_BUILTIN_GATHER3DIV8SF:
12579 	case IX86_BUILTIN_GATHERDIV8SF:
12580 	  if (target == NULL_RTX)
12581 	    target = gen_reg_rtx (V4SFmode);
12582 	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12583 	  break;
12584 	case IX86_BUILTIN_GATHER3DIV8SI:
12585 	case IX86_BUILTIN_GATHERDIV8SI:
12586 	  if (target == NULL_RTX)
12587 	    target = gen_reg_rtx (V4SImode);
12588 	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12589 	  break;
12590 	default:
12591 	  target = subtarget;
12592 	  break;
12593 	}
12594       return target;
12595 
12596     scatter_gen:
12597       arg0 = CALL_EXPR_ARG (exp, 0);
12598       arg1 = CALL_EXPR_ARG (exp, 1);
12599       arg2 = CALL_EXPR_ARG (exp, 2);
12600       arg3 = CALL_EXPR_ARG (exp, 3);
12601       arg4 = CALL_EXPR_ARG (exp, 4);
12602       op0 = expand_normal (arg0);
12603       op1 = expand_normal (arg1);
12604       op2 = expand_normal (arg2);
12605       op3 = expand_normal (arg3);
12606       op4 = expand_normal (arg4);
12607       mode1 = insn_data[icode].operand[1].mode;
12608       mode2 = insn_data[icode].operand[2].mode;
12609       mode3 = insn_data[icode].operand[3].mode;
12610       mode4 = insn_data[icode].operand[4].mode;
12611 
12612       /* The scatter instruction stores operand op3 to memory with
12613 	 indices from op2 and scale from op4 under writemask op1.
12614 	 If the index operand op2 has more elements than the source
12615 	 operand op3, only its low half is used, and vice versa.  */
12616       switch (fcode)
12617 	{
12618 	case IX86_BUILTIN_SCATTERALTSIV8DF:
12619 	case IX86_BUILTIN_SCATTERALTSIV8DI:
12620 	  half = gen_reg_rtx (V8SImode);
12621 	  if (!nonimmediate_operand (op2, V16SImode))
12622 	    op2 = copy_to_mode_reg (V16SImode, op2);
12623 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
12624 	  op2 = half;
12625 	  break;
12626 	case IX86_BUILTIN_SCATTERALTDIV16SF:
12627 	case IX86_BUILTIN_SCATTERALTDIV16SI:
12628 	  half = gen_reg_rtx (mode3);
12629 	  if (mode3 == V8SFmode)
12630 	    gen = gen_vec_extract_lo_v16sf;
12631 	  else
12632 	    gen = gen_vec_extract_lo_v16si;
12633 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
12634 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12635 	  emit_insn (gen (half, op3));
12636 	  op3 = half;
12637 	  break;
12638 	case IX86_BUILTIN_SCATTERALTSIV4DF:
12639 	case IX86_BUILTIN_SCATTERALTSIV4DI:
12640 	  half = gen_reg_rtx (V4SImode);
12641 	  if (!nonimmediate_operand (op2, V8SImode))
12642 	    op2 = copy_to_mode_reg (V8SImode, op2);
12643 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
12644 	  op2 = half;
12645 	  break;
12646 	case IX86_BUILTIN_SCATTERALTDIV8SF:
12647 	case IX86_BUILTIN_SCATTERALTDIV8SI:
12648 	  half = gen_reg_rtx (mode3);
12649 	  if (mode3 == V4SFmode)
12650 	    gen = gen_vec_extract_lo_v8sf;
12651 	  else
12652 	    gen = gen_vec_extract_lo_v8si;
12653 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
12654 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12655 	  emit_insn (gen (half, op3));
12656 	  op3 = half;
12657 	  break;
12658 	case IX86_BUILTIN_SCATTERALTSIV2DF:
12659 	case IX86_BUILTIN_SCATTERALTSIV2DI:
12660 	  if (!nonimmediate_operand (op2, V4SImode))
12661 	    op2 = copy_to_mode_reg (V4SImode, op2);
12662 	  break;
12663 	case IX86_BUILTIN_SCATTERALTDIV4SF:
12664 	case IX86_BUILTIN_SCATTERALTDIV4SI:
12665 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
12666 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12667 	  break;
12668 	default:
12669 	  break;
12670 	}
12671 
12672       /* Force the memory operand to use only a base register here.  But
12673 	 we don't want to do that for the memory operands of other builtin
12674 	 functions.  */
12675       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12676 
12677       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12678 	op0 = copy_to_mode_reg (Pmode, op0);
12679 
12680       op1 = fixup_modeless_constant (op1, mode1);
12681 
12682       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12683 	{
12684 	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
12685 	    op1 = copy_to_mode_reg (mode1, op1);
12686 	}
12687       else
12688 	{
12689 	  op1 = copy_to_reg (op1);
12690 	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12691 	}
12692 
12693       if (!insn_data[icode].operand[2].predicate (op2, mode2))
12694 	op2 = copy_to_mode_reg (mode2, op2);
12695 
12696       if (!insn_data[icode].operand[3].predicate (op3, mode3))
12697 	op3 = copy_to_mode_reg (mode3, op3);
12698 
12699       if (!insn_data[icode].operand[4].predicate (op4, mode4))
12700 	{
12701 	  error ("the last argument must be scale 1, 2, 4, 8");
12702 	  return const0_rtx;
12703 	}
12704 
12705       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12706       if (! pat)
12707 	return const0_rtx;
12708 
12709       emit_insn (pat);
12710       return 0;
12711 
12712     vec_prefetch_gen:
12713       arg0 = CALL_EXPR_ARG (exp, 0);
12714       arg1 = CALL_EXPR_ARG (exp, 1);
12715       arg2 = CALL_EXPR_ARG (exp, 2);
12716       arg3 = CALL_EXPR_ARG (exp, 3);
12717       arg4 = CALL_EXPR_ARG (exp, 4);
12718       op0 = expand_normal (arg0);
12719       op1 = expand_normal (arg1);
12720       op2 = expand_normal (arg2);
12721       op3 = expand_normal (arg3);
12722       op4 = expand_normal (arg4);
12723       mode0 = insn_data[icode].operand[0].mode;
12724       mode1 = insn_data[icode].operand[1].mode;
12725       mode3 = insn_data[icode].operand[3].mode;
12726       mode4 = insn_data[icode].operand[4].mode;
12727 
12728       op0 = fixup_modeless_constant (op0, mode0);
12729 
12730       if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12731 	{
12732 	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
12733 	    op0 = copy_to_mode_reg (mode0, op0);
12734 	}
12735       else
12736 	{
12737 	  op0 = copy_to_reg (op0);
12738 	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12739 	}
12740 
12741       if (!insn_data[icode].operand[1].predicate (op1, mode1))
12742 	op1 = copy_to_mode_reg (mode1, op1);
12743 
12744       /* Force the memory operand to use only a base register here.  But
12745 	 we don't want to do that for the memory operands of other builtin
12746 	 functions.  */
12747       op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12748 
12749       if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12750 	op2 = copy_to_mode_reg (Pmode, op2);
12751 
12752       if (!insn_data[icode].operand[3].predicate (op3, mode3))
12753 	{
12754 	  error ("the fourth argument must be scale 1, 2, 4, 8");
12755 	  return const0_rtx;
12756 	}
12757 
12758       if (!insn_data[icode].operand[4].predicate (op4, mode4))
12759 	{
12760 	  error ("incorrect hint operand");
12761 	  return const0_rtx;
12762 	}
12763 
12764       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12765       if (! pat)
12766 	return const0_rtx;
12767 
12768       emit_insn (pat);
12769 
12770       return 0;
12771 
12772     case IX86_BUILTIN_XABORT:
12773       icode = CODE_FOR_xabort;
12774       arg0 = CALL_EXPR_ARG (exp, 0);
12775       op0 = expand_normal (arg0);
12776       mode0 = insn_data[icode].operand[0].mode;
12777       if (!insn_data[icode].operand[0].predicate (op0, mode0))
12778 	{
12779 	  error ("the argument to the %<xabort%> intrinsic must "
12780 		 "be an 8-bit immediate");
12781 	  return const0_rtx;
12782 	}
12783       emit_insn (gen_xabort (op0));
12784       return 0;
12785 
12786     case IX86_BUILTIN_RSTORSSP:
12787     case IX86_BUILTIN_CLRSSBSY:
12788       arg0 = CALL_EXPR_ARG (exp, 0);
12789       op0 = expand_normal (arg0);
12790       icode = (fcode == IX86_BUILTIN_RSTORSSP
12791 	  ? CODE_FOR_rstorssp
12792 	  : CODE_FOR_clrssbsy);
12793       if (!address_operand (op0, VOIDmode))
12794 	{
12795 	  op1 = convert_memory_address (Pmode, op0);
12796 	  op0 = copy_addr_to_reg (op1);
12797 	}
12798       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12799       return 0;
12800 
12801     case IX86_BUILTIN_WRSSD:
12802     case IX86_BUILTIN_WRSSQ:
12803     case IX86_BUILTIN_WRUSSD:
12804     case IX86_BUILTIN_WRUSSQ:
12805       arg0 = CALL_EXPR_ARG (exp, 0);
12806       op0 = expand_normal (arg0);
12807       arg1 = CALL_EXPR_ARG (exp, 1);
12808       op1 = expand_normal (arg1);
12809       switch (fcode)
12810 	{
12811 	case IX86_BUILTIN_WRSSD:
12812 	  icode = CODE_FOR_wrsssi;
12813 	  mode = SImode;
12814 	  break;
12815 	case IX86_BUILTIN_WRSSQ:
12816 	  icode = CODE_FOR_wrssdi;
12817 	  mode = DImode;
12818 	  break;
12819 	case IX86_BUILTIN_WRUSSD:
12820 	  icode = CODE_FOR_wrusssi;
12821 	  mode = SImode;
12822 	  break;
12823 	case IX86_BUILTIN_WRUSSQ:
12824 	  icode = CODE_FOR_wrussdi;
12825 	  mode = DImode;
12826 	  break;
12827 	}
12828       op0 = force_reg (mode, op0);
12829       if (!address_operand (op1, VOIDmode))
12830 	{
12831 	  op2 = convert_memory_address (Pmode, op1);
12832 	  op1 = copy_addr_to_reg (op2);
12833 	}
12834       emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12835       return 0;
12836 
12837     case IX86_BUILTIN_VZEROUPPER:
12838       cfun->machine->has_explicit_vzeroupper = true;
12839       break;
12840 
12841     default:
12842       break;
12843     }
12844 
12845   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12846       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12847     {
12848       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12849       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12850 					       target);
12851     }
12852 
12853   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12854       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12855     {
12856       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12857       rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12858       rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12859       rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12860       int masked = 1;
12861       machine_mode mode, wide_mode, nar_mode;
12862 
12863       nar_mode  = V4SFmode;
12864       mode      = V16SFmode;
12865       wide_mode = V64SFmode;
12866       fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
12867       fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12868 
12869       switch (fcode)
12870 	{
12871 	case IX86_BUILTIN_4FMAPS:
12872 	  fcn = gen_avx5124fmaddps_4fmaddps;
12873 	  masked = 0;
12874 	  goto v4fma_expand;
12875 
12876 	case IX86_BUILTIN_4DPWSSD:
12877 	  nar_mode  = V4SImode;
12878 	  mode      = V16SImode;
12879 	  wide_mode = V64SImode;
12880 	  fcn = gen_avx5124vnniw_vp4dpwssd;
12881 	  masked = 0;
12882 	  goto v4fma_expand;
12883 
12884 	case IX86_BUILTIN_4DPWSSDS:
12885 	  nar_mode  = V4SImode;
12886 	  mode      = V16SImode;
12887 	  wide_mode = V64SImode;
12888 	  fcn = gen_avx5124vnniw_vp4dpwssds;
12889 	  masked = 0;
12890 	  goto v4fma_expand;
12891 
12892 	case IX86_BUILTIN_4FNMAPS:
12893 	  fcn = gen_avx5124fmaddps_4fnmaddps;
12894 	  masked = 0;
12895 	  goto v4fma_expand;
12896 
12897 	case IX86_BUILTIN_4FNMAPS_MASK:
12898 	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
12899 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12900 	  goto v4fma_expand;
12901 
12902 	case IX86_BUILTIN_4DPWSSD_MASK:
12903 	  nar_mode  = V4SImode;
12904 	  mode      = V16SImode;
12905 	  wide_mode = V64SImode;
12906 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
12907 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12908 	  goto v4fma_expand;
12909 
12910 	case IX86_BUILTIN_4DPWSSDS_MASK:
12911 	  nar_mode  = V4SImode;
12912 	  mode      = V16SImode;
12913 	  wide_mode = V64SImode;
12914 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
12915 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12916 	  goto v4fma_expand;
12917 
12918 	case IX86_BUILTIN_4FMAPS_MASK:
12919 	  {
12920 	    tree args[4];
12921 	    rtx ops[4];
12922 	    rtx wide_reg;
12923 	    rtx accum;
12924 	    rtx addr;
12925 	    rtx mem;
12926 
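	    /* Pack the four vector operands into one wide register at
	       64-byte offsets, as the 4FMAPS/4VNNIW patterns expect,
	       then combine it with the accumulator and the memory
	       operand.  */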
12927 v4fma_expand:
12928 	    wide_reg = gen_reg_rtx (wide_mode);
12929 	    for (i = 0; i < 4; i++)
12930 	      {
12931 		args[i] = CALL_EXPR_ARG (exp, i);
12932 		ops[i] = expand_normal (args[i]);
12933 
12934 		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12935 				ops[i]);
12936 	      }
12937 
12938 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12939 	    accum = force_reg (mode, accum);
12940 
12941 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12942 	    addr = force_reg (Pmode, addr);
12943 
12944 	    mem = gen_rtx_MEM (nar_mode, addr);
12945 
12946 	    target = gen_reg_rtx (mode);
12947 
12948 	    emit_move_insn (target, accum);
12949 
12950 	    if (! masked)
12951 	      emit_insn (fcn (target, accum, wide_reg, mem));
12952 	    else
12953 	      {
12954 		rtx merge, mask;
12955 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12956 
12957 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12958 
12959 		if (CONST_INT_P (mask))
12960 		  mask = fixup_modeless_constant (mask, HImode);
12961 
12962 		mask = force_reg (HImode, mask);
12963 
12964 		if (GET_MODE (mask) != HImode)
12965 		  mask = gen_rtx_SUBREG (HImode, mask, 0);
12966 
12967 		/* If merge is 0 then we're about to emit z-masked variant.  */
12968 		if (const0_operand (merge, mode))
12969 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12970 		/* If merge is the same as accum then emit merge-masked variant.  */
12971 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12972 		  {
12973 		    merge = force_reg (mode, merge);
12974 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12975 		  }
12976 		/* Merging with an unknown value can happen when z-masking at -O0.  */
12977 		else
12978 		  {
12979 		    target = gen_reg_rtx (mode);
12980 		    emit_move_insn (target, merge);
12981 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12982 		  }
12983 	      }
12984 	    return target;
12985 	  }
12986 
12987 	case IX86_BUILTIN_4FNMASS:
12988 	  fcn = gen_avx5124fmaddps_4fnmaddss;
12989 	  masked = 0;
12990 	  goto s4fma_expand;
12991 
12992 	case IX86_BUILTIN_4FMASS:
12993 	  fcn = gen_avx5124fmaddps_4fmaddss;
12994 	  masked = 0;
12995 	  goto s4fma_expand;
12996 
12997 	case IX86_BUILTIN_4FNMASS_MASK:
12998 	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
12999 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13000 	  goto s4fma_expand;
13001 
13002 	case IX86_BUILTIN_4FMASS_MASK:
13003 	  {
13004 	    tree args[4];
13005 	    rtx ops[4];
13006 	    rtx wide_reg;
13007 	    rtx accum;
13008 	    rtx addr;
13009 	    rtx mem;
13010 
13011 	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13012 	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13013 
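	    /* Scalar variant: only the low SFmode element of each of the
	       four operands is copied into the wide V64SF register; the
	       accumulator, memory operand and result are V4SF.  */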
13014 s4fma_expand:
13015 	    mode = V4SFmode;
13016 	    wide_reg = gen_reg_rtx (V64SFmode);
13017 	    for (i = 0; i < 4; i++)
13018 	      {
13019 		rtx tmp;
13020 		args[i] = CALL_EXPR_ARG (exp, i);
13021 		ops[i] = expand_normal (args[i]);
13022 
13023 		tmp = gen_reg_rtx (SFmode);
13024 		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13025 
13026 		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13027 				gen_rtx_SUBREG (V16SFmode, tmp, 0));
13028 	      }
13029 
13030 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13031 	    accum = force_reg (V4SFmode, accum);
13032 
13033 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13034 	    addr = force_reg (Pmode, addr);
13035 
13036 	    mem = gen_rtx_MEM (V4SFmode, addr);
13037 
13038 	    target = gen_reg_rtx (V4SFmode);
13039 
13040 	    emit_move_insn (target, accum);
13041 
13042 	    if (! masked)
13043 	      emit_insn (fcn (target, accum, wide_reg, mem));
13044 	    else
13045 	      {
13046 		rtx merge, mask;
13047 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13048 
13049 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13050 
13051 		if (CONST_INT_P (mask))
13052 		  mask = fixup_modeless_constant (mask, QImode);
13053 
13054 		mask = force_reg (QImode, mask);
13055 
13056 		if (GET_MODE (mask) != QImode)
13057 		  mask = gen_rtx_SUBREG (QImode, mask, 0);
13058 
13059 		/* If merge is 0 then we're about to emit z-masked variant.  */
13060 		if (const0_operand (merge, mode))
13061 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13062 		/* If merge is the same as accum then emit merge-masked
13063 		   variant.  */
13064 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13065 		  {
13066 		    merge = force_reg (mode, merge);
13067 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13068 		  }
13069 		/* Merging with an unknown value can happen when z-masking
13070 		   at -O0.  */
13071 		else
13072 		  {
13073 		    target = gen_reg_rtx (mode);
13074 		    emit_move_insn (target, merge);
13075 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13076 		  }
13077 	      }
13078 	    return target;
13079 	  }
13080 	case IX86_BUILTIN_RDPID:
13081 	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13082 						    target);
13083 	case IX86_BUILTIN_FABSQ:
13084 	case IX86_BUILTIN_COPYSIGNQ:
13085 	  if (!TARGET_SSE)
13086 	    /* Emit a normal call if SSE isn't available.  */
13087 	    return expand_call (exp, target, ignore);
13088 	  /* FALLTHRU */
13089 	default:
13090 	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13091 	}
13092     }
13093 
13094   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13095       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13096     {
13097       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13098       return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13099     }
13100 
13101   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13102       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13103     {
13104       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13105       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13106     }
13107 
13108   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13109       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13110     {
13111       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13112       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13113     }
13114 
13115   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13116       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13117     {
13118       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13119       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13120     }
13121 
13122   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13123       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13124     {
13125       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13126       const struct builtin_description *d = bdesc_multi_arg + i;
13127       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13128 					    (enum ix86_builtin_func_type)
13129 					    d->flag, d->comparison);
13130     }
13131 
13132   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13133       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13134     {
13135       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13136       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13137 					       target);
13138     }
13139 
13140   if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13141       && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13142     {
13143       i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13144       return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13145 				       target);
13146     }
13147 
13148   gcc_unreachable ();
13149 }
13150 
13151 /* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
13152    fill target with val via vec_duplicate.  */
13153 
13154 static bool
13155 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13156 {
13157   bool ok;
13158   rtx_insn *insn;
13159   rtx dup;
13160 
13161   /* First attempt to recognize VAL as-is.  */
13162   dup = gen_vec_duplicate (mode, val);
13163   insn = emit_insn (gen_rtx_SET (target, dup));
13164   if (recog_memoized (insn) < 0)
13165     {
13166       rtx_insn *seq;
13167       machine_mode innermode = GET_MODE_INNER (mode);
13168       rtx reg;
13169 
13170       /* If that fails, force VAL into a register.  */
13171 
13172       start_sequence ();
13173       reg = force_reg (innermode, val);
13174       if (GET_MODE (reg) != innermode)
13175 	reg = gen_lowpart (innermode, reg);
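      /* Rewrite the source of the SET we already emitted in place; any
	 insns needed to load REG are emitted in front of it, and the
	 insn is then re-recognized below.  */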
13176       SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13177       seq = get_insns ();
13178       end_sequence ();
13179       if (seq)
13180 	emit_insn_before (seq, insn);
13181 
13182       ok = recog_memoized (insn) >= 0;
13183       gcc_assert (ok);
13184     }
13185   return true;
13186 }
13187 
13188 /* Get a vector mode of the same size as the original but with elements
13189    twice as wide.  This is only guaranteed to apply to integral vectors.  */
13190 
13191 static machine_mode
13192 get_mode_wider_vector (machine_mode o)
13193 {
13194   /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
13195   machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13196   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13197   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13198   return n;
13199 }
13200 
13201 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13202 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13203 
13204 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13205    with all elements equal to VAR.  Return true if successful.  */
13206 
13207 static bool
13208 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13209 				   rtx target, rtx val)
13210 {
13211   bool ok;
13212 
13213   switch (mode)
13214     {
13215     case E_V2SImode:
13216     case E_V2SFmode:
13217       if (!mmx_ok)
13218 	return false;
13219       /* FALLTHRU */
13220 
13221     case E_V4DFmode:
13222     case E_V4DImode:
13223     case E_V8SFmode:
13224     case E_V8SImode:
13225     case E_V2DFmode:
13226     case E_V2DImode:
13227     case E_V4SFmode:
13228     case E_V4SImode:
13229     case E_V16SImode:
13230     case E_V8DImode:
13231     case E_V16SFmode:
13232     case E_V8DFmode:
13233       return ix86_vector_duplicate_value (mode, target, val);
13234 
13235     case E_V4HImode:
13236       if (!mmx_ok)
13237 	return false;
13238       if (TARGET_SSE || TARGET_3DNOW_A)
13239 	{
13240 	  rtx x;
13241 
13242 	  val = gen_lowpart (SImode, val);
13243 	  x = gen_rtx_TRUNCATE (HImode, val);
13244 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
13245 	  emit_insn (gen_rtx_SET (target, x));
13246 	  return true;
13247 	}
13248       goto widen;
13249 
13250     case E_V8QImode:
13251       if (!mmx_ok)
13252 	return false;
13253       goto widen;
13254 
13255     case E_V8HImode:
13256       if (TARGET_AVX2)
13257 	return ix86_vector_duplicate_value (mode, target, val);
13258 
13259       if (TARGET_SSE2)
13260 	{
13261 	  struct expand_vec_perm_d dperm;
13262 	  rtx tmp1, tmp2;
13263 
13264 	permute:
13265 	  memset (&dperm, 0, sizeof (dperm));
13266 	  dperm.target = target;
13267 	  dperm.vmode = mode;
13268 	  dperm.nelt = GET_MODE_NUNITS (mode);
13269 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13270 	  dperm.one_operand_p = true;
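	  /* dperm.perm[] is still all zeros from the memset above, so the
	     permutation broadcasts element 0 of op0 into every lane.  */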
13271 
13272 	  /* Extend to SImode using a paradoxical SUBREG.  */
13273 	  tmp1 = gen_reg_rtx (SImode);
13274 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
13275 
13276 	  /* Insert the SImode value as the low element of a V4SImode vector.  */
13277 	  tmp2 = gen_reg_rtx (V4SImode);
13278 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13279 	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13280 
13281 	  ok = (expand_vec_perm_1 (&dperm)
13282 		|| expand_vec_perm_broadcast_1 (&dperm));
13283 	  gcc_assert (ok);
13284 	  return ok;
13285 	}
13286       goto widen;
13287 
13288     case E_V16QImode:
13289       if (TARGET_AVX2)
13290 	return ix86_vector_duplicate_value (mode, target, val);
13291 
13292       if (TARGET_SSE2)
13293 	goto permute;
13294       goto widen;
13295 
13296     widen:
13297       /* Replicate the value once into the next wider mode and recurse.  */
13298       {
13299 	machine_mode smode, wsmode, wvmode;
13300 	rtx x;
13301 
13302 	smode = GET_MODE_INNER (mode);
13303 	wvmode = get_mode_wider_vector (mode);
13304 	wsmode = GET_MODE_INNER (wvmode);
13305 
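	/* Build a WSMODE scalar whose two SMODE halves both hold VAL:
	   zero-extend VAL, shift a copy left by the element width and
	   IOR the two together.  */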
13306 	val = convert_modes (wsmode, smode, val, true);
13307 	x = expand_simple_binop (wsmode, ASHIFT, val,
13308 				 GEN_INT (GET_MODE_BITSIZE (smode)),
13309 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13310 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13311 
13312 	x = gen_reg_rtx (wvmode);
13313 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13314 	gcc_assert (ok);
13315 	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13316 	return ok;
13317       }
13318 
13319     case E_V16HImode:
13320     case E_V32QImode:
13321       if (TARGET_AVX2)
13322 	return ix86_vector_duplicate_value (mode, target, val);
13323       else
13324 	{
13325 	  machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13326 	  rtx x = gen_reg_rtx (hvmode);
13327 
13328 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13329 	  gcc_assert (ok);
13330 
13331 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
13332 	  emit_insn (gen_rtx_SET (target, x));
13333 	}
13334       return true;
13335 
13336     case E_V64QImode:
13337     case E_V32HImode:
13338       if (TARGET_AVX512BW)
13339 	return ix86_vector_duplicate_value (mode, target, val);
13340       else
13341 	{
13342 	  machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13343 	  rtx x = gen_reg_rtx (hvmode);
13344 
13345 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13346 	  gcc_assert (ok);
13347 
13348 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
13349 	  emit_insn (gen_rtx_SET (target, x));
13350 	}
13351       return true;
13352 
13353     default:
13354       return false;
13355     }
13356 }
13357 
13358 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13359    whose ONE_VAR element is VAR, and other elements are zero.  Return true
13360    if successful.  */
13361 
13362 static bool
13363 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13364 				     rtx target, rtx var, int one_var)
13365 {
13366   machine_mode vsimode;
13367   rtx new_target;
13368   rtx x, tmp;
13369   bool use_vector_set = false;
13370   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13371 
13372   switch (mode)
13373     {
13374     case E_V2DImode:
13375       /* For SSE4.1, we normally use vector set.  But if the second
13376 	 element is zero and inter-unit moves are OK, we use movq
13377 	 instead.  */
13378       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13379 			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
13380 			     && one_var == 0));
13381       break;
13382     case E_V16QImode:
13383     case E_V4SImode:
13384     case E_V4SFmode:
13385       use_vector_set = TARGET_SSE4_1;
13386       break;
13387     case E_V8HImode:
13388       use_vector_set = TARGET_SSE2;
13389       break;
13390     case E_V8QImode:
13391       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13392       break;
13393     case E_V4HImode:
13394       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13395       break;
13396     case E_V32QImode:
13397     case E_V16HImode:
13398       use_vector_set = TARGET_AVX;
13399       break;
13400     case E_V8SImode:
13401       use_vector_set = TARGET_AVX;
13402       gen_vec_set_0 = gen_vec_setv8si_0;
13403       break;
13404     case E_V8SFmode:
13405       use_vector_set = TARGET_AVX;
13406       gen_vec_set_0 = gen_vec_setv8sf_0;
13407       break;
13408     case E_V4DFmode:
13409       use_vector_set = TARGET_AVX;
13410       gen_vec_set_0 = gen_vec_setv4df_0;
13411       break;
13412     case E_V4DImode:
13413       /* Use ix86_expand_vector_set in 64bit mode only.  */
13414       use_vector_set = TARGET_AVX && TARGET_64BIT;
13415       gen_vec_set_0 = gen_vec_setv4di_0;
13416       break;
13417     case E_V16SImode:
13418       use_vector_set = TARGET_AVX512F && one_var == 0;
13419       gen_vec_set_0 = gen_vec_setv16si_0;
13420       break;
13421     case E_V16SFmode:
13422       use_vector_set = TARGET_AVX512F && one_var == 0;
13423       gen_vec_set_0 = gen_vec_setv16sf_0;
13424       break;
13425     case E_V8DFmode:
13426       use_vector_set = TARGET_AVX512F && one_var == 0;
13427       gen_vec_set_0 = gen_vec_setv8df_0;
13428       break;
13429     case E_V8DImode:
13430       /* Use ix86_expand_vector_set in 64bit mode only.  */
13431       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13432       gen_vec_set_0 = gen_vec_setv8di_0;
13433       break;
13434     default:
13435       break;
13436     }
13437 
13438   if (use_vector_set)
13439     {
13440       if (gen_vec_set_0 && one_var == 0)
13441 	{
13442 	  var = force_reg (GET_MODE_INNER (mode), var);
13443 	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13444 	  return true;
13445 	}
13446       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13447       var = force_reg (GET_MODE_INNER (mode), var);
13448       ix86_expand_vector_set (mmx_ok, target, var, one_var);
13449       return true;
13450     }
13451 
13452   switch (mode)
13453     {
13454     case E_V2SFmode:
13455     case E_V2SImode:
13456       if (!mmx_ok)
13457 	return false;
13458       /* FALLTHRU */
13459 
13460     case E_V2DFmode:
13461     case E_V2DImode:
13462       if (one_var != 0)
13463 	return false;
13464       var = force_reg (GET_MODE_INNER (mode), var);
13465       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13466       emit_insn (gen_rtx_SET (target, x));
13467       return true;
13468 
13469     case E_V4SFmode:
13470     case E_V4SImode:
13471       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13472 	new_target = gen_reg_rtx (mode);
13473       else
13474 	new_target = target;
13475       var = force_reg (GET_MODE_INNER (mode), var);
13476       x = gen_rtx_VEC_DUPLICATE (mode, var);
13477       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13478       emit_insn (gen_rtx_SET (new_target, x));
13479       if (one_var != 0)
13480 	{
13481 	  /* We need to shuffle the value to the correct position, so
13482 	     create a new pseudo to store the intermediate result.  */
13483 
13484 	  /* With SSE2, we can use the integer shuffle insns.  */
13485 	  if (mode != V4SFmode && TARGET_SSE2)
13486 	    {
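	      /* NEW_TARGET currently holds <VAR, 0, 0, 0>; route element 0
		 into lane ONE_VAR and element 1 (a zero) into the other
		 lanes.  */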
13487 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13488 					    const1_rtx,
13489 					    GEN_INT (one_var == 1 ? 0 : 1),
13490 					    GEN_INT (one_var == 2 ? 0 : 1),
13491 					    GEN_INT (one_var == 3 ? 0 : 1)));
13492 	      if (target != new_target)
13493 		emit_move_insn (target, new_target);
13494 	      return true;
13495 	    }
13496 
13497 	  /* Otherwise convert the intermediate result to V4SFmode and
13498 	     use the SSE1 shuffle instructions.  */
13499 	  if (mode != V4SFmode)
13500 	    {
13501 	      tmp = gen_reg_rtx (V4SFmode);
13502 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13503 	    }
13504 	  else
13505 	    tmp = new_target;
13506 
13507 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13508 				       const1_rtx,
13509 				       GEN_INT (one_var == 1 ? 0 : 1),
13510 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
13511 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13512 
13513 	  if (mode != V4SFmode)
13514 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13515 	  else if (tmp != target)
13516 	    emit_move_insn (target, tmp);
13517 	}
13518       else if (target != new_target)
13519 	emit_move_insn (target, new_target);
13520       return true;
13521 
13522     case E_V8HImode:
13523     case E_V16QImode:
13524       vsimode = V4SImode;
13525       goto widen;
13526     case E_V4HImode:
13527     case E_V8QImode:
13528       if (!mmx_ok)
13529 	return false;
13530       vsimode = V2SImode;
13531       goto widen;
13532     widen:
13533       if (one_var != 0)
13534 	return false;
13535 
13536       /* Zero extend the variable element to SImode and recurse.  */
13537       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13538 
13539       x = gen_reg_rtx (vsimode);
13540       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13541 						var, one_var))
13542 	gcc_unreachable ();
13543 
13544       emit_move_insn (target, gen_lowpart (mode, x));
13545       return true;
13546 
13547     default:
13548       return false;
13549     }
13550 }
13551 
13552 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
13553    consisting of the values in VALS.  It is known that all elements
13554    except ONE_VAR are constants.  Return true if successful.  */
13555 
13556 static bool
13557 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13558 				 rtx target, rtx vals, int one_var)
13559 {
13560   rtx var = XVECEXP (vals, 0, one_var);
13561   machine_mode wmode;
13562   rtx const_vec, x;
13563 
13564   const_vec = copy_rtx (vals);
13565   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13566   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13567 
13568   switch (mode)
13569     {
13570     case E_V2DFmode:
13571     case E_V2DImode:
13572     case E_V2SFmode:
13573     case E_V2SImode:
13574       /* For the two element vectors, it's just as easy to use
13575 	 the general case.  */
13576       return false;
13577 
13578     case E_V4DImode:
13579       /* Use ix86_expand_vector_set in 64bit mode only.  */
13580       if (!TARGET_64BIT)
13581 	return false;
13582       /* FALLTHRU */
13583     case E_V4DFmode:
13584     case E_V8SFmode:
13585     case E_V8SImode:
13586     case E_V16HImode:
13587     case E_V32QImode:
13588     case E_V4SFmode:
13589     case E_V4SImode:
13590     case E_V8HImode:
13591     case E_V4HImode:
13592       break;
13593 
13594     case E_V16QImode:
13595       if (TARGET_SSE4_1)
13596 	break;
13597       wmode = V8HImode;
13598       goto widen;
13599     case E_V8QImode:
13600       if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13601 	break;
13602       wmode = V4HImode;
13603       goto widen;
13604     widen:
13605       /* There's no way to set one QImode entry easily.  Combine
13606 	 the variable value with its adjacent constant value, and
13607 	 promote to an HImode set.  */
13608       x = XVECEXP (vals, 0, one_var ^ 1);
13609       if (one_var & 1)
13610 	{
13611 	  var = convert_modes (HImode, QImode, var, true);
13612 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13613 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
13614 	  x = GEN_INT (INTVAL (x) & 0xff);
13615 	}
13616       else
13617 	{
13618 	  var = convert_modes (HImode, QImode, var, true);
13619 	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
13620 	}
13621       if (x != const0_rtx)
13622 	var = expand_simple_binop (HImode, IOR, var, x, var,
13623 				   1, OPTAB_LIB_WIDEN);
13624 
13625       x = gen_reg_rtx (wmode);
13626       emit_move_insn (x, gen_lowpart (wmode, const_vec));
13627       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13628 
13629       emit_move_insn (target, gen_lowpart (mode, x));
13630       return true;
13631 
13632     default:
13633       return false;
13634     }
13635 
13636   emit_move_insn (target, const_vec);
13637   ix86_expand_vector_set (mmx_ok, target, var, one_var);
13638   return true;
13639 }
13640 
13641 /* A subroutine of ix86_expand_vector_init_general.  Use vector
13642    concatenate to handle the most general case: all values variable,
13643    and none identical.  */
13644 
13645 static void
13646 ix86_expand_vector_init_concat (machine_mode mode,
13647 				rtx target, rtx *ops, int n)
13648 {
13649   machine_mode half_mode = VOIDmode;
13650   rtx half[2];
13651   rtvec v;
13652   int i, j;
13653 
13654   switch (n)
13655     {
13656     case 2:
13657       switch (mode)
13658 	{
13659 	case E_V16SImode:
13660 	  half_mode = V8SImode;
13661 	  break;
13662 	case E_V16SFmode:
13663 	  half_mode = V8SFmode;
13664 	  break;
13665 	case E_V8DImode:
13666 	  half_mode = V4DImode;
13667 	  break;
13668 	case E_V8DFmode:
13669 	  half_mode = V4DFmode;
13670 	  break;
13671 	case E_V8SImode:
13672 	  half_mode = V4SImode;
13673 	  break;
13674 	case E_V8SFmode:
13675 	  half_mode = V4SFmode;
13676 	  break;
13677 	case E_V4DImode:
13678 	  half_mode = V2DImode;
13679 	  break;
13680 	case E_V4DFmode:
13681 	  half_mode = V2DFmode;
13682 	  break;
13683 	case E_V4SImode:
13684 	  half_mode = V2SImode;
13685 	  break;
13686 	case E_V4SFmode:
13687 	  half_mode = V2SFmode;
13688 	  break;
13689 	case E_V2DImode:
13690 	  half_mode = DImode;
13691 	  break;
13692 	case E_V2SImode:
13693 	  half_mode = SImode;
13694 	  break;
13695 	case E_V2DFmode:
13696 	  half_mode = DFmode;
13697 	  break;
13698 	case E_V2SFmode:
13699 	  half_mode = SFmode;
13700 	  break;
13701 	default:
13702 	  gcc_unreachable ();
13703 	}
13704 
13705       if (!register_operand (ops[1], half_mode))
13706 	ops[1] = force_reg (half_mode, ops[1]);
13707       if (!register_operand (ops[0], half_mode))
13708 	ops[0] = force_reg (half_mode, ops[0]);
13709       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13710 							  ops[1])));
13711       break;
13712 
13713     case 4:
13714       switch (mode)
13715 	{
13716 	case E_V4DImode:
13717 	  half_mode = V2DImode;
13718 	  break;
13719 	case E_V4DFmode:
13720 	  half_mode = V2DFmode;
13721 	  break;
13722 	case E_V4SImode:
13723 	  half_mode = V2SImode;
13724 	  break;
13725 	case E_V4SFmode:
13726 	  half_mode = V2SFmode;
13727 	  break;
13728 	default:
13729 	  gcc_unreachable ();
13730 	}
13731       goto half;
13732 
13733     case 8:
13734       switch (mode)
13735 	{
13736 	case E_V8DImode:
13737 	  half_mode = V4DImode;
13738 	  break;
13739 	case E_V8DFmode:
13740 	  half_mode = V4DFmode;
13741 	  break;
13742 	case E_V8SImode:
13743 	  half_mode = V4SImode;
13744 	  break;
13745 	case E_V8SFmode:
13746 	  half_mode = V4SFmode;
13747 	  break;
13748 	default:
13749 	  gcc_unreachable ();
13750 	}
13751       goto half;
13752 
13753     case 16:
13754       switch (mode)
13755 	{
13756 	case E_V16SImode:
13757 	  half_mode = V8SImode;
13758 	  break;
13759 	case E_V16SFmode:
13760 	  half_mode = V8SFmode;
13761 	  break;
13762 	default:
13763 	  gcc_unreachable ();
13764 	}
13765       goto half;
13766 
13767 half:
13768       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
13769       i = n - 1;
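      /* J == 1 builds the high half from the tail of OPS and J == 0 the
	 low half; the final concat below stitches the two halves
	 together.  */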
13770       for (j = 1; j != -1; j--)
13771 	{
13772 	  half[j] = gen_reg_rtx (half_mode);
13773 	  switch (n >> 1)
13774 	    {
13775 	    case 2:
13776 	      v = gen_rtvec (2, ops[i-1], ops[i]);
13777 	      i -= 2;
13778 	      break;
13779 	    case 4:
13780 	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
13781 	      i -= 4;
13782 	      break;
13783 	    case 8:
13784 	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
13785 			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
13786 	      i -= 8;
13787 	      break;
13788 	    default:
13789 	      gcc_unreachable ();
13790 	    }
13791 	  ix86_expand_vector_init (false, half[j],
13792 				   gen_rtx_PARALLEL (half_mode, v));
13793 	}
13794 
13795       ix86_expand_vector_init_concat (mode, target, half, 2);
13796       break;
13797 
13798     default:
13799       gcc_unreachable ();
13800     }
13801 }
13802 
13803 /* A subroutine of ix86_expand_vector_init_general.  Use vector
13804    interleave to handle the most general case: all values variable,
13805    and none identical.  */
13806 
13807 static void
13808 ix86_expand_vector_init_interleave (machine_mode mode,
13809 				    rtx target, rtx *ops, int n)
13810 {
13811   machine_mode first_imode, second_imode, third_imode, inner_mode;
13812   int i, j;
13813   rtx op0, op1;
13814   rtx (*gen_load_even) (rtx, rtx, rtx);
13815   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13816   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13817 
13818   switch (mode)
13819     {
13820     case E_V8HImode:
13821       gen_load_even = gen_vec_setv8hi;
13822       gen_interleave_first_low = gen_vec_interleave_lowv4si;
13823       gen_interleave_second_low = gen_vec_interleave_lowv2di;
13824       inner_mode = HImode;
13825       first_imode = V4SImode;
13826       second_imode = V2DImode;
13827       third_imode = VOIDmode;
13828       break;
13829     case E_V16QImode:
13830       gen_load_even = gen_vec_setv16qi;
13831       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13832       gen_interleave_second_low = gen_vec_interleave_lowv4si;
13833       inner_mode = QImode;
13834       first_imode = V8HImode;
13835       second_imode = V4SImode;
13836       third_imode = V2DImode;
13837       break;
13838     default:
13839       gcc_unreachable ();
13840     }
13841 
13842   for (i = 0; i < n; i++)
13843     {
13844 	      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
13845       op0 = gen_reg_rtx (SImode);
13846       emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13847 
13848 	      /* Insert the SImode value as the low element of a V4SImode vector.  */
13849       op1 = gen_reg_rtx (V4SImode);
13850       op0 = gen_rtx_VEC_MERGE (V4SImode,
13851 			       gen_rtx_VEC_DUPLICATE (V4SImode,
13852 						      op0),
13853 			       CONST0_RTX (V4SImode),
13854 			       const1_rtx);
13855       emit_insn (gen_rtx_SET (op1, op0));
13856 
13857 	      /* Cast the V4SImode vector back to a vector in the original mode.  */
13858       op0 = gen_reg_rtx (mode);
13859       emit_move_insn (op0, gen_lowpart (mode, op1));
13860 
13861       /* Load even elements into the second position.  */
13862       emit_insn (gen_load_even (op0,
13863 				force_reg (inner_mode,
13864 					   ops [i + i + 1]),
13865 				const1_rtx));
13866 
13867       /* Cast vector to FIRST_IMODE vector.  */
13868       ops[i] = gen_reg_rtx (first_imode);
13869       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13870     }
13871 
13872   /* Interleave low FIRST_IMODE vectors.  */
13873   for (i = j = 0; i < n; i += 2, j++)
13874     {
13875       op0 = gen_reg_rtx (first_imode);
13876       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13877 
13878       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
13879       ops[j] = gen_reg_rtx (second_imode);
13880       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13881     }
13882 
13883   /* Interleave low SECOND_IMODE vectors.  */
13884   switch (second_imode)
13885     {
13886     case E_V4SImode:
13887       for (i = j = 0; i < n / 2; i += 2, j++)
13888 	{
13889 	  op0 = gen_reg_rtx (second_imode);
13890 	  emit_insn (gen_interleave_second_low (op0, ops[i],
13891 						ops[i + 1]));
13892 
13893 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13894 	     vector.  */
13895 	  ops[j] = gen_reg_rtx (third_imode);
13896 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13897 	}
13898       second_imode = V2DImode;
13899       gen_interleave_second_low = gen_vec_interleave_lowv2di;
13900       /* FALLTHRU */
13901 
13902     case E_V2DImode:
13903       op0 = gen_reg_rtx (second_imode);
13904       emit_insn (gen_interleave_second_low (op0, ops[0],
13905 					    ops[1]));
13906 
13907       /* Cast the SECOND_IMODE vector back to a vector in the original
13908 	 mode.  */
13909       emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13910       break;
13911 
13912     default:
13913       gcc_unreachable ();
13914     }
13915 }
13916 
13917 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
13918    all values variable, and none identical.  */
13919 
13920 static void
13921 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13922 				 rtx target, rtx vals)
13923 {
13924   rtx ops[64], op0, op1, op2, op3, op4, op5;
13925   machine_mode half_mode = VOIDmode;
13926   machine_mode quarter_mode = VOIDmode;
13927   int n, i;
13928 
13929   switch (mode)
13930     {
13931     case E_V2SFmode:
13932     case E_V2SImode:
13933       if (!mmx_ok && !TARGET_SSE)
13934 	break;
13935       /* FALLTHRU */
13936 
13937     case E_V16SImode:
13938     case E_V16SFmode:
13939     case E_V8DFmode:
13940     case E_V8DImode:
13941     case E_V8SFmode:
13942     case E_V8SImode:
13943     case E_V4DFmode:
13944     case E_V4DImode:
13945     case E_V4SFmode:
13946     case E_V4SImode:
13947     case E_V2DFmode:
13948     case E_V2DImode:
13949       n = GET_MODE_NUNITS (mode);
13950       for (i = 0; i < n; i++)
13951 	ops[i] = XVECEXP (vals, 0, i);
13952       ix86_expand_vector_init_concat (mode, target, ops, n);
13953       return;
13954 
13955     case E_V2TImode:
13956       for (i = 0; i < 2; i++)
13957 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13958       op0 = gen_reg_rtx (V4DImode);
13959       ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13960       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13961       return;
13962 
13963     case E_V4TImode:
13964       for (i = 0; i < 4; i++)
13965 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13966       ops[4] = gen_reg_rtx (V4DImode);
13967       ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13968       ops[5] = gen_reg_rtx (V4DImode);
13969       ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13970       op0 = gen_reg_rtx (V8DImode);
13971       ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13972       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13973       return;
13974 
13975     case E_V32QImode:
13976       half_mode = V16QImode;
13977       goto half;
13978 
13979     case E_V16HImode:
13980       half_mode = V8HImode;
13981       goto half;
13982 
13983 half:
13984       n = GET_MODE_NUNITS (mode);
13985       for (i = 0; i < n; i++)
13986 	ops[i] = XVECEXP (vals, 0, i);
13987       op0 = gen_reg_rtx (half_mode);
13988       op1 = gen_reg_rtx (half_mode);
13989       ix86_expand_vector_init_interleave (half_mode, op0, ops,
13990 					  n >> 2);
13991       ix86_expand_vector_init_interleave (half_mode, op1,
13992 					  &ops [n >> 1], n >> 2);
13993       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
13994       return;
13995 
13996     case E_V64QImode:
13997       quarter_mode = V16QImode;
13998       half_mode = V32QImode;
13999       goto quarter;
14000 
14001     case E_V32HImode:
14002       quarter_mode = V8HImode;
14003       half_mode = V16HImode;
14004       goto quarter;
14005 
14006 quarter:
14007       n = GET_MODE_NUNITS (mode);
14008       for (i = 0; i < n; i++)
14009 	ops[i] = XVECEXP (vals, 0, i);
14010       op0 = gen_reg_rtx (quarter_mode);
14011       op1 = gen_reg_rtx (quarter_mode);
14012       op2 = gen_reg_rtx (quarter_mode);
14013       op3 = gen_reg_rtx (quarter_mode);
14014       op4 = gen_reg_rtx (half_mode);
14015       op5 = gen_reg_rtx (half_mode);
14016       ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14017 					  n >> 3);
14018       ix86_expand_vector_init_interleave (quarter_mode, op1,
14019 					  &ops [n >> 2], n >> 3);
14020       ix86_expand_vector_init_interleave (quarter_mode, op2,
14021 					  &ops [n >> 1], n >> 3);
14022       ix86_expand_vector_init_interleave (quarter_mode, op3,
14023 					  &ops [(n >> 1) | (n >> 2)], n >> 3);
14024       emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14025       emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14026       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14027       return;
14028 
14029     case E_V16QImode:
14030       if (!TARGET_SSE4_1)
14031 	break;
14032       /* FALLTHRU */
14033 
14034     case E_V8HImode:
14035       if (!TARGET_SSE2)
14036 	break;
14037 
14038       /* Don't use ix86_expand_vector_init_interleave if we can't
14039 	 move from GPR to SSE register directly.  */
14040       if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14041 	break;
14042 
14043       n = GET_MODE_NUNITS (mode);
14044       for (i = 0; i < n; i++)
14045 	ops[i] = XVECEXP (vals, 0, i);
14046       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14047       return;
14048 
14049     case E_V4HImode:
14050     case E_V8QImode:
14051       break;
14052 
14053     default:
14054       gcc_unreachable ();
14055     }
14056 
14057     {
14058       int i, j, n_elts, n_words, n_elt_per_word;
14059       machine_mode inner_mode;
14060       rtx words[4], shift;
14061 
14062       inner_mode = GET_MODE_INNER (mode);
14063       n_elts = GET_MODE_NUNITS (mode);
14064       n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14065       n_elt_per_word = n_elts / n_words;
14066       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14067 
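      /* Assemble each word-mode chunk from its elements, highest index
	 first: shift the partial word left by one element width and IOR
	 in the next element.  */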
14068       for (i = 0; i < n_words; ++i)
14069 	{
14070 	  rtx word = NULL_RTX;
14071 
14072 	  for (j = 0; j < n_elt_per_word; ++j)
14073 	    {
14074 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14075 	      elt = convert_modes (word_mode, inner_mode, elt, true);
14076 
14077 	      if (j == 0)
14078 		word = elt;
14079 	      else
14080 		{
14081 		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14082 					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
14083 		  word = expand_simple_binop (word_mode, IOR, word, elt,
14084 					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
14085 		}
14086 	    }
14087 
14088 	  words[i] = word;
14089 	}
14090 
14091       if (n_words == 1)
14092 	emit_move_insn (target, gen_lowpart (mode, words[0]));
14093       else if (n_words == 2)
14094 	{
14095 	  rtx tmp = gen_reg_rtx (mode);
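	  /* Clobber TMP first so that filling it in two word-sized pieces
	     is not seen as a use of an uninitialized register.  */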
14096 	  emit_clobber (tmp);
14097 	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14098 	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14099 	  emit_move_insn (target, tmp);
14100 	}
14101       else if (n_words == 4)
14102 	{
14103 	  rtx tmp = gen_reg_rtx (V4SImode);
14104 	  gcc_assert (word_mode == SImode);
14105 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14106 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14107 	  emit_move_insn (target, gen_lowpart (mode, tmp));
14108 	}
14109       else
14110 	gcc_unreachable ();
14111     }
14112 }
14113 
14114 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
14115    instructions unless MMX_OK is true.  */
14116 
14117 void
14118 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14119 {
14120   machine_mode mode = GET_MODE (target);
14121   machine_mode inner_mode = GET_MODE_INNER (mode);
14122   int n_elts = GET_MODE_NUNITS (mode);
14123   int n_var = 0, one_var = -1;
14124   bool all_same = true, all_const_zero = true;
14125   int i;
14126   rtx x;
14127 
14128   /* First handle initialization from vector elements rather than scalars.  */
14129   if (n_elts != XVECLEN (vals, 0))
14130     {
14131       rtx subtarget = target;
14132       x = XVECEXP (vals, 0, 0);
14133       gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14134       if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14135 	{
14136 	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14137 	  if (inner_mode == QImode
14138 	      || inner_mode == HImode
14139 	      || inner_mode == TImode)
14140 	    {
14141 	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14142 	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
14143 	      n_bits /= GET_MODE_SIZE (elt_mode);
14144 	      mode = mode_for_vector (elt_mode, n_bits).require ();
14145 	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
14146 	      ops[0] = gen_lowpart (inner_mode, ops[0]);
14147 	      ops[1] = gen_lowpart (inner_mode, ops[1]);
14148 	      subtarget = gen_reg_rtx (mode);
14149 	    }
14150 	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14151 	  if (subtarget != target)
14152 	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14153 	  return;
14154 	}
14155       gcc_unreachable ();
14156     }
14157 
14158   for (i = 0; i < n_elts; ++i)
14159     {
14160       x = XVECEXP (vals, 0, i);
14161       if (!(CONST_SCALAR_INT_P (x)
14162 	    || CONST_DOUBLE_P (x)
14163 	    || CONST_FIXED_P (x)))
14164 	n_var++, one_var = i;
14165       else if (x != CONST0_RTX (inner_mode))
14166 	all_const_zero = false;
14167       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14168 	all_same = false;
14169     }
14170 
14171   /* Constants are best loaded from the constant pool.  */
14172   if (n_var == 0)
14173     {
14174       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14175       return;
14176     }
14177 
14178   /* If all values are identical, broadcast the value.  */
14179   if (all_same
14180       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14181 					    XVECEXP (vals, 0, 0)))
14182     return;
14183 
14184   /* Values where only one field is non-constant are best loaded from
14185      the pool and overwritten via move later.  */
14186   if (n_var == 1)
14187     {
14188       if (all_const_zero
14189 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14190 						  XVECEXP (vals, 0, one_var),
14191 						  one_var))
14192 	return;
14193 
14194       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14195 	return;
14196     }
14197 
14198   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14199 }
14200 
14201 void
14202 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14203 {
14204   machine_mode mode = GET_MODE (target);
14205   machine_mode inner_mode = GET_MODE_INNER (mode);
14206   machine_mode half_mode;
14207   bool use_vec_merge = false;
14208   rtx tmp;
14209   static rtx (*gen_extract[6][2]) (rtx, rtx)
14210     = {
14211 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14212 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14213 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14214 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14215 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14216 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14217       };
14218   static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14219     = {
14220 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14221 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14222 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14223 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14224 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14225 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14226       };
14227   int i, j, n;
14228   machine_mode mmode = VOIDmode;
14229   rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14230 
14231   switch (mode)
14232     {
14233     case E_V2SImode:
14234       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14235       if (use_vec_merge)
14236 	break;
14237       /* FALLTHRU */
14238 
14239     case E_V2SFmode:
14240       if (mmx_ok)
14241 	{
14242 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14243 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14244 	  if (elt == 0)
14245 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14246 	  else
14247 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14248 	  emit_insn (gen_rtx_SET (target, tmp));
14249 	  return;
14250 	}
14251       break;
14252 
14253     case E_V2DImode:
14254       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14255       if (use_vec_merge)
14256 	break;
14257 
14258       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14259       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14260       if (elt == 0)
14261 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14262       else
14263 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14264       emit_insn (gen_rtx_SET (target, tmp));
14265       return;
14266 
14267     case E_V2DFmode:
14268       /* NB: For ELT == 0, use standard scalar operation patterns which
14269 	 preserve the rest of the vector for combiner:
14270 
14271 	 (vec_merge:V2DF
14272 	   (vec_duplicate:V2DF (reg:DF))
14273 	   (reg:V2DF)
14274 	   (const_int 1))
14275        */
14276       if (elt == 0)
14277 	goto do_vec_merge;
14278 
14279       {
14280 	rtx op0, op1;
14281 
14282 	/* For the two element vectors, we implement a VEC_CONCAT with
14283 	   the extraction of the other element.  */
14284 
14285 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14286 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14287 
14288 	if (elt == 0)
14289 	  op0 = val, op1 = tmp;
14290 	else
14291 	  op0 = tmp, op1 = val;
14292 
14293 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14294 	emit_insn (gen_rtx_SET (target, tmp));
14295       }
14296       return;
14297 
14298     case E_V4SFmode:
14299       use_vec_merge = TARGET_SSE4_1;
14300       if (use_vec_merge)
14301 	break;
14302 
14303       switch (elt)
14304 	{
14305 	case 0:
14306 	  use_vec_merge = true;
14307 	  break;
14308 
14309 	case 1:
14310 	  /* tmp = target = A B C D */
14311 	  tmp = copy_to_reg (target);
14312 	  /* target = A A B B */
14313 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14314 	  /* target = X A B B */
14315 	  ix86_expand_vector_set (false, target, val, 0);
14316 	  /* target = A X C D  */
14317 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14318 					  const1_rtx, const0_rtx,
14319 					  GEN_INT (2+4), GEN_INT (3+4)));
14320 	  return;
14321 
14322 	case 2:
14323 	  /* tmp = target = A B C D */
14324 	  tmp = copy_to_reg (target);
14325 	  /* tmp = X B C D */
14326 	  ix86_expand_vector_set (false, tmp, val, 0);
14327 	  /* target = A B X D */
14328 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14329 					  const0_rtx, const1_rtx,
14330 					  GEN_INT (0+4), GEN_INT (3+4)));
14331 	  return;
14332 
14333 	case 3:
14334 	  /* tmp = target = A B C D */
14335 	  tmp = copy_to_reg (target);
14336 	  /* tmp = X B C D */
14337 	  ix86_expand_vector_set (false, tmp, val, 0);
14338 	  /* target = A B C X */
14339 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14340 					  const0_rtx, const1_rtx,
14341 					  GEN_INT (2+4), GEN_INT (0+4)));
14342 	  return;
14343 
14344 	default:
14345 	  gcc_unreachable ();
14346 	}
14347       break;
14348 
14349     case E_V4SImode:
14350       use_vec_merge = TARGET_SSE4_1;
14351       if (use_vec_merge)
14352 	break;
14353 
14354       /* Element 0 handled by vec_merge below.  */
14355       if (elt == 0)
14356 	{
14357 	  use_vec_merge = true;
14358 	  break;
14359 	}
14360 
14361       if (TARGET_SSE2)
14362 	{
14363 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
14364 	     store into element 0, then shuffle them back.  */
14365 
14366 	  rtx order[4];
14367 
14368 	  order[0] = GEN_INT (elt);
14369 	  order[1] = const1_rtx;
14370 	  order[2] = const2_rtx;
14371 	  order[3] = GEN_INT (3);
14372 	  order[elt] = const0_rtx;
14373 
14374 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14375 					order[1], order[2], order[3]));
14376 
14377 	  ix86_expand_vector_set (false, target, val, 0);
14378 
14379 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14380 					order[1], order[2], order[3]));
14381 	}
14382       else
14383 	{
14384 	  /* For SSE1, we have to reuse the V4SF code.  */
14385 	  rtx t = gen_reg_rtx (V4SFmode);
14386 	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
14387 	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14388 	  emit_move_insn (target, gen_lowpart (mode, t));
14389 	}
14390       return;
14391 
14392     case E_V8HImode:
14393       use_vec_merge = TARGET_SSE2;
14394       break;
14395     case E_V4HImode:
14396       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14397       break;
14398 
14399     case E_V16QImode:
14400       use_vec_merge = TARGET_SSE4_1;
14401       break;
14402 
14403     case E_V8QImode:
14404       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14405       break;
14406 
14407     case E_V32QImode:
14408       half_mode = V16QImode;
14409       j = 0;
14410       n = 16;
14411       goto half;
14412 
14413     case E_V16HImode:
14414       half_mode = V8HImode;
14415       j = 1;
14416       n = 8;
14417       goto half;
14418 
14419     case E_V8SImode:
14420       half_mode = V4SImode;
14421       j = 2;
14422       n = 4;
14423       goto half;
14424 
14425     case E_V4DImode:
14426       half_mode = V2DImode;
14427       j = 3;
14428       n = 2;
14429       goto half;
14430 
14431     case E_V8SFmode:
14432       half_mode = V4SFmode;
14433       j = 4;
14434       n = 4;
14435       goto half;
14436 
14437     case E_V4DFmode:
14438       half_mode = V2DFmode;
14439       j = 5;
14440       n = 2;
14441       goto half;
14442 
14443 half:
14444       /* Compute offset.  */
14445       i = elt / n;
14446       elt %= n;
14447 
14448       gcc_assert (i <= 1);
14449 
14450       /* Extract the half.  */
14451       tmp = gen_reg_rtx (half_mode);
14452       emit_insn (gen_extract[j][i] (tmp, target));
14453 
14454       /* Put val in tmp at elt.  */
14455       ix86_expand_vector_set (false, tmp, val, elt);
14456 
14457       /* Put it back.  */
14458       emit_insn (gen_insert[j][i] (target, target, tmp));
14459       return;
14460 
14461     case E_V8DFmode:
14462       if (TARGET_AVX512F)
14463 	{
14464 	  mmode = QImode;
14465 	  gen_blendm = gen_avx512f_blendmv8df;
14466 	}
14467       break;
14468 
14469     case E_V8DImode:
14470       if (TARGET_AVX512F)
14471 	{
14472 	  mmode = QImode;
14473 	  gen_blendm = gen_avx512f_blendmv8di;
14474 	}
14475       break;
14476 
14477     case E_V16SFmode:
14478       if (TARGET_AVX512F)
14479 	{
14480 	  mmode = HImode;
14481 	  gen_blendm = gen_avx512f_blendmv16sf;
14482 	}
14483       break;
14484 
14485     case E_V16SImode:
14486       if (TARGET_AVX512F)
14487 	{
14488 	  mmode = HImode;
14489 	  gen_blendm = gen_avx512f_blendmv16si;
14490 	}
14491       break;
14492 
14493     case E_V32HImode:
14494       if (TARGET_AVX512BW)
14495 	{
14496 	  mmode = SImode;
14497 	  gen_blendm = gen_avx512bw_blendmv32hi;
14498 	}
14499       else if (TARGET_AVX512F)
14500 	{
14501 	  half_mode = E_V8HImode;
14502 	  n = 8;
14503 	  goto quarter;
14504 	}
14505       break;
14506 
14507     case E_V64QImode:
14508       if (TARGET_AVX512BW)
14509 	{
14510 	  mmode = DImode;
14511 	  gen_blendm = gen_avx512bw_blendmv64qi;
14512 	}
14513       else if (TARGET_AVX512F)
14514 	{
14515 	  half_mode = E_V16QImode;
14516 	  n = 16;
14517 	  goto quarter;
14518 	}
14519       break;
14520 
14521 quarter:
14522       /* Compute offset.  */
14523       i = elt / n;
14524       elt %= n;
14525 
14526       gcc_assert (i <= 3);
14527 
14528       {
14529 	/* Extract the quarter.  */
14530 	tmp = gen_reg_rtx (V4SImode);
14531 	rtx tmp2 = gen_lowpart (V16SImode, target);
14532 	rtx mask = gen_reg_rtx (QImode);
14533 
14534 	emit_move_insn (mask, constm1_rtx);
14535 	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14536 						   tmp, mask));
14537 
14538 	tmp2 = gen_reg_rtx (half_mode);
14539 	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14540 	tmp = tmp2;
14541 
14542 	/* Put val in tmp at elt.  */
14543 	ix86_expand_vector_set (false, tmp, val, elt);
14544 
14545 	/* Put it back.  */
14546 	tmp2 = gen_reg_rtx (V16SImode);
14547 	rtx tmp3 = gen_lowpart (V16SImode, target);
14548 	mask = gen_reg_rtx (HImode);
14549 	emit_move_insn (mask, constm1_rtx);
14550 	tmp = gen_lowpart (V4SImode, tmp);
14551 	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14552 						  tmp3, mask));
14553 	emit_move_insn (target, gen_lowpart (mode, tmp2));
14554       }
14555       return;
14556 
14557     default:
14558       break;
14559     }
14560 
14561   if (mmode != VOIDmode)
14562     {
14563       tmp = gen_reg_rtx (mode);
14564       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14565       /* The avx512*_blendm<mode> expanders have different operand order
14566 	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
14567 	 elements where the mask is set and second input operand otherwise,
14568 	 in {sse,avx}*_*blend* the first input operand is used for elements
14569 	 where the mask is clear and second input operand otherwise.  */
14570       emit_insn (gen_blendm (target, target, tmp,
14571 			     force_reg (mmode,
14572 					gen_int_mode (HOST_WIDE_INT_1U << elt,
14573 						      mmode))));
14574     }
14575   else if (use_vec_merge)
14576     {
14577 do_vec_merge:
14578       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14579       tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14580 			       GEN_INT (HOST_WIDE_INT_1U << elt));
14581       emit_insn (gen_rtx_SET (target, tmp));
14582     }
14583   else
14584     {
14585       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14586 
14587       emit_move_insn (mem, target);
14588 
14589       tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14590       emit_move_insn (tmp, val);
14591 
14592       emit_move_insn (target, mem);
14593     }
14594 }
14595 
14596 void
14597 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14598 {
14599   machine_mode mode = GET_MODE (vec);
14600   machine_mode inner_mode = GET_MODE_INNER (mode);
14601   bool use_vec_extr = false;
14602   rtx tmp;
14603 
14604   switch (mode)
14605     {
14606     case E_V2SImode:
14607       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14608       if (use_vec_extr)
14609 	break;
14610       /* FALLTHRU */
14611 
14612     case E_V2SFmode:
14613       if (!mmx_ok)
14614 	break;
14615       /* FALLTHRU */
14616 
14617     case E_V2DFmode:
14618     case E_V2DImode:
14619     case E_V2TImode:
14620     case E_V4TImode:
14621       use_vec_extr = true;
14622       break;
14623 
14624     case E_V4SFmode:
14625       use_vec_extr = TARGET_SSE4_1;
14626       if (use_vec_extr)
14627 	break;
14628 
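      /* Without SSE4.1, shuffle the requested element into lane 0 and
	 let the generic VEC_SELECT extraction below pick it up.  */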
14629       switch (elt)
14630 	{
14631 	case 0:
14632 	  tmp = vec;
14633 	  break;
14634 
14635 	case 1:
14636 	case 3:
14637 	  tmp = gen_reg_rtx (mode);
14638 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14639 				       GEN_INT (elt), GEN_INT (elt),
14640 				       GEN_INT (elt+4), GEN_INT (elt+4)));
14641 	  break;
14642 
14643 	case 2:
14644 	  tmp = gen_reg_rtx (mode);
14645 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14646 	  break;
14647 
14648 	default:
14649 	  gcc_unreachable ();
14650 	}
14651       vec = tmp;
14652       use_vec_extr = true;
14653       elt = 0;
14654       break;
14655 
14656     case E_V4SImode:
14657       use_vec_extr = TARGET_SSE4_1;
14658       if (use_vec_extr)
14659 	break;
14660 
14661       if (TARGET_SSE2)
14662 	{
14663 	  switch (elt)
14664 	    {
14665 	    case 0:
14666 	      tmp = vec;
14667 	      break;
14668 
14669 	    case 1:
14670 	    case 3:
14671 	      tmp = gen_reg_rtx (mode);
14672 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14673 					    GEN_INT (elt), GEN_INT (elt),
14674 					    GEN_INT (elt), GEN_INT (elt)));
14675 	      break;
14676 
14677 	    case 2:
14678 	      tmp = gen_reg_rtx (mode);
14679 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14680 	      break;
14681 
14682 	    default:
14683 	      gcc_unreachable ();
14684 	    }
14685 	  vec = tmp;
14686 	  use_vec_extr = true;
14687 	  elt = 0;
14688 	}
14689       else
14690 	{
14691 	  /* For SSE1, we have to reuse the V4SF code.  */
14692 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14693 				      gen_lowpart (V4SFmode, vec), elt);
14694 	  return;
14695 	}
14696       break;
14697 
14698     case E_V8HImode:
14699       use_vec_extr = TARGET_SSE2;
14700       break;
14701     case E_V4HImode:
14702       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14703       break;
14704 
14705     case E_V16QImode:
14706       use_vec_extr = TARGET_SSE4_1;
14707       if (!use_vec_extr
14708 	  && TARGET_SSE2
14709 	  && elt == 0
14710 	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14711 	{
14712 	  tmp = gen_reg_rtx (SImode);
14713 	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14714 				      0);
14715 	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14716 	  return;
14717 	}
14718       break;
14719 
14720     case E_V8SFmode:
14721       if (TARGET_AVX)
14722 	{
14723 	  tmp = gen_reg_rtx (V4SFmode);
14724 	  if (elt < 4)
14725 	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14726 	  else
14727 	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14728 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
14729 	  return;
14730 	}
14731       break;
14732 
14733     case E_V4DFmode:
14734       if (TARGET_AVX)
14735 	{
14736 	  tmp = gen_reg_rtx (V2DFmode);
14737 	  if (elt < 2)
14738 	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14739 	  else
14740 	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14741 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
14742 	  return;
14743 	}
14744       break;
14745 
14746     case E_V32QImode:
14747       if (TARGET_AVX)
14748 	{
14749 	  tmp = gen_reg_rtx (V16QImode);
14750 	  if (elt < 16)
14751 	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14752 	  else
14753 	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14754 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
14755 	  return;
14756 	}
14757       break;
14758 
14759     case E_V16HImode:
14760       if (TARGET_AVX)
14761 	{
14762 	  tmp = gen_reg_rtx (V8HImode);
14763 	  if (elt < 8)
14764 	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14765 	  else
14766 	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14767 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
14768 	  return;
14769 	}
14770       break;
14771 
14772     case E_V8SImode:
14773       if (TARGET_AVX)
14774 	{
14775 	  tmp = gen_reg_rtx (V4SImode);
14776 	  if (elt < 4)
14777 	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14778 	  else
14779 	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14780 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
14781 	  return;
14782 	}
14783       break;
14784 
14785     case E_V4DImode:
14786       if (TARGET_AVX)
14787 	{
14788 	  tmp = gen_reg_rtx (V2DImode);
14789 	  if (elt < 2)
14790 	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14791 	  else
14792 	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14793 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
14794 	  return;
14795 	}
14796       break;
14797 
14798     case E_V32HImode:
14799       if (TARGET_AVX512BW)
14800 	{
14801 	  tmp = gen_reg_rtx (V16HImode);
14802 	  if (elt < 16)
14803 	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14804 	  else
14805 	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14806 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
14807 	  return;
14808 	}
14809       break;
14810 
14811     case E_V64QImode:
14812       if (TARGET_AVX512BW)
14813 	{
14814 	  tmp = gen_reg_rtx (V32QImode);
14815 	  if (elt < 32)
14816 	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14817 	  else
14818 	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14819 	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
14820 	  return;
14821 	}
14822       break;
14823 
14824     case E_V16SFmode:
14825       tmp = gen_reg_rtx (V8SFmode);
14826       if (elt < 8)
14827 	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14828       else
14829 	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14830       ix86_expand_vector_extract (false, target, tmp, elt & 7);
14831       return;
14832 
14833     case E_V8DFmode:
14834       tmp = gen_reg_rtx (V4DFmode);
14835       if (elt < 4)
14836 	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14837       else
14838 	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14839       ix86_expand_vector_extract (false, target, tmp, elt & 3);
14840       return;
14841 
14842     case E_V16SImode:
14843       tmp = gen_reg_rtx (V8SImode);
14844       if (elt < 8)
14845 	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14846       else
14847 	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14848       ix86_expand_vector_extract (false, target, tmp, elt & 7);
14849       return;
14850 
14851     case E_V8DImode:
14852       tmp = gen_reg_rtx (V4DImode);
14853       if (elt < 4)
14854 	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14855       else
14856 	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14857       ix86_expand_vector_extract (false, target, tmp, elt & 3);
14858       return;
14859 
14860     case E_V8QImode:
14861       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14862       /* ??? Could extract the appropriate HImode element and shift.  */
14863       break;
14864 
14865     default:
14866       break;
14867     }
14868 
14869   if (use_vec_extr)
14870     {
14871       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14872       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14873 
14874       /* Let the rtl optimizers know about the zero extension performed.  */
14875       if (inner_mode == QImode || inner_mode == HImode)
14876 	{
14877 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14878 	  target = gen_lowpart (SImode, target);
14879 	}
14880 
14881       emit_insn (gen_rtx_SET (target, tmp));
14882     }
14883   else
14884     {
14885       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14886 
14887       emit_move_insn (mem, vec);
14888 
14889       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14890       emit_move_insn (target, tmp);
14891     }
14892 }
14893 
14894 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14895    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14896    The upper bits of DEST are undefined, though they shouldn't cause
14897    exceptions (some bits from src or all zeros are ok).  */
14898 
14899 static void
14900 emit_reduc_half (rtx dest, rtx src, int i)
14901 {
14902   rtx tem, d = dest;
14903   switch (GET_MODE (src))
14904     {
14905     case E_V4SFmode:
14906       if (i == 128)
14907 	tem = gen_sse_movhlps (dest, src, src);
14908       else
14909 	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14910 				   GEN_INT (1 + 4), GEN_INT (1 + 4));
14911       break;
14912     case E_V2DFmode:
14913       tem = gen_vec_interleave_highv2df (dest, src, src);
14914       break;
14915     case E_V16QImode:
14916     case E_V8HImode:
14917     case E_V4SImode:
14918     case E_V2DImode:
14919       d = gen_reg_rtx (V1TImode);
14920       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14921 				GEN_INT (i / 2));
14922       break;
14923     case E_V8SFmode:
14924       if (i == 256)
14925 	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14926       else
14927 	tem = gen_avx_shufps256 (dest, src, src,
14928 				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14929       break;
14930     case E_V4DFmode:
14931       if (i == 256)
14932 	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14933       else
14934 	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14935       break;
14936     case E_V32QImode:
14937     case E_V16HImode:
14938     case E_V8SImode:
14939     case E_V4DImode:
14940       if (i == 256)
14941 	{
14942 	  if (GET_MODE (dest) != V4DImode)
14943 	    d = gen_reg_rtx (V4DImode);
14944 	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14945 				   gen_lowpart (V4DImode, src),
14946 				   const1_rtx);
14947 	}
14948       else
14949 	{
14950 	  d = gen_reg_rtx (V2TImode);
14951 	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14952 				    GEN_INT (i / 2));
14953 	}
14954       break;
14955     case E_V64QImode:
14956     case E_V32HImode:
14957       if (i < 64)
14958 	{
14959 	  d = gen_reg_rtx (V4TImode);
14960 	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
14961 					GEN_INT (i / 2));
14962 	  break;
14963 	}
14964       /* FALLTHRU */
14965     case E_V16SImode:
14966     case E_V16SFmode:
14967     case E_V8DImode:
14968     case E_V8DFmode:
14969       if (i > 128)
14970 	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14971 					gen_lowpart (V16SImode, src),
14972 					gen_lowpart (V16SImode, src),
14973 					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14974 					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14975 					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14976 					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14977 					GEN_INT (0xC), GEN_INT (0xD),
14978 					GEN_INT (0xE), GEN_INT (0xF),
14979 					GEN_INT (0x10), GEN_INT (0x11),
14980 					GEN_INT (0x12), GEN_INT (0x13),
14981 					GEN_INT (0x14), GEN_INT (0x15),
14982 					GEN_INT (0x16), GEN_INT (0x17));
14983       else
14984 	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14985 				    gen_lowpart (V16SImode, src),
14986 				    GEN_INT (i == 128 ? 0x2 : 0x1),
14987 				    GEN_INT (0x3),
14988 				    GEN_INT (0x3),
14989 				    GEN_INT (0x3),
14990 				    GEN_INT (i == 128 ? 0x6 : 0x5),
14991 				    GEN_INT (0x7),
14992 				    GEN_INT (0x7),
14993 				    GEN_INT (0x7),
14994 				    GEN_INT (i == 128 ? 0xA : 0x9),
14995 				    GEN_INT (0xB),
14996 				    GEN_INT (0xB),
14997 				    GEN_INT (0xB),
14998 				    GEN_INT (i == 128 ? 0xE : 0xD),
14999 				    GEN_INT (0xF),
15000 				    GEN_INT (0xF),
15001 				    GEN_INT (0xF));
15002       break;
15003     default:
15004       gcc_unreachable ();
15005     }
15006   emit_insn (tem);
15007   if (d != dest)
15008     emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15009 }
15010 
15011 /* Expand a vector reduction.  FN is the binary pattern to reduce;
15012    DEST is the destination; IN is the input vector.  */
15013 
15014 void
15015 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15016 {
15017   rtx half, dst, vec = in;
15018   machine_mode mode = GET_MODE (in);
15019   int i;
15020 
15021   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
15022   if (TARGET_SSE4_1
15023       && mode == V8HImode
15024       && fn == gen_uminv8hi3)
15025     {
15026       emit_insn (gen_sse4_1_phminposuw (dest, in));
15027       return;
15028     }
15029 
15030   for (i = GET_MODE_BITSIZE (mode);
15031        i > GET_MODE_UNIT_BITSIZE (mode);
15032        i >>= 1)
15033     {
15034       half = gen_reg_rtx (mode);
15035       emit_reduc_half (half, vec, i);
15036       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15037 	dst = dest;
15038       else
15039 	dst = gen_reg_rtx (mode);
15040       emit_insn (fn (dst, half, vec));
15041       vec = dst;
15042     }
15043 }
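
/* Editorial note, not part of GCC: a scalar model of the halving reduction
   emitted above, for illustration only.  Each step folds the upper half of
   the still-active elements into the lower half with the binary operation,
   so after log2(n) steps element 0 holds the reduction of all n elements.
   `op' and `n' are assumptions of this sketch (n a power of two).  */

static int
reduc_model (int *v, int n, int (*op) (int, int))
{
  for (int width = n; width > 1; width >>= 1)
    for (int i = 0; i < width / 2; i++)
      v[i] = op (v[i], v[i + width / 2]);  /* fold upper half into lower */
  return v[0];                             /* reduction of v[0..n-1] */
}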
15044 
15045 /* Output code to perform a conditional jump to LABEL if the C2 flag in
15046    the FP status register is set.  */
15047 
15048 void
15049 ix86_emit_fp_unordered_jump (rtx label)
15050 {
15051   rtx reg = gen_reg_rtx (HImode);
15052   rtx_insn *insn;
15053   rtx temp;
15054 
15055   emit_insn (gen_x86_fnstsw_1 (reg));
15056 
15057   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15058     {
15059       emit_insn (gen_x86_sahf_1 (reg));
15060 
15061       temp = gen_rtx_REG (CCmode, FLAGS_REG);
15062       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15063     }
15064   else
15065     {
15066       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15067 
15068       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15069       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15070     }
15071 
15072   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15073 			      gen_rtx_LABEL_REF (VOIDmode, label),
15074 			      pc_rtx);
15075   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15076   predict_jump (REG_BR_PROB_BASE * 10 / 100);
15077   JUMP_LABEL (insn) = label;
15078 }
15079 
15080 /* Output code to perform a sinh XFmode calculation.  */
15081 
15082 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15083 {
15084   rtx e1 = gen_reg_rtx (XFmode);
15085   rtx e2 = gen_reg_rtx (XFmode);
15086   rtx scratch = gen_reg_rtx (HImode);
15087   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15088   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15089   rtx cst1, tmp;
15090   rtx_code_label *jump_label = gen_label_rtx ();
15091   rtx_insn *insn;
15092 
15093   /* scratch = fxam (op1) */
15094   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15095 
15096   /* e1 = expm1 (|op1|) */
15097   emit_insn (gen_absxf2 (e2, op1));
15098   emit_insn (gen_expm1xf2 (e1, e2));
15099 
15100   /* e2 = e1 / (e1 + 1.0) + e1 */
15101   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15102   emit_insn (gen_addxf3 (e2, e1, cst1));
15103   emit_insn (gen_divxf3 (e2, e1, e2));
15104   emit_insn (gen_addxf3 (e2, e2, e1));
15105 
15106   /* flags = signbit (op1) */
15107   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15108 
15109   /* if (flags) then e2 = -e2 */
15110   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15111 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15112 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15113 			      pc_rtx);
15114   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15115   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15116   JUMP_LABEL (insn) = jump_label;
15117 
15118   emit_insn (gen_negxf2 (e2, e2));
15119 
15120   emit_label (jump_label);
15121   LABEL_NUSES (jump_label) = 1;
15122 
15123   /* op0 = 0.5 * e2 */
15124   half = force_reg (XFmode, half);
15125   emit_insn (gen_mulxf3 (op0, e2, half));
15126 }
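
/* Editorial note, not part of GCC: a scalar C sketch of the identity the
   expander above emits, assuming <math.h> (expm1, fabs, copysign).  With
   e = expm1 (|x|), e / (e + 1) + e == exp (|x|) - exp (-|x|), so half of
   that is sinh (|x|); the fxam/jump sequence above restores the sign.  */

static double
sinh_model (double x)
{
  double e = expm1 (fabs (x));           /* e1 = expm1 (|op1|) */
  double r = 0.5 * (e / (e + 1.0) + e);  /* 0.5 * (exp (|x|) - exp (-|x|)) */
  return copysign (r, x);                /* negate when op1 is negative */
}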
15127 
15128 /* Output code to perform a cosh XFmode calculation.  */
15129 
15130 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15131 {
15132   rtx e1 = gen_reg_rtx (XFmode);
15133   rtx e2 = gen_reg_rtx (XFmode);
15134   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15135   rtx cst1;
15136 
15137   /* e1 = exp (op1) */
15138   emit_insn (gen_expxf2 (e1, op1));
15139 
15140   /* e2 = e1 + 1.0 / e1 */
15141   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15142   emit_insn (gen_divxf3 (e2, cst1, e1));
15143   emit_insn (gen_addxf3 (e2, e1, e2));
15144 
15145   /* op0 = 0.5 * e2 */
15146   half = force_reg (XFmode, half);
15147   emit_insn (gen_mulxf3 (op0, e2, half));
15148 }
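
/* Editorial note, not part of GCC: the scalar identity used above, assuming
   <math.h>.  cosh (x) = 0.5 * (exp (x) + 1 / exp (x)); no sign handling is
   needed because cosh is even.  */

static double
cosh_model (double x)
{
  double e = exp (x);            /* e1 = exp (op1) */
  return 0.5 * (e + 1.0 / e);    /* op0 = 0.5 * (e1 + 1.0 / e1) */
}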
15149 
15150 /* Output code to perform a tanh XFmode calculation.  */
15151 
15152 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15153 {
15154   rtx e1 = gen_reg_rtx (XFmode);
15155   rtx e2 = gen_reg_rtx (XFmode);
15156   rtx scratch = gen_reg_rtx (HImode);
15157   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15158   rtx cst2, tmp;
15159   rtx_code_label *jump_label = gen_label_rtx ();
15160   rtx_insn *insn;
15161 
15162   /* scratch = fxam (op1) */
15163   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15164 
15165   /* e1 = expm1 (-|2 * op1|) */
15166   emit_insn (gen_addxf3 (e2, op1, op1));
15167   emit_insn (gen_absxf2 (e2, e2));
15168   emit_insn (gen_negxf2 (e2, e2));
15169   emit_insn (gen_expm1xf2 (e1, e2));
15170 
15171   /* e2 = e1 / (e1 + 2.0) */
15172   cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15173   emit_insn (gen_addxf3 (e2, e1, cst2));
15174   emit_insn (gen_divxf3 (e2, e1, e2));
15175 
15176   /* flags = signbit (op1) */
15177   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15178 
15179   /* if (!flags) then e2 = -e2 */
15180   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15181 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
15182 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15183 			      pc_rtx);
15184   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15185   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15186   JUMP_LABEL (insn) = jump_label;
15187 
15188   emit_insn (gen_negxf2 (e2, e2));
15189 
15190   emit_label (jump_label);
15191   LABEL_NUSES (jump_label) = 1;
15192 
15193   emit_move_insn (op0, e2);
15194 }
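
/* Editorial note, not part of GCC: a scalar sketch of the sequence above,
   assuming <math.h>.  With e = expm1 (-|2x|), e / (e + 2) equals
   -tanh (|x|), so the result is negated exactly when x is positive,
   mirroring the fxam sign test above.  */

static double
tanh_model (double x)
{
  double e = expm1 (-fabs (2.0 * x));  /* e1 = expm1 (-|2 * op1|) */
  double t = e / (e + 2.0);            /* e2 = e1 / (e1 + 2.0) == -tanh (|x|) */
  return signbit (x) ? t : -t;         /* negate only for positive inputs */
}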
15195 
15196 /* Output code to perform an asinh XFmode calculation.  */
15197 
15198 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15199 {
15200   rtx e1 = gen_reg_rtx (XFmode);
15201   rtx e2 = gen_reg_rtx (XFmode);
15202   rtx scratch = gen_reg_rtx (HImode);
15203   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15204   rtx cst1, tmp;
15205   rtx_code_label *jump_label = gen_label_rtx ();
15206   rtx_insn *insn;
15207 
15208   /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15209   emit_insn (gen_mulxf3 (e1, op1, op1));
15210   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15211   emit_insn (gen_addxf3 (e2, e1, cst1));
15212   emit_insn (gen_sqrtxf2 (e2, e2));
15213   emit_insn (gen_addxf3 (e2, e2, cst1));
15214 
15215   /* e1 = e1 / e2 */
15216   emit_insn (gen_divxf3 (e1, e1, e2));
15217 
15218   /* scratch = fxam (op1) */
15219   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15220 
15221   /* e1 = e1 + |op1| */
15222   emit_insn (gen_absxf2 (e2, op1));
15223   emit_insn (gen_addxf3 (e1, e1, e2));
15224 
15225   /* e2 = log1p (e1) */
15226   ix86_emit_i387_log1p (e2, e1);
15227 
15228   /* flags = signbit (op1) */
15229   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15230 
15231   /* if (flags) then e2 = -e2 */
15232   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15233 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15234 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15235 			      pc_rtx);
15236   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15237   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15238   JUMP_LABEL (insn) = jump_label;
15239 
15240   emit_insn (gen_negxf2 (e2, e2));
15241 
15242   emit_label (jump_label);
15243   LABEL_NUSES (jump_label) = 1;
15244 
15245   emit_move_insn (op0, e2);
15246 }
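
/* Editorial note, not part of GCC: the scalar identity behind the steps
   above, assuming <math.h>.  Since x*x / (sqrt (x*x + 1) + 1)
   == sqrt (x*x + 1) - 1, the log1p argument is |x| + sqrt (x*x + 1) - 1,
   giving asinh (|x|) without cancellation; the sign is restored last.  */

static double
asinh_model (double x)
{
  double x2 = x * x;                        /* e1 = op1 * op1 */
  double t = x2 / (sqrt (x2 + 1.0) + 1.0);  /* e1 = e1 / e2 */
  double r = log1p (t + fabs (x));          /* e2 = log1p (e1 + |op1|) */
  return copysign (r, x);                   /* negate for negative op1 */
}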
15247 
15248 /* Output code to perform an acosh XFmode calculation.  */
15249 
15250 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15251 {
15252   rtx e1 = gen_reg_rtx (XFmode);
15253   rtx e2 = gen_reg_rtx (XFmode);
15254   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15255 
15256   /* e2 = sqrt (op1 + 1.0) */
15257   emit_insn (gen_addxf3 (e2, op1, cst1));
15258   emit_insn (gen_sqrtxf2 (e2, e2));
15259 
15260   /* e1 = sqrt (op1 - 1.0) */
15261   emit_insn (gen_subxf3 (e1, op1, cst1));
15262   emit_insn (gen_sqrtxf2 (e1, e1));
15263 
15264   /* e1 = e1 * e2 */
15265   emit_insn (gen_mulxf3 (e1, e1, e2));
15266 
15267   /* e1 = e1 + op1 */
15268   emit_insn (gen_addxf3 (e1, e1, op1));
15269 
15270   /* op0 = log (e1) */
15271   emit_insn (gen_logxf2 (op0, e1));
15272 }
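
/* Editorial note, not part of GCC: the scalar identity used above,
   assuming <math.h> and x >= 1 (the domain of acosh).
   acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1)).  */

static double
acosh_model (double x)
{
  double t = sqrt (x - 1.0) * sqrt (x + 1.0);  /* e1 = e1 * e2 */
  return log (x + t);                          /* op0 = log (e1 + op1) */
}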
15273 
15274 /* Output code to perform an atanh XFmode calculation.  */
15275 
15276 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15277 {
15278   rtx e1 = gen_reg_rtx (XFmode);
15279   rtx e2 = gen_reg_rtx (XFmode);
15280   rtx scratch = gen_reg_rtx (HImode);
15281   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15282   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15283   rtx cst1, tmp;
15284   rtx_code_label *jump_label = gen_label_rtx ();
15285   rtx_insn *insn;
15286 
15287   /* scratch = fxam (op1) */
15288   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15289 
15290   /* e2 = |op1| */
15291   emit_insn (gen_absxf2 (e2, op1));
15292 
15293   /* e1 = -(e2 + e2) / (e2 + 1.0) */
15294   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15295   emit_insn (gen_addxf3 (e1, e2, cst1));
15296   emit_insn (gen_addxf3 (e2, e2, e2));
15297   emit_insn (gen_negxf2 (e2, e2));
15298   emit_insn (gen_divxf3 (e1, e2, e1));
15299 
15300   /* e2 = log1p (e1) */
15301   ix86_emit_i387_log1p (e2, e1);
15302 
15303   /* flags = signbit (op1) */
15304   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15305 
15306   /* if (!flags) then e2 = -e2 */
15307   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15308 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
15309 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15310 			      pc_rtx);
15311   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15312   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15313   JUMP_LABEL (insn) = jump_label;
15314 
15315   emit_insn (gen_negxf2 (e2, e2));
15316 
15317   emit_label (jump_label);
15318   LABEL_NUSES (jump_label) = 1;
15319 
15320   /* op0 = 0.5 * e2 */
15321   half = force_reg (XFmode, half);
15322   emit_insn (gen_mulxf3 (op0, e2, half));
15323 }
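
/* Editorial note, not part of GCC: a scalar sketch of the sequence above,
   assuming <math.h>.  With a = |x|, log1p (-2a / (a + 1))
   == log ((1 - a) / (1 + a)) == -2 * atanh (a), so halving it and negating
   for positive inputs yields atanh (x).  */

static double
atanh_model (double x)
{
  double a = fabs (x);                            /* e2 = |op1| */
  double r = 0.5 * log1p (-(a + a) / (a + 1.0));  /* 0.5 * log1p (e1) */
  return signbit (x) ? r : -r;                    /* negate only for x > 0 */
}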
15324 
15325 /* Output code to perform a log1p XFmode calculation.  */
15326 
15327 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15328 {
15329   rtx_code_label *label1 = gen_label_rtx ();
15330   rtx_code_label *label2 = gen_label_rtx ();
15331 
15332   rtx tmp = gen_reg_rtx (XFmode);
15333   rtx res = gen_reg_rtx (XFmode);
15334   rtx cst, cstln2, cst1;
15335   rtx_insn *insn;
15336 
15337   /* The emit_jump call emits pending stack adjust, make sure it is emitted
15338      before the conditional jump, otherwise the stack adjustment will be
15339      only conditional.  */
15340   do_pending_stack_adjust ();
15341 
15342   cst = const_double_from_real_value
15343     (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15344   cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15345 
15346   emit_insn (gen_absxf2 (tmp, op1));
15347 
15348   cst = force_reg (XFmode, cst);
15349   ix86_expand_branch (GE, tmp, cst, label1);
15350   predict_jump (REG_BR_PROB_BASE * 10 / 100);
15351   insn = get_last_insn ();
15352   JUMP_LABEL (insn) = label1;
15353 
15354   emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15355   emit_jump (label2);
15356 
15357   emit_label (label1);
15358   LABEL_NUSES (label1) = 1;
15359 
15360   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15361   emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15362   emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15363 
15364   emit_label (label2);
15365   LABEL_NUSES (label2) = 1;
15366 
15367   emit_move_insn (op0, res);
15368 }
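
/* Editorial note, not part of GCC: the branch structure above in scalar
   form, assuming <math.h>.  fyl2xp1 is only specified for |x| below
   1 - sqrt(2)/2 (the constant compared against above), so larger inputs
   take the fyl2x path on the rounded 1 + x; both results are scaled by
   ln (2), the fldln2 constant.  The two function parameters are
   hypothetical stand-ins for the fyl2xp1 and fyl2x base-2 logarithms,
   used here only to make the control flow visible.  */

static double
log1p_model (double x, double (*log2_1p) (double), double (*log2_x) (double))
{
  const double ln2 = 0.69314718055994530942;  /* fldln2 */
  if (fabs (x) < 0.29289321881345247561)      /* 1 - sqrt(2)/2 */
    return ln2 * log2_1p (x);                 /* fyl2xp1: log2 (1 + x) */
  return ln2 * log2_x (1.0 + x);              /* fyl2x on (op1 + 1.0) */
}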
15369 
15370 /* Emit code for round calculation.  */
15371 void ix86_emit_i387_round (rtx op0, rtx op1)
15372 {
15373   machine_mode inmode = GET_MODE (op1);
15374   machine_mode outmode = GET_MODE (op0);
15375   rtx e1 = gen_reg_rtx (XFmode);
15376   rtx e2 = gen_reg_rtx (XFmode);
15377   rtx scratch = gen_reg_rtx (HImode);
15378   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15379   rtx half = const_double_from_real_value (dconsthalf, XFmode);
15380   rtx res = gen_reg_rtx (outmode);
15381   rtx_code_label *jump_label = gen_label_rtx ();
15382   rtx (*floor_insn) (rtx, rtx);
15383   rtx (*neg_insn) (rtx, rtx);
15384   rtx_insn *insn;
15385   rtx tmp;
15386 
15387   switch (inmode)
15388     {
15389     case E_SFmode:
15390     case E_DFmode:
15391       tmp = gen_reg_rtx (XFmode);
15392 
15393       emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15394       op1 = tmp;
15395       break;
15396     case E_XFmode:
15397       break;
15398     default:
15399       gcc_unreachable ();
15400     }
15401 
15402   switch (outmode)
15403     {
15404     case E_SFmode:
15405       floor_insn = gen_frndintxf2_floor;
15406       neg_insn = gen_negsf2;
15407       break;
15408     case E_DFmode:
15409       floor_insn = gen_frndintxf2_floor;
15410       neg_insn = gen_negdf2;
15411       break;
15412     case E_XFmode:
15413       floor_insn = gen_frndintxf2_floor;
15414       neg_insn = gen_negxf2;
15415       break;
15416     case E_HImode:
15417       floor_insn = gen_lfloorxfhi2;
15418       neg_insn = gen_neghi2;
15419       break;
15420     case E_SImode:
15421       floor_insn = gen_lfloorxfsi2;
15422       neg_insn = gen_negsi2;
15423       break;
15424     case E_DImode:
15425       floor_insn = gen_lfloorxfdi2;
15426       neg_insn = gen_negdi2;
15427       break;
15428     default:
15429       gcc_unreachable ();
15430     }
15431 
15432   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15433 
15434   /* scratch = fxam(op1) */
15435   emit_insn (gen_fxamxf2_i387 (scratch, op1));
15436 
15437   /* e1 = fabs(op1) */
15438   emit_insn (gen_absxf2 (e1, op1));
15439 
15440   /* e2 = e1 + 0.5 */
15441   half = force_reg (XFmode, half);
15442   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15443 
15444   /* res = floor(e2) */
15445   switch (outmode)
15446     {
15447     case E_SFmode:
15448     case E_DFmode:
15449       {
15450 	tmp = gen_reg_rtx (XFmode);
15451 
15452 	emit_insn (floor_insn (tmp, e2));
15453 	emit_insn (gen_rtx_SET (res,
15454 				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15455 						UNSPEC_TRUNC_NOOP)));
15456       }
15457       break;
15458     default:
15459       emit_insn (floor_insn (res, e2));
15460     }
15461 
15462   /* flags = signbit(a) */
15463   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15464 
15465   /* if (flags) then res = -res */
15466   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15467 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15468 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
15469 			      pc_rtx);
15470   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15471   predict_jump (REG_BR_PROB_BASE * 50 / 100);
15472   JUMP_LABEL (insn) = jump_label;
15473 
15474   emit_insn (neg_insn (res, res));
15475 
15476   emit_label (jump_label);
15477   LABEL_NUSES (jump_label) = 1;
15478 
15479   emit_move_insn (op0, res);
15480 }
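
/* Editorial note, not part of GCC: the scalar formula implemented above,
   assuming <math.h>.  round (a) = sgn (a) * floor (|a| + 0.5); the fxam
   sign bit decides whether the floored magnitude is negated at the end.  */

static double
i387_round_model (double x)
{
  double r = floor (fabs (x) + 0.5);  /* res = floor (e1 + 0.5) */
  return signbit (x) ? -r : r;        /* if (flags) then res = -res */
}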
15481 
15482 /* Output code to perform a Newton-Raphson approximation of a single precision
15483    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
15484 
15485 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15486 {
15487   rtx x0, x1, e0, e1;
15488 
15489   x0 = gen_reg_rtx (mode);
15490   e0 = gen_reg_rtx (mode);
15491   e1 = gen_reg_rtx (mode);
15492   x1 = gen_reg_rtx (mode);
15493 
15494   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15495 
15496   b = force_reg (mode, b);
15497 
15498   /* x0 = rcp(b) estimate */
15499   if (mode == V16SFmode || mode == V8DFmode)
15500     {
15501       if (TARGET_AVX512ER)
15502 	{
15503 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15504 						      UNSPEC_RCP28)));
15505 	  /* res = a * x0 */
15506 	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15507 	  return;
15508 	}
15509       else
15510 	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15511 						    UNSPEC_RCP14)));
15512     }
15513   else
15514     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15515 						UNSPEC_RCP)));
15516 
15517   /* e0 = x0 * b */
15518   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15519 
15520   /* e0 = x0 * e0 */
15521   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15522 
15523   /* e1 = x0 + x0 */
15524   emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15525 
15526   /* x1 = e1 - e0 */
15527   emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15528 
15529   /* res = a * x1 */
15530   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15531 }
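
/* Editorial note, not part of GCC: a scalar sketch of the refinement above,
   assuming the hardware reciprocal estimate is passed in as RCP_B (a
   parameter of this sketch only).  One Newton-Raphson step,
   x1 = 2*x0 - b*x0*x0, roughly doubles the number of correct bits of the
   estimate before the final multiply by A.  */

static float
swdiv_model (float a, float b, float rcp_b)
{
  float x0 = rcp_b;           /* x0 = rcp (b) estimate */
  float e0 = x0 * b;          /* e0 = x0 * b */
  e0 = x0 * e0;               /* e0 = b * x0 * x0 */
  float x1 = (x0 + x0) - e0;  /* x1 = e1 - e0 = 2*x0 - b*x0*x0 */
  return a * x1;              /* res = a * x1 ~= a / b */
}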
15532 
15533 /* Output code to perform a Newton-Raphson approximation of a
15534    single precision floating point [reciprocal] square root.  */
15535 
15536 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15537 {
15538   rtx x0, e0, e1, e2, e3, mthree, mhalf;
15539   REAL_VALUE_TYPE r;
15540   int unspec;
15541 
15542   x0 = gen_reg_rtx (mode);
15543   e0 = gen_reg_rtx (mode);
15544   e1 = gen_reg_rtx (mode);
15545   e2 = gen_reg_rtx (mode);
15546   e3 = gen_reg_rtx (mode);
15547 
15548   if (TARGET_AVX512ER && mode == V16SFmode)
15549     {
15550       if (recip)
15551 	/* res = rsqrt28(a) estimate */
15552 	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15553 						     UNSPEC_RSQRT28)));
15554       else
15555 	{
15556 	  /* x0 = rsqrt28(a) estimate */
15557 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15558 						      UNSPEC_RSQRT28)));
15559 	  /* res = rcp28(x0) estimate */
15560 	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15561 						       UNSPEC_RCP28)));
15562 	}
15563       return;
15564     }
15565 
15566   real_from_integer (&r, VOIDmode, -3, SIGNED);
15567   mthree = const_double_from_real_value (r, SFmode);
15568 
15569   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15570   mhalf = const_double_from_real_value (r, SFmode);
15571   unspec = UNSPEC_RSQRT;
15572 
15573   if (VECTOR_MODE_P (mode))
15574     {
15575       mthree = ix86_build_const_vector (mode, true, mthree);
15576       mhalf = ix86_build_const_vector (mode, true, mhalf);
15577       /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
15578       if (GET_MODE_SIZE (mode) == 64)
15579 	unspec = UNSPEC_RSQRT14;
15580     }
15581 
15582   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15583      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15584 
15585   a = force_reg (mode, a);
15586 
15587   /* x0 = rsqrt(a) estimate */
15588   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15589 					      unspec)));
15590 
15591   /* If (a == 0.0), filter out infinity to prevent NaN for sqrt(0.0).  */
15592   if (!recip)
15593     {
15594       rtx zero = force_reg (mode, CONST0_RTX(mode));
15595       rtx mask;
15596 
15597       /* Handle masked compare.  */
15598       if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15599 	{
15600 	  mask = gen_reg_rtx (HImode);
15601 	  /* Imm value 0x4 corresponds to not-equal comparison.  */
15602 	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15603 	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15604 	}
15605       else
15606 	{
15607 	  mask = gen_reg_rtx (mode);
15608 	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15609 	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15610 	}
15611     }
15612 
15613   /* e0 = x0 * a */
15614   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15615   /* e1 = e0 * x0 */
15616   emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15617 
15618   /* e2 = e1 - 3. */
15619   mthree = force_reg (mode, mthree);
15620   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15621 
15622   mhalf = force_reg (mode, mhalf);
15623   if (recip)
15624     /* e3 = -.5 * x0 */
15625     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15626   else
15627     /* e3 = -.5 * e0 */
15628     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15629   /* ret = e2 * e3 */
15630   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15631 }
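
/* Editorial note, not part of GCC: a scalar sketch of the Newton-Raphson
   step above.  R stands for the hardware rsqrt estimate of 1/sqrt(a) (a
   parameter of this sketch only).  The zero-input masking done above for
   the sqrt case is omitted here.  */

static float
swsqrt_model (float a, float r, int recip)
{
  float e0 = r * a;                     /* e0 = x0 * a */
  float e1 = e0 * r;                    /* e1 = a * x0 * x0 */
  float e2 = e1 - 3.0f;                 /* e2 = e1 - 3 */
  float e3 = -0.5f * (recip ? r : e0);  /* -0.5*x0 (rsqrt) or -0.5*e0 (sqrt) */
  return e2 * e3;                       /* sqrt (a) or rsqrt (a) estimate */
}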
15632 
15633 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
15634    mask for masking out the sign-bit is stored in *SMASK, if that is
15635    non-null.  */
15636 
15637 static rtx
15638 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15639 {
15640   machine_mode vmode, mode = GET_MODE (op0);
15641   rtx xa, mask;
15642 
15643   xa = gen_reg_rtx (mode);
15644   if (mode == SFmode)
15645     vmode = V4SFmode;
15646   else if (mode == DFmode)
15647     vmode = V2DFmode;
15648   else
15649     vmode = mode;
15650   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15651   if (!VECTOR_MODE_P (mode))
15652     {
15653       /* We need to generate a scalar mode mask in this case.  */
15654       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15655       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15656       mask = gen_reg_rtx (mode);
15657       emit_insn (gen_rtx_SET (mask, tmp));
15658     }
15659   emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15660 
15661   if (smask)
15662     *smask = mask;
15663 
15664   return xa;
15665 }
15666 
15667 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15668    swapping the operands if SWAP_OPERANDS is true.  The expanded
15669    code is a forward jump to a newly created label in case the
15670    comparison is true.  The generated label rtx is returned.  */
15671 static rtx_code_label *
15672 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15673                                   bool swap_operands)
15674 {
15675   bool unordered_compare = ix86_unordered_fp_compare (code);
15676   rtx_code_label *label;
15677   rtx tmp, reg;
15678 
15679   if (swap_operands)
15680     std::swap (op0, op1);
15681 
15682   label = gen_label_rtx ();
15683   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15684   if (unordered_compare)
15685     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15686   reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15687   emit_insn (gen_rtx_SET (reg, tmp));
15688   tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15689   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15690 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15691   tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15692   JUMP_LABEL (tmp) = label;
15693 
15694   return label;
15695 }
15696 
15697 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15698    using comparison code CODE.  Operands are swapped for the comparison if
15699    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
15700 static rtx
15701 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15702 			      bool swap_operands)
15703 {
15704   rtx (*insn)(rtx, rtx, rtx, rtx);
15705   machine_mode mode = GET_MODE (op0);
15706   rtx mask = gen_reg_rtx (mode);
15707 
15708   if (swap_operands)
15709     std::swap (op0, op1);
15710 
15711   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15712 
15713   emit_insn (insn (mask, op0, op1,
15714 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
15715   return mask;
15716 }
15717 
15718 /* Expand copysign from SIGN to the positive value ABS_VALUE
15719    storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
15720    the sign-bit.  */
15721 
15722 static void
15723 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15724 {
15725   machine_mode mode = GET_MODE (sign);
15726   rtx sgn = gen_reg_rtx (mode);
15727   if (mask == NULL_RTX)
15728     {
15729       machine_mode vmode;
15730 
15731       if (mode == SFmode)
15732 	vmode = V4SFmode;
15733       else if (mode == DFmode)
15734 	vmode = V2DFmode;
15735       else
15736 	vmode = mode;
15737 
15738       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15739       if (!VECTOR_MODE_P (mode))
15740 	{
15741 	  /* We need to generate a scalar mode mask in this case.  */
15742 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15743 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15744 	  mask = gen_reg_rtx (mode);
15745 	  emit_insn (gen_rtx_SET (mask, tmp));
15746 	}
15747     }
15748   else
15749     mask = gen_rtx_NOT (mode, mask);
15750   emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15751   emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15752 }
15753 
15754 /* Expand SSE sequence for computing lround from OP1 storing
15755    into OP0.  */
15756 
15757 void
15758 ix86_expand_lround (rtx op0, rtx op1)
15759 {
15760   /* C code for the stuff we're doing below:
15761        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15762        return (long)tmp;
15763    */
15764   machine_mode mode = GET_MODE (op1);
15765   const struct real_format *fmt;
15766   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15767   rtx adj;
15768 
15769   /* load nextafter (0.5, 0.0) */
15770   fmt = REAL_MODE_FORMAT (mode);
15771   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15772   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15773 
15774   /* adj = copysign (0.5, op1) */
15775   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15776   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15777 
15778   /* adj = op1 + adj */
15779   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15780 
15781   /* op0 = (imode)adj */
15782   expand_fix (op0, adj, 0);
15783 }
15784 
15785 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
15786    into OPERAND0.  */
15787 
15788 void
15789 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15790 {
15791   /* C code for the stuff we're doing below (for do_floor):
15792 	xi = (long)op1;
15793         xi -= (double)xi > op1 ? 1 : 0;
15794         return xi;
15795    */
15796   machine_mode fmode = GET_MODE (op1);
15797   machine_mode imode = GET_MODE (op0);
15798   rtx ireg, freg, tmp;
15799   rtx_code_label *label;
15800 
15801   /* reg = (long)op1 */
15802   ireg = gen_reg_rtx (imode);
15803   expand_fix (ireg, op1, 0);
15804 
15805   /* freg = (double)reg */
15806   freg = gen_reg_rtx (fmode);
15807   expand_float (freg, ireg, 0);
15808 
15809   /* ireg = (freg > op1) ? ireg - 1 : ireg */
15810   label = ix86_expand_sse_compare_and_jump (UNLE,
15811 					    freg, op1, !do_floor);
15812   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15813 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15814   emit_move_insn (ireg, tmp);
15815 
15816   emit_label (label);
15817   LABEL_NUSES (label) = 1;
15818 
15819   emit_move_insn (op0, ireg);
15820 }
15821 
15822 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15823    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
15824 
15825 static rtx
15826 ix86_gen_TWO52 (machine_mode mode)
15827 {
15828   REAL_VALUE_TYPE TWO52r;
15829   rtx TWO52;
15830 
15831   real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15832   TWO52 = const_double_from_real_value (TWO52r, mode);
15833   TWO52 = force_reg (mode, TWO52);
15834 
15835   return TWO52;
15836 }
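
/* Editorial note, not part of GCC: why the "xa + TWO52 - TWO52" sequences
   below round to an integer.  For 0 <= xa < 2**52, the sum xa + 2**52 lies
   in [2**52, 2**53), where a double has no fraction bits, so the addition
   itself rounds xa to an integer in the current rounding mode and the
   subtraction then recovers that integer exactly (2**23 plays the same
   role for SFmode).  A minimal sketch:  */

static double
round_via_two52 (double xa)  /* assumes 0 <= xa < 2**52 */
{
  const double two52 = 4503599627370496.0;  /* 2**52 */
  volatile double t = xa + two52;           /* rounds xa to an integral value */
  return t - two52;                         /* exact subtraction */
}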
15837 
15838 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
15839 
15840 void
15841 ix86_expand_rint (rtx operand0, rtx operand1)
15842 {
15843   /* C code for the stuff we're doing below:
15844 	xa = fabs (operand1);
15845         if (!isless (xa, 2**52))
15846 	  return operand1;
15847         two52 = 2**52;
15848         if (flag_rounding_math)
15849 	  {
15850 	    two52 = copysign (two52, operand1);
15851 	    xa = operand1;
15852 	  }
15853         xa = xa + two52 - two52;
15854         return copysign (xa, operand1);
15855    */
15856   machine_mode mode = GET_MODE (operand0);
15857   rtx res, xa, TWO52, mask;
15858   rtx_code_label *label;
15859 
15860   res = gen_reg_rtx (mode);
15861   emit_move_insn (res, operand1);
15862 
15863   /* xa = abs (operand1) */
15864   xa = ix86_expand_sse_fabs (res, &mask);
15865 
15866   /* if (!isless (xa, TWO52)) goto label; */
15867   TWO52 = ix86_gen_TWO52 (mode);
15868   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15869 
15870   if (flag_rounding_math)
15871     {
15872       ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
15873       xa = res;
15874     }
15875 
15876   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15877   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15878 
15879   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
15880   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
15881     xa = ix86_expand_sse_fabs (xa, NULL);
15882 
15883   ix86_sse_copysign_to_positive (res, xa, res, mask);
15884 
15885   emit_label (label);
15886   LABEL_NUSES (label) = 1;
15887 
15888   emit_move_insn (operand0, res);
15889 }
15890 
15891 /* Expand SSE2 sequence for computing floor or ceil
15892    from OPERAND1 storing into OPERAND0.  */
15893 void
15894 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15895 {
15896   /* C code for the stuff we expand below.
15897 	double xa = fabs (x), x2;
15898         if (!isless (xa, TWO52))
15899           return x;
15900 	x2 = (double)(long)x;
15901 
15902      Compensate.  Floor:
15903 	if (x2 > x)
15904 	  x2 -= 1;
15905      Compensate.  Ceil:
15906 	if (x2 < x)
15907 	  x2 += 1;
15908 
15909 	if (HONOR_SIGNED_ZEROS (mode))
15910 	  return copysign (x2, x);
15911 	return x2;
15912    */
15913   machine_mode mode = GET_MODE (operand0);
15914   rtx xa, xi, TWO52, tmp, one, res, mask;
15915   rtx_code_label *label;
15916 
15917   TWO52 = ix86_gen_TWO52 (mode);
15918 
15919   /* Temporary for holding the result, initialized to the input
15920      operand to ease control flow.  */
15921   res = gen_reg_rtx (mode);
15922   emit_move_insn (res, operand1);
15923 
15924   /* xa = abs (operand1) */
15925   xa = ix86_expand_sse_fabs (res, &mask);
15926 
15927   /* if (!isless (xa, TWO52)) goto label; */
15928   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15929 
15930   /* xa = (double)(long)x */
15931   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15932   expand_fix (xi, res, 0);
15933   expand_float (xa, xi, 0);
15934 
15935   /* generate 1.0 */
15936   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15937 
15938   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15939   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15940   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15941   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15942 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15943   if (HONOR_SIGNED_ZEROS (mode))
15944     {
15945       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
15946       if (do_floor && flag_rounding_math)
15947 	tmp = ix86_expand_sse_fabs (tmp, NULL);
15948 
15949       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15950     }
15951   emit_move_insn (res, tmp);
15952 
15953   emit_label (label);
15954   LABEL_NUSES (label) = 1;
15955 
15956   emit_move_insn (operand0, res);
15957 }
15958 
15959 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15960    into OPERAND0 without relying on DImode truncation via cvttsd2siq
15961    that is only available on 64bit targets.  */
15962 void
15963 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15964 {
15965   /* C code for the stuff we expand below.
15966         double xa = fabs (x), x2;
15967         if (!isless (xa, TWO52))
15968           return x;
15969         xa = xa + TWO52 - TWO52;
15970         x2 = copysign (xa, x);
15971 
15972      Compensate.  Floor:
15973         if (x2 > x)
15974           x2 -= 1;
15975      Compensate.  Ceil:
15976         if (x2 < x)
15977           x2 += 1;
15978 
15979 	if (HONOR_SIGNED_ZEROS (mode))
15980 	  x2 = copysign (x2, x);
15981 	return x2;
15982    */
15983   machine_mode mode = GET_MODE (operand0);
15984   rtx xa, TWO52, tmp, one, res, mask;
15985   rtx_code_label *label;
15986 
15987   TWO52 = ix86_gen_TWO52 (mode);
15988 
15989   /* Temporary for holding the result, initialized to the input
15990      operand to ease control flow.  */
15991   res = gen_reg_rtx (mode);
15992   emit_move_insn (res, operand1);
15993 
15994   /* xa = abs (operand1) */
15995   xa = ix86_expand_sse_fabs (res, &mask);
15996 
15997   /* if (!isless (xa, TWO52)) goto label; */
15998   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15999 
16000   /* xa = xa + TWO52 - TWO52; */
16001   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16002   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16003 
16004   /* xa = copysign (xa, operand1) */
16005   ix86_sse_copysign_to_positive (xa, xa, res, mask);
16006 
16007   /* generate 1.0 */
16008   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16009 
16010   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16011   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16012   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16013   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16014 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16015   if (HONOR_SIGNED_ZEROS (mode))
16016     {
16017       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16018       if (do_floor && flag_rounding_math)
16019 	tmp = ix86_expand_sse_fabs (tmp, NULL);
16020 
16021       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16022     }
16023   emit_move_insn (res, tmp);
16024 
16025   emit_label (label);
16026   LABEL_NUSES (label) = 1;
16027 
16028   emit_move_insn (operand0, res);
16029 }
16030 
16031 /* Expand SSE sequence for computing trunc
16032    from OPERAND1 storing into OPERAND0.  */
16033 void
16034 ix86_expand_trunc (rtx operand0, rtx operand1)
16035 {
16036   /* C code for SSE variant we expand below.
16037         double xa = fabs (x), x2;
16038         if (!isless (xa, TWO52))
16039           return x;
16040         x2 = (double)(long)x;
16041 	if (HONOR_SIGNED_ZEROS (mode))
16042 	  return copysign (x2, x);
16043 	return x2;
16044    */
16045   machine_mode mode = GET_MODE (operand0);
16046   rtx xa, xi, TWO52, res, mask;
16047   rtx_code_label *label;
16048 
16049   TWO52 = ix86_gen_TWO52 (mode);
16050 
16051   /* Temporary for holding the result, initialized to the input
16052      operand to ease control flow.  */
16053   res = gen_reg_rtx (mode);
16054   emit_move_insn (res, operand1);
16055 
16056   /* xa = abs (operand1) */
16057   xa = ix86_expand_sse_fabs (res, &mask);
16058 
16059   /* if (!isless (xa, TWO52)) goto label; */
16060   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16061 
16062   /* x = (double)(long)x */
16063   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16064   expand_fix (xi, res, 0);
16065   expand_float (res, xi, 0);
16066 
16067   if (HONOR_SIGNED_ZEROS (mode))
16068     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16069 
16070   emit_label (label);
16071   LABEL_NUSES (label) = 1;
16072 
16073   emit_move_insn (operand0, res);
16074 }
16075 
16076 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16077    into OPERAND0 without relying on DImode truncation via cvttsd2siq
16078    that is only available on 64bit targets.  */
16079 void
16080 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16081 {
16082   machine_mode mode = GET_MODE (operand0);
16083   rtx xa, xa2, TWO52, tmp, one, res, mask;
16084   rtx_code_label *label;
16085 
16086   /* C code for SSE variant we expand below.
16087         double xa = fabs (x), x2;
16088         if (!isless (xa, TWO52))
16089           return x;
16090         xa2 = xa + TWO52 - TWO52;
16091      Compensate:
16092         if (xa2 > xa)
16093           xa2 -= 1.0;
16094         x2 = copysign (xa2, x);
16095         return x2;
16096    */
16097 
16098   TWO52 = ix86_gen_TWO52 (mode);
16099 
16100   /* Temporary for holding the result, initialized to the input
16101      operand to ease control flow.  */
16102   res = gen_reg_rtx (mode);
16103   emit_move_insn (res, operand1);
16104 
16105   /* xa = abs (operand1) */
16106   xa = ix86_expand_sse_fabs (res, &mask);
16107 
16108   /* if (!isless (xa, TWO52)) goto label; */
16109   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16110 
16111   /* xa2 = xa + TWO52 - TWO52; */
16112   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16113   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16114 
16115   /* generate 1.0 */
16116   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16117 
16118   /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
16119   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
16120   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16121   tmp = expand_simple_binop (mode, MINUS,
16122 			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16123   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
16124   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16125     tmp = ix86_expand_sse_fabs (tmp, NULL);
16126 
16127   /* res = copysign (xa2, operand1) */
16128   ix86_sse_copysign_to_positive (res, tmp, res, mask);
16129 
16130   emit_label (label);
16131   LABEL_NUSES (label) = 1;
16132 
16133   emit_move_insn (operand0, res);
16134 }
16135 
16136 /* Expand SSE sequence for computing round
16137    from OPERAND1 storing into OPERAND0.  */
16138 void
16139 ix86_expand_round (rtx operand0, rtx operand1)
16140 {
16141   /* C code for the stuff we're doing below:
16142         double xa = fabs (x);
16143         if (!isless (xa, TWO52))
16144           return x;
16145         xa = (double)(long)(xa + nextafter (0.5, 0.0));
16146         return copysign (xa, x);
16147    */
16148   machine_mode mode = GET_MODE (operand0);
16149   rtx res, TWO52, xa, xi, half, mask;
16150   rtx_code_label *label;
16151   const struct real_format *fmt;
16152   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16153 
16154   /* Temporary for holding the result, initialized to the input
16155      operand to ease control flow.  */
16156   res = gen_reg_rtx (mode);
16157   emit_move_insn (res, operand1);
16158 
16159   TWO52 = ix86_gen_TWO52 (mode);
16160   xa = ix86_expand_sse_fabs (res, &mask);
16161   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16162 
16163   /* load nextafter (0.5, 0.0) */
16164   fmt = REAL_MODE_FORMAT (mode);
16165   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16166   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16167 
16168   /* xa = xa + 0.5 */
16169   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16170   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16171 
16172   /* xa = (double)(int64_t)xa */
16173   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16174   expand_fix (xi, xa, 0);
16175   expand_float (xa, xi, 0);
16176 
16177   /* res = copysign (xa, operand1) */
16178   ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16179 
16180   emit_label (label);
16181   LABEL_NUSES (label) = 1;
16182 
16183   emit_move_insn (operand0, res);
16184 }
16185 
16186 /* Expand SSE sequence for computing round from OPERAND1 storing
16187    into OPERAND0 without relying on DImode truncation via cvttsd2siq
16188    that is only available on 64bit targets.  */
16189 void
16190 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16191 {
16192   /* C code for the stuff we expand below.
16193         double xa = fabs (x), xa2, x2;
16194         if (!isless (xa, TWO52))
16195           return x;
16196      Using the absolute value and copying back sign makes
16197      -0.0 -> -0.0 correct.
16198         xa2 = xa + TWO52 - TWO52;
16199      Compensate.
16200 	dxa = xa2 - xa;
16201         if (dxa <= -0.5)
16202           xa2 += 1;
16203         else if (dxa > 0.5)
16204           xa2 -= 1;
16205         x2 = copysign (xa2, x);
16206         return x2;
16207    */
16208   machine_mode mode = GET_MODE (operand0);
16209   rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16210   rtx_code_label *label;
16211 
16212   TWO52 = ix86_gen_TWO52 (mode);
16213 
16214   /* Temporary for holding the result, initialized to the input
16215      operand to ease control flow.  */
16216   res = gen_reg_rtx (mode);
16217   emit_move_insn (res, operand1);
16218 
16219   /* xa = abs (operand1) */
16220   xa = ix86_expand_sse_fabs (res, &mask);
16221 
16222   /* if (!isless (xa, TWO52)) goto label; */
16223   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16224 
16225   /* xa2 = xa + TWO52 - TWO52; */
16226   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16227   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16228 
16229   /* dxa = xa2 - xa; */
16230   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16231 
16232   /* generate 0.5, 1.0 and -0.5 */
16233   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16234   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16235   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16236 			       0, OPTAB_DIRECT);
16237 
16238   /* Compensate.  */
16239   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16240   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16241   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16242   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16243   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16244   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16245   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16246   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16247 
16248   /* res = copysign (xa2, operand1) */
16249   ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16250 
16251   emit_label (label);
16252   LABEL_NUSES (label) = 1;
16253 
16254   emit_move_insn (operand0, res);
16255 }
16256 
16257 /* Expand SSE sequence for computing round
16258    from OP1 storing into OP0 using sse4 round insn.  */
16259 void
16260 ix86_expand_round_sse4 (rtx op0, rtx op1)
16261 {
16262   machine_mode mode = GET_MODE (op0);
16263   rtx e1, e2, res, half;
16264   const struct real_format *fmt;
16265   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16266   rtx (*gen_copysign) (rtx, rtx, rtx);
16267   rtx (*gen_round) (rtx, rtx, rtx);
16268 
16269   switch (mode)
16270     {
16271     case E_SFmode:
16272       gen_copysign = gen_copysignsf3;
16273       gen_round = gen_sse4_1_roundsf2;
16274       break;
16275     case E_DFmode:
16276       gen_copysign = gen_copysigndf3;
16277       gen_round = gen_sse4_1_rounddf2;
16278       break;
16279     default:
16280       gcc_unreachable ();
16281     }
16282 
16283   /* round (a) = trunc (a + copysign (0.5, a)) */
16284 
16285   /* load nextafter (0.5, 0.0) */
16286   fmt = REAL_MODE_FORMAT (mode);
16287   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16288   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16289   half = const_double_from_real_value (pred_half, mode);
16290 
16291   /* e1 = copysign (0.5, op1) */
16292   e1 = gen_reg_rtx (mode);
16293   emit_insn (gen_copysign (e1, half, op1));
16294 
16295   /* e2 = op1 + e1 */
16296   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16297 
16298   /* res = trunc (e2) */
16299   res = gen_reg_rtx (mode);
16300   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16301 
16302   emit_move_insn (op0, res);
16303 }
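
/* Editorial note, not part of GCC: the adjustment constant above is
   nextafter (0.5, 0.0), the largest double below 0.5, rather than 0.5
   itself.  With a plain 0.5, an input such as 0.49999999999999994 (the
   largest double below 0.5) would see x + 0.5 round up to 1.0 and hence
   truncate to 1 instead of 0.  A scalar model, assuming <math.h>:  */

static double
round_sse4_model (double x)
{
  double adj = nextafter (0.5, 0.0);       /* pred (0.5) */
  return trunc (x + copysign (adj, x));    /* trunc (a + copysign (adj, a)) */
}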
16304 
16305 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16306    insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16307    insn every time.  */
16308 
16309 static GTY(()) rtx_insn *vselect_insn;
16310 
16311 /* Initialize vselect_insn.  */
16312 
16313 static void
16314 init_vselect_insn (void)
16315 {
16316   unsigned i;
16317   rtx x;
16318 
16319   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16320   for (i = 0; i < MAX_VECT_LEN; ++i)
16321     XVECEXP (x, 0, i) = const0_rtx;
16322   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16323 							const0_rtx), x);
16324   x = gen_rtx_SET (const0_rtx, x);
16325   start_sequence ();
16326   vselect_insn = emit_insn (x);
16327   end_sequence ();
16328 }
16329 
16330 /* Construct (set target (vec_select op0 (parallel perm))) and
16331    return true if that's a valid instruction in the active ISA.  */
16332 
16333 static bool
16334 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16335 		unsigned nelt, bool testing_p)
16336 {
16337   unsigned int i;
16338   rtx x, save_vconcat;
16339   int icode;
16340 
16341   if (vselect_insn == NULL_RTX)
16342     init_vselect_insn ();
16343 
16344   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16345   PUT_NUM_ELEM (XVEC (x, 0), nelt);
16346   for (i = 0; i < nelt; ++i)
16347     XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16348   save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16349   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16350   PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16351   SET_DEST (PATTERN (vselect_insn)) = target;
16352   icode = recog_memoized (vselect_insn);
16353 
16354   if (icode >= 0 && !testing_p)
16355     emit_insn (copy_rtx (PATTERN (vselect_insn)));
16356 
16357   SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16358   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16359   INSN_CODE (vselect_insn) = -1;
16360 
16361   return icode >= 0;
16362 }
16363 
16364 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
16365 
16366 static bool
16367 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16368 			const unsigned char *perm, unsigned nelt,
16369 			bool testing_p)
16370 {
16371   machine_mode v2mode;
16372   rtx x;
16373   bool ok;
16374 
16375   if (vselect_insn == NULL_RTX)
16376     init_vselect_insn ();
16377 
16378   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16379     return false;
16380   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16381   PUT_MODE (x, v2mode);
16382   XEXP (x, 0) = op0;
16383   XEXP (x, 1) = op1;
16384   ok = expand_vselect (target, x, perm, nelt, testing_p);
16385   XEXP (x, 0) = const0_rtx;
16386   XEXP (x, 1) = const0_rtx;
16387   return ok;
16388 }
16389 
16390 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16391    using movss or movsd.  */
16392 static bool
16393 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16394 {
16395   machine_mode vmode = d->vmode;
16396   unsigned i, nelt = d->nelt;
16397   rtx x;
16398 
16399   if (d->one_operand_p)
16400     return false;
16401 
16402   if (!(TARGET_SSE && vmode == V4SFmode)
16403       && !(TARGET_SSE2 && vmode == V2DFmode))
16404     return false;
16405 
16406   /* Only the first element is changed.  */
16407   if (d->perm[0] != nelt && d->perm[0] != 0)
16408     return false;
16409   for (i = 1; i < nelt; ++i)
16410     if (d->perm[i] != i + nelt - d->perm[0])
16411       return false;
16412 
16413   if (d->testing_p)
16414     return true;
16415 
16416   if (d->perm[0] == nelt)
16417     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16418   else
16419     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16420 
16421   emit_insn (gen_rtx_SET (d->target, x));
16422 
16423   return true;
16424 }
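
/* Editorial note, not part of GCC: the permutation shape accepted above.
   movss/movsd only replace element 0 of one operand with element 0 of the
   other, so the index vector must be { nelt, 1, 2, ... } (element 0 from
   op1) or { 0, nelt + 1, nelt + 2, ... } (the rest from op1).  A small
   checker mirroring the loop above:  */

static int
movs_perm_ok (const unsigned char *perm, unsigned nelt)
{
  if (perm[0] != 0 && perm[0] != nelt)
    return 0;                             /* element 0 from either operand */
  for (unsigned i = 1; i < nelt; ++i)
    if (perm[i] != i + nelt - perm[0])    /* remaining elements stay put */
      return 0;
  return 1;
}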
16425 
16426 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16427    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
16428 
16429 static bool
16430 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16431 {
16432   machine_mode mmode, vmode = d->vmode;
16433   unsigned i, nelt = d->nelt;
16434   unsigned HOST_WIDE_INT mask;
16435   rtx target, op0, op1, maskop, x;
16436   rtx rperm[32], vperm;
16437 
16438   if (d->one_operand_p)
16439     return false;
16440   if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16441       && (TARGET_AVX512BW
16442 	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
16443     ;
16444   else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16445     ;
16446   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16447     ;
16448   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16449     ;
16450   else
16451     return false;
16452 
16453   /* This is a blend, not a permute.  Elements must stay in their
16454      respective lanes.  */
16455   for (i = 0; i < nelt; ++i)
16456     {
16457       unsigned e = d->perm[i];
16458       if (!(e == i || e == i + nelt))
16459 	return false;
16460     }
16461 
16462   if (d->testing_p)
16463     return true;
16464 
16465   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
16466      decision should be extracted elsewhere, so that we only try that
16467      sequence once all budget==3 options have been tried.  */
16468   target = d->target;
16469   op0 = d->op0;
16470   op1 = d->op1;
16471   mask = 0;
16472 
16473   switch (vmode)
16474     {
16475     case E_V8DFmode:
16476     case E_V16SFmode:
16477     case E_V4DFmode:
16478     case E_V8SFmode:
16479     case E_V2DFmode:
16480     case E_V4SFmode:
16481     case E_V8HImode:
16482     case E_V8SImode:
16483     case E_V32HImode:
16484     case E_V64QImode:
16485     case E_V16SImode:
16486     case E_V8DImode:
16487       for (i = 0; i < nelt; ++i)
16488 	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16489       break;
16490 
16491     case E_V2DImode:
16492       for (i = 0; i < 2; ++i)
16493 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16494       vmode = V8HImode;
16495       goto do_subreg;
16496 
16497     case E_V4SImode:
16498       for (i = 0; i < 4; ++i)
16499 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16500       vmode = V8HImode;
16501       goto do_subreg;
16502 
16503     case E_V16QImode:
16504       /* See if bytes move in pairs so we can use pblendw with
16505 	 an immediate argument, rather than pblendvb with a vector
16506 	 argument.  */
16507       for (i = 0; i < 16; i += 2)
16508 	if (d->perm[i] + 1 != d->perm[i + 1])
16509 	  {
16510 	  use_pblendvb:
16511 	    for (i = 0; i < nelt; ++i)
16512 	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16513 
16514 	  finish_pblendvb:
16515 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16516 	    vperm = force_reg (vmode, vperm);
16517 
16518 	    if (GET_MODE_SIZE (vmode) == 16)
16519 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16520 	    else
16521 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16522 	    if (target != d->target)
16523 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16524 	    return true;
16525 	  }
16526 
16527       for (i = 0; i < 8; ++i)
16528 	mask |= (d->perm[i * 2] >= 16) << i;
16529       vmode = V8HImode;
16530       /* FALLTHRU */
16531 
16532     do_subreg:
16533       target = gen_reg_rtx (vmode);
16534       op0 = gen_lowpart (vmode, op0);
16535       op1 = gen_lowpart (vmode, op1);
16536       break;
16537 
16538     case E_V32QImode:
16539       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
16540       for (i = 0; i < 32; i += 2)
16541 	if (d->perm[i] + 1 != d->perm[i + 1])
16542 	  goto use_pblendvb;
16543       /* See if bytes move in quadruplets.  If yes, vpblendd
16544 	 with immediate can be used.  */
16545       for (i = 0; i < 32; i += 4)
16546 	if (d->perm[i] + 2 != d->perm[i + 2])
16547 	  break;
16548       if (i < 32)
16549 	{
16550 	  /* See if bytes move the same in both lanes.  If yes,
16551 	     vpblendw with immediate can be used.  */
16552 	  for (i = 0; i < 16; i += 2)
16553 	    if (d->perm[i] + 16 != d->perm[i + 16])
16554 	      goto use_pblendvb;
16555 
16556 	  /* Use vpblendw.  */
16557 	  for (i = 0; i < 16; ++i)
16558 	    mask |= (d->perm[i * 2] >= 32) << i;
16559 	  vmode = V16HImode;
16560 	  goto do_subreg;
16561 	}
16562 
16563       /* Use vpblendd.  */
16564       for (i = 0; i < 8; ++i)
16565 	mask |= (d->perm[i * 4] >= 32) << i;
16566       vmode = V8SImode;
16567       goto do_subreg;
16568 
16569     case E_V16HImode:
16570       /* See if words move in pairs.  If yes, vpblendd can be used.  */
16571       for (i = 0; i < 16; i += 2)
16572 	if (d->perm[i] + 1 != d->perm[i + 1])
16573 	  break;
16574       if (i < 16)
16575 	{
16576 	  /* See if words move the same in both lanes.  If not,
16577 	     vpblendvb must be used.  */
16578 	  for (i = 0; i < 8; i++)
16579 	    if (d->perm[i] + 8 != d->perm[i + 8])
16580 	      {
16581 		/* Use vpblendvb.  */
16582 		for (i = 0; i < 32; ++i)
16583 		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16584 
16585 		vmode = V32QImode;
16586 		nelt = 32;
16587 		target = gen_reg_rtx (vmode);
16588 		op0 = gen_lowpart (vmode, op0);
16589 		op1 = gen_lowpart (vmode, op1);
16590 		goto finish_pblendvb;
16591 	      }
16592 
16593 	  /* Use vpblendw.  */
16594 	  for (i = 0; i < 16; ++i)
16595 	    mask |= (d->perm[i] >= 16) << i;
16596 	  break;
16597 	}
16598 
16599       /* Use vpblendd.  */
16600       for (i = 0; i < 8; ++i)
16601 	mask |= (d->perm[i * 2] >= 16) << i;
16602       vmode = V8SImode;
16603       goto do_subreg;
16604 
16605     case E_V4DImode:
16606       /* Use vpblendd.  */
16607       for (i = 0; i < 4; ++i)
16608 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16609       vmode = V8SImode;
16610       goto do_subreg;
16611 
16612     default:
16613       gcc_unreachable ();
16614     }
16615 
16616   switch (vmode)
16617     {
16618     case E_V8DFmode:
16619     case E_V8DImode:
16620       mmode = QImode;
16621       break;
16622     case E_V16SFmode:
16623     case E_V16SImode:
16624       mmode = HImode;
16625       break;
16626     case E_V32HImode:
16627       mmode = SImode;
16628       break;
16629     case E_V64QImode:
16630       mmode = DImode;
16631       break;
16632     default:
16633       mmode = VOIDmode;
16634     }
16635 
16636   if (mmode != VOIDmode)
16637     maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16638   else
16639     maskop = GEN_INT (mask);
16640 
16641   /* This matches five different patterns with the different modes.  */
16642   x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16643   x = gen_rtx_SET (target, x);
16644   emit_insn (x);
16645   if (target != d->target)
16646     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16647 
16648   return true;
16649 }
16650 
16651 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16652    in terms of the variable form of vpermilps.
16653 
16654    Note that we will have already failed the immediate input vpermilps,
16655    which requires that the high and low part shuffle be identical; the
16656    variable form doesn't require that.  */
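/* For instance, the one-operand V8SFmode permutation {0,1,2,3,7,6,5,4}
   keeps the low lane in order but reverses the high lane; the two lane
   shuffles differ, so only the variable form applies, with selector
   vector {0,1,2,3,3,2,1,0}.  */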
16657 
16658 static bool
16659 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16660 {
16661   rtx rperm[8], vperm;
16662   unsigned i;
16663 
16664   if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16665     return false;
16666 
16667   /* We can only permute within the 128-bit lane.  */
16668   for (i = 0; i < 8; ++i)
16669     {
16670       unsigned e = d->perm[i];
16671       if (i < 4 ? e >= 4 : e < 4)
16672 	return false;
16673     }
16674 
16675   if (d->testing_p)
16676     return true;
16677 
16678   for (i = 0; i < 8; ++i)
16679     {
16680       unsigned e = d->perm[i];
16681 
16682       /* Within each 128-bit lane, the elements of op0 are numbered
16683 	 from 0 and the elements of op1 are numbered from 4.  */
16684       if (e >= 8 + 4)
16685 	e -= 8;
16686       else if (e >= 4)
16687 	e -= 4;
16688 
16689       rperm[i] = GEN_INT (e);
16690     }
16691 
16692   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16693   vperm = force_reg (V8SImode, vperm);
16694   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16695 
16696   return true;
16697 }
16698 
16699 /* Return true if permutation D can be performed as VMODE permutation
16700    instead.  */
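/* For instance, a V16QImode permutation that moves bytes in aligned,
   consecutive pairs, e.g. {2,3,0,1,6,7,4,5,...}, is also valid as a
   V8HImode permutation, while one that splits such a pair is not.  */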
16701 
16702 static bool
16703 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16704 {
16705   unsigned int i, j, chunk;
16706 
16707   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16708       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16709       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16710     return false;
16711 
16712   if (GET_MODE_NUNITS (vmode) >= d->nelt)
16713     return true;
16714 
16715   chunk = d->nelt / GET_MODE_NUNITS (vmode);
16716   for (i = 0; i < d->nelt; i += chunk)
16717     if (d->perm[i] & (chunk - 1))
16718       return false;
16719     else
16720       for (j = 1; j < chunk; ++j)
16721 	if (d->perm[i] + j != d->perm[i + j])
16722 	  return false;
16723 
16724   return true;
16725 }
16726 
16727 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16728    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
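/* For illustration: a one-operand V8HImode permutation {e0,...,e7} is
   expanded below into the pshufb byte selector
   {2*e0, 2*e0+1, 2*e1, 2*e1+1, ...}, i.e. each 2-byte element is copied
   one byte at a time.  */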
16729 
16730 static bool
16731 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16732 {
16733   unsigned i, nelt, eltsz, mask;
16734   unsigned char perm[64];
16735   machine_mode vmode = V16QImode;
16736   rtx rperm[64], vperm, target, op0, op1;
16737 
16738   nelt = d->nelt;
16739 
16740   if (!d->one_operand_p)
16741     {
16742       if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16743 	{
16744 	  if (TARGET_AVX2
16745 	      && valid_perm_using_mode_p (V2TImode, d))
16746 	    {
16747 	      if (d->testing_p)
16748 		return true;
16749 
16750 	      /* Use vperm2i128 insn.  The pattern uses
16751 		 V4DImode instead of V2TImode.  */
16752 	      target = d->target;
16753 	      if (d->vmode != V4DImode)
16754 		target = gen_reg_rtx (V4DImode);
16755 	      op0 = gen_lowpart (V4DImode, d->op0);
16756 	      op1 = gen_lowpart (V4DImode, d->op1);
16757 	      rperm[0]
16758 		= GEN_INT ((d->perm[0] / (nelt / 2))
16759 			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16760 	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16761 	      if (target != d->target)
16762 		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16763 	      return true;
16764 	    }
16765 	  return false;
16766 	}
16767     }
16768   else
16769     {
16770       if (GET_MODE_SIZE (d->vmode) == 16)
16771 	{
16772 	  if (!TARGET_SSSE3)
16773 	    return false;
16774 	}
16775       else if (GET_MODE_SIZE (d->vmode) == 32)
16776 	{
16777 	  if (!TARGET_AVX2)
16778 	    return false;
16779 
16780 	  /* V4DImode should already have been handled through
16781 	     expand_vselect by the vpermq instruction.  */
16782 	  gcc_assert (d->vmode != V4DImode);
16783 
16784 	  vmode = V32QImode;
16785 	  if (d->vmode == V8SImode
16786 	      || d->vmode == V16HImode
16787 	      || d->vmode == V32QImode)
16788 	    {
16789 	      /* First see if vpermq can be used for
16790 		 V8SImode/V16HImode/V32QImode.  */
16791 	      if (valid_perm_using_mode_p (V4DImode, d))
16792 		{
16793 		  for (i = 0; i < 4; i++)
16794 		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16795 		  if (d->testing_p)
16796 		    return true;
16797 		  target = gen_reg_rtx (V4DImode);
16798 		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16799 				      perm, 4, false))
16800 		    {
16801 		      emit_move_insn (d->target,
16802 				      gen_lowpart (d->vmode, target));
16803 		      return true;
16804 		    }
16805 		  return false;
16806 		}
16807 
16808 	      /* Next see if vpermd can be used.  */
16809 	      if (valid_perm_using_mode_p (V8SImode, d))
16810 		vmode = V8SImode;
16811 	    }
16812 	  /* Or if vpermps can be used.  */
16813 	  else if (d->vmode == V8SFmode)
16814 	    vmode = V8SImode;
16815 
16816 	  if (vmode == V32QImode)
16817 	    {
16818 	      /* vpshufb only works intra lanes; it is not
16819 		 possible to shuffle bytes in between the lanes.  */
16820 	      for (i = 0; i < nelt; ++i)
16821 		if ((d->perm[i] ^ i) & (nelt / 2))
16822 		  return false;
16823 	    }
16824 	}
16825       else if (GET_MODE_SIZE (d->vmode) == 64)
16826 	{
16827 	  if (!TARGET_AVX512BW)
16828 	    return false;
16829 
16830 	  /* If vpermq didn't work, vpshufb won't work either.  */
16831 	  if (d->vmode == V8DFmode || d->vmode == V8DImode)
16832 	    return false;
16833 
16834 	  vmode = V64QImode;
16835 	  if (d->vmode == V16SImode
16836 	      || d->vmode == V32HImode
16837 	      || d->vmode == V64QImode)
16838 	    {
16839 	      /* First see if vpermq can be used for
16840 		 V16SImode/V32HImode/V64QImode.  */
16841 	      if (valid_perm_using_mode_p (V8DImode, d))
16842 		{
16843 		  for (i = 0; i < 8; i++)
16844 		    perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16845 		  if (d->testing_p)
16846 		    return true;
16847 		  target = gen_reg_rtx (V8DImode);
16848 		  if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16849 				      perm, 8, false))
16850 		    {
16851 		      emit_move_insn (d->target,
16852 				      gen_lowpart (d->vmode, target));
16853 		      return true;
16854 		    }
16855 		  return false;
16856 		}
16857 
16858 	      /* Next see if vpermd can be used.  */
16859 	      if (valid_perm_using_mode_p (V16SImode, d))
16860 		vmode = V16SImode;
16861 	    }
16862 	  /* Or if vpermps can be used.  */
16863 	  else if (d->vmode == V16SFmode)
16864 	    vmode = V16SImode;
16865 	  if (vmode == V64QImode)
16866 	    {
16867 	      /* vpshufb only works intra lanes; it is not
16868 		 possible to shuffle bytes in between the lanes.  */
16869 	      for (i = 0; i < nelt; ++i)
16870 		if ((d->perm[i] ^ i) & (3 * nelt / 4))
16871 		  return false;
16872 	    }
16873 	}
16874       else
16875 	return false;
16876     }
16877 
16878   if (d->testing_p)
16879     return true;
16880 
16881   if (vmode == V8SImode)
16882     for (i = 0; i < 8; ++i)
16883       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16884   else if (vmode == V16SImode)
16885     for (i = 0; i < 16; ++i)
16886       rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16887   else
16888     {
16889       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16890       if (!d->one_operand_p)
16891 	mask = 2 * nelt - 1;
16892       else if (vmode == V16QImode)
16893 	mask = nelt - 1;
16894       else if (vmode == V64QImode)
16895 	mask = nelt / 4 - 1;
16896       else
16897 	mask = nelt / 2 - 1;
16898 
16899       for (i = 0; i < nelt; ++i)
16900 	{
16901 	  unsigned j, e = d->perm[i] & mask;
16902 	  for (j = 0; j < eltsz; ++j)
16903 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16904 	}
16905     }
16906 
16907   vperm = gen_rtx_CONST_VECTOR (vmode,
16908 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16909   vperm = force_reg (vmode, vperm);
16910 
16911   target = d->target;
16912   if (d->vmode != vmode)
16913     target = gen_reg_rtx (vmode);
16914   op0 = gen_lowpart (vmode, d->op0);
16915   if (d->one_operand_p)
16916     {
16917       if (vmode == V16QImode)
16918 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16919       else if (vmode == V32QImode)
16920 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16921       else if (vmode == V64QImode)
16922 	emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16923       else if (vmode == V8SFmode)
16924 	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16925       else if (vmode == V8SImode)
16926 	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16927       else if (vmode == V16SFmode)
16928 	emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16929       else if (vmode == V16SImode)
16930 	emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16931       else
16932 	gcc_unreachable ();
16933     }
16934   else
16935     {
16936       op1 = gen_lowpart (vmode, d->op1);
16937       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16938     }
16939   if (target != d->target)
16940     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16941 
16942   return true;
16943 }
16944 
16945 /* For V*[QHS]Imode permutations, check whether the same permutation
16946    can instead be performed in a 2x, 4x or 8x wider inner mode.  */
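/* E.g. the V16QImode permutation
   {2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13} is narrowed to the V8HImode
   permutation {1,0,3,2,5,4,7,6}.  */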
16947 
16948 static bool
16949 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16950 			      struct expand_vec_perm_d *nd)
16951 {
16952   int i;
16953   machine_mode mode = VOIDmode;
16954 
16955   switch (d->vmode)
16956     {
16957     case E_V16QImode: mode = V8HImode; break;
16958     case E_V32QImode: mode = V16HImode; break;
16959     case E_V64QImode: mode = V32HImode; break;
16960     case E_V8HImode: mode = V4SImode; break;
16961     case E_V16HImode: mode = V8SImode; break;
16962     case E_V32HImode: mode = V16SImode; break;
16963     case E_V4SImode: mode = V2DImode; break;
16964     case E_V8SImode: mode = V4DImode; break;
16965     case E_V16SImode: mode = V8DImode; break;
16966     default: return false;
16967     }
16968   for (i = 0; i < d->nelt; i += 2)
16969     if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16970       return false;
16971   nd->vmode = mode;
16972   nd->nelt = d->nelt / 2;
16973   for (i = 0; i < nd->nelt; i++)
16974     nd->perm[i] = d->perm[2 * i] / 2;
16975   if (GET_MODE_INNER (mode) != DImode)
16976     canonicalize_vector_int_perm (nd, nd);
16977   if (nd != d)
16978     {
16979       nd->one_operand_p = d->one_operand_p;
16980       nd->testing_p = d->testing_p;
16981       if (d->op0 == d->op1)
16982 	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16983       else
16984 	{
16985 	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
16986 	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
16987 	}
16988       if (d->testing_p)
16989 	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16990       else
16991 	nd->target = gen_reg_rtx (nd->vmode);
16992     }
16993   return true;
16994 }
16995 
16996 /* Try to expand one-operand permutation with constant mask.  */
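/* E.g. a V16SImode reversal {15,14,...,1,0} becomes a single
   gen_avx512f_permvarv16si (vpermd) with the constant index vector
   forced into a register.  */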
16997 
16998 static bool
16999 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17000 {
17001   machine_mode mode = GET_MODE (d->op0);
17002   machine_mode maskmode = mode;
17003   rtx (*gen) (rtx, rtx, rtx) = NULL;
17004   rtx target, op0, mask;
17005   rtx vec[64];
17006 
17007   if (!rtx_equal_p (d->op0, d->op1))
17008     return false;
17009 
17010   if (!TARGET_AVX512F)
17011     return false;
17012 
17013   switch (mode)
17014     {
17015     case E_V16SImode:
17016       gen = gen_avx512f_permvarv16si;
17017       break;
17018     case E_V16SFmode:
17019       gen = gen_avx512f_permvarv16sf;
17020       maskmode = V16SImode;
17021       break;
17022     case E_V8DImode:
17023       gen = gen_avx512f_permvarv8di;
17024       break;
17025     case E_V8DFmode:
17026       gen = gen_avx512f_permvarv8df;
17027       maskmode = V8DImode;
17028       break;
17029     default:
17030       return false;
17031     }
17032 
17033   target = d->target;
17034   op0 = d->op0;
17035   for (int i = 0; i < d->nelt; ++i)
17036     vec[i] = GEN_INT (d->perm[i]);
17037   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17038   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17039   return true;
17040 }
17041 
17042 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17043 
17044 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
17045    in a single instruction.  */
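/* E.g. an identity permutation becomes a plain move and a permutation
   whose indices are all 0 modulo nelt becomes a vpbroadcast*; anything
   else falls through to the more specific helpers tried below.  */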
17046 
17047 static bool
17048 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17049 {
17050   unsigned i, nelt = d->nelt;
17051   struct expand_vec_perm_d nd;
17052 
17053   /* Check plain VEC_SELECT first, because AVX has instructions that could
17054      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17055      input where SEL+CONCAT may not.  */
17056   if (d->one_operand_p)
17057     {
17058       int mask = nelt - 1;
17059       bool identity_perm = true;
17060       bool broadcast_perm = true;
17061 
17062       for (i = 0; i < nelt; i++)
17063 	{
17064 	  nd.perm[i] = d->perm[i] & mask;
17065 	  if (nd.perm[i] != i)
17066 	    identity_perm = false;
17067 	  if (nd.perm[i])
17068 	    broadcast_perm = false;
17069 	}
17070 
17071       if (identity_perm)
17072 	{
17073 	  if (!d->testing_p)
17074 	    emit_move_insn (d->target, d->op0);
17075 	  return true;
17076 	}
17077       else if (broadcast_perm && TARGET_AVX2)
17078 	{
17079 	  /* Use vpbroadcast{b,w,d}.  */
17080 	  rtx (*gen) (rtx, rtx) = NULL;
17081 	  switch (d->vmode)
17082 	    {
17083 	    case E_V64QImode:
17084 	      if (TARGET_AVX512BW)
17085 		gen = gen_avx512bw_vec_dupv64qi_1;
17086 	      break;
17087 	    case E_V32QImode:
17088 	      gen = gen_avx2_pbroadcastv32qi_1;
17089 	      break;
17090 	    case E_V32HImode:
17091 	      if (TARGET_AVX512BW)
17092 		gen = gen_avx512bw_vec_dupv32hi_1;
17093 	      break;
17094 	    case E_V16HImode:
17095 	      gen = gen_avx2_pbroadcastv16hi_1;
17096 	      break;
17097 	    case E_V16SImode:
17098 	      if (TARGET_AVX512F)
17099 		gen = gen_avx512f_vec_dupv16si_1;
17100 	      break;
17101 	    case E_V8SImode:
17102 	      gen = gen_avx2_pbroadcastv8si_1;
17103 	      break;
17104 	    case E_V16QImode:
17105 	      gen = gen_avx2_pbroadcastv16qi;
17106 	      break;
17107 	    case E_V8HImode:
17108 	      gen = gen_avx2_pbroadcastv8hi;
17109 	      break;
17110 	    case E_V16SFmode:
17111 	      if (TARGET_AVX512F)
17112 		gen = gen_avx512f_vec_dupv16sf_1;
17113 	      break;
17114 	    case E_V8SFmode:
17115 	      gen = gen_avx2_vec_dupv8sf_1;
17116 	      break;
17117 	    case E_V8DFmode:
17118 	      if (TARGET_AVX512F)
17119 		gen = gen_avx512f_vec_dupv8df_1;
17120 	      break;
17121 	    case E_V8DImode:
17122 	      if (TARGET_AVX512F)
17123 		gen = gen_avx512f_vec_dupv8di_1;
17124 	      break;
17125 	    /* For other modes prefer other shuffles this function creates.  */
17126 	    default: break;
17127 	    }
17128 	  if (gen != NULL)
17129 	    {
17130 	      if (!d->testing_p)
17131 		emit_insn (gen (d->target, d->op0));
17132 	      return true;
17133 	    }
17134 	}
17135 
17136       if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17137 	return true;
17138 
17139       /* There are plenty of patterns in sse.md that are written for
17140 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
17141 	 that should be changed, to avoid the nastiness here.  */
17142 
17143       /* Recognize interleave style patterns, which means incrementing
17144 	 every other permutation operand.  */
17145       for (i = 0; i < nelt; i += 2)
17146 	{
17147 	  nd.perm[i] = d->perm[i] & mask;
17148 	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17149 	}
17150       if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17151 				  d->testing_p))
17152 	return true;
17153 
17154       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
17155       if (nelt >= 4)
17156 	{
17157 	  for (i = 0; i < nelt; i += 4)
17158 	    {
17159 	      nd.perm[i + 0] = d->perm[i + 0] & mask;
17160 	      nd.perm[i + 1] = d->perm[i + 1] & mask;
17161 	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17162 	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17163 	    }
17164 
17165 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17166 				      d->testing_p))
17167 	    return true;
17168 	}
17169     }
17170 
17171   /* Try movss/movsd instructions.  */
17172   if (expand_vec_perm_movs (d))
17173     return true;
17174 
17175   /* Finally, try the fully general two operand permute.  */
17176   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17177 			      d->testing_p))
17178     return true;
17179 
17180   /* Recognize interleave style patterns with reversed operands.  */
17181   if (!d->one_operand_p)
17182     {
17183       for (i = 0; i < nelt; ++i)
17184 	{
17185 	  unsigned e = d->perm[i];
17186 	  if (e >= nelt)
17187 	    e -= nelt;
17188 	  else
17189 	    e += nelt;
17190 	  nd.perm[i] = e;
17191 	}
17192 
17193       if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17194 				  d->testing_p))
17195 	return true;
17196     }
17197 
17198   /* Try the SSE4.1 blend variable merge instructions.  */
17199   if (expand_vec_perm_blend (d))
17200     return true;
17201 
17202   /* Try one of the AVX vpermil variable permutations.  */
17203   if (expand_vec_perm_vpermil (d))
17204     return true;
17205 
17206   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17207      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
17208   if (expand_vec_perm_pshufb (d))
17209     return true;
17210 
17211   /* Try the AVX2 vpalignr instruction.  */
17212   if (expand_vec_perm_palignr (d, true))
17213     return true;
17214 
17215   /* Try the AVX512F vperm{s,d} instructions.  */
17216   if (ix86_expand_vec_one_operand_perm_avx512 (d))
17217     return true;
17218 
17219   /* Try the AVX512F vpermt2/vpermi2 instructions.  */
17220   if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17221     return true;
17222 
17223   /* See if we can get the same permutation in different vector integer
17224      mode.  */
17225   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17226     {
17227       if (!d->testing_p)
17228 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17229       return true;
17230     }
17231   return false;
17232 }
17233 
17234 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17235    in terms of a pair of pshuflw + pshufhw instructions.  */
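/* E.g. for the V8HImode permutation {3,1,2,0,6,5,7,4}, pshuflw reorders
   the low quadword to {3,1,2,0} and pshufhw then reorders the high
   quadword to {6,5,7,4}.  */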
17236 
17237 static bool
17238 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17239 {
17240   unsigned char perm2[MAX_VECT_LEN];
17241   unsigned i;
17242   bool ok;
17243 
17244   if (d->vmode != V8HImode || !d->one_operand_p)
17245     return false;
17246 
17247   /* The two permutations only operate in 64-bit lanes.  */
17248   for (i = 0; i < 4; ++i)
17249     if (d->perm[i] >= 4)
17250       return false;
17251   for (i = 4; i < 8; ++i)
17252     if (d->perm[i] < 4)
17253       return false;
17254 
17255   if (d->testing_p)
17256     return true;
17257 
17258   /* Emit the pshuflw.  */
17259   memcpy (perm2, d->perm, 4);
17260   for (i = 4; i < 8; ++i)
17261     perm2[i] = i;
17262   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17263   gcc_assert (ok);
17264 
17265   /* Emit the pshufhw.  */
17266   memcpy (perm2 + 4, d->perm + 4, 4);
17267   for (i = 0; i < 4; ++i)
17268     perm2[i] = i;
17269   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17270   gcc_assert (ok);
17271 
17272   return true;
17273 }
17274 
17275 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17276    the permutation using the SSSE3 palignr instruction.  This succeeds
17277    when all of the elements in PERM fit within one vector and we merely
17278    need to shift them down so that a single vector permutation has a
17279    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
17280    the vpalignr instruction itself can perform the requested permutation.  */
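/* For illustration: a V16QImode selection of bytes 5..20 of the
   concatenated operands has min == 5 and max == 20, so a palignr by
   5 bytes brings the whole window into one vector; here the result is
   already in order, otherwise a one-operand shuffle finishes the job.  */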
17281 
17282 static bool
17283 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17284 {
17285   unsigned i, nelt = d->nelt;
17286   unsigned min, max, minswap, maxswap;
17287   bool in_order, ok, swap = false;
17288   rtx shift, target;
17289   struct expand_vec_perm_d dcopy;
17290 
17291   /* Even with AVX, palignr only operates on 128-bit vectors;
17292      with AVX2, palignr operates on both 128-bit lanes.  */
17293   if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17294       && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17295     return false;
17296 
17297   min = 2 * nelt;
17298   max = 0;
17299   minswap = 2 * nelt;
17300   maxswap = 0;
17301   for (i = 0; i < nelt; ++i)
17302     {
17303       unsigned e = d->perm[i];
17304       unsigned eswap = d->perm[i] ^ nelt;
17305       if (GET_MODE_SIZE (d->vmode) == 32)
17306 	{
17307 	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17308 	  eswap = e ^ (nelt / 2);
17309 	}
17310       if (e < min)
17311 	min = e;
17312       if (e > max)
17313 	max = e;
17314       if (eswap < minswap)
17315 	minswap = eswap;
17316       if (eswap > maxswap)
17317 	maxswap = eswap;
17318     }
17319   if (min == 0
17320       || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17321     {
17322       if (d->one_operand_p
17323 	  || minswap == 0
17324 	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17325 				   ? nelt / 2 : nelt))
17326 	return false;
17327       swap = true;
17328       min = minswap;
17329       max = maxswap;
17330     }
17331 
17332   /* Given that we have SSSE3, we know we'll be able to implement the
17333      single operand permutation after the palignr with pshufb for
17334      128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
17335      first.  */
17336   if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17337     return true;
17338 
17339   dcopy = *d;
17340   if (swap)
17341     {
17342       dcopy.op0 = d->op1;
17343       dcopy.op1 = d->op0;
17344       for (i = 0; i < nelt; ++i)
17345 	dcopy.perm[i] ^= nelt;
17346     }
17347 
17348   in_order = true;
17349   for (i = 0; i < nelt; ++i)
17350     {
17351       unsigned e = dcopy.perm[i];
17352       if (GET_MODE_SIZE (d->vmode) == 32
17353 	  && e >= nelt
17354 	  && (e & (nelt / 2 - 1)) < min)
17355 	e = e - min - (nelt / 2);
17356       else
17357 	e = e - min;
17358       if (e != i)
17359 	in_order = false;
17360       dcopy.perm[i] = e;
17361     }
17362   dcopy.one_operand_p = true;
17363 
17364   if (single_insn_only_p && !in_order)
17365     return false;
17366 
17367   /* For AVX2, test whether we can permute the result in one instruction.  */
17368   if (d->testing_p)
17369     {
17370       if (in_order)
17371 	return true;
17372       dcopy.op1 = dcopy.op0;
17373       return expand_vec_perm_1 (&dcopy);
17374     }
17375 
17376   shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17377   if (GET_MODE_SIZE (d->vmode) == 16)
17378     {
17379       target = gen_reg_rtx (TImode);
17380       emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17381 				      gen_lowpart (TImode, dcopy.op0), shift));
17382     }
17383   else
17384     {
17385       target = gen_reg_rtx (V2TImode);
17386       emit_insn (gen_avx2_palignrv2ti (target,
17387 				       gen_lowpart (V2TImode, dcopy.op1),
17388 				       gen_lowpart (V2TImode, dcopy.op0),
17389 				       shift));
17390     }
17391 
17392   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17393 
17394   /* Test for the degenerate case where the alignment by itself
17395      produces the desired permutation.  */
17396   if (in_order)
17397     {
17398       emit_move_insn (d->target, dcopy.op0);
17399       return true;
17400     }
17401 
17402   ok = expand_vec_perm_1 (&dcopy);
17403   gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17404 
17405   return ok;
17406 }
17407 
17408 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17409    the permutation using the SSE4_1 pblendv instruction.  Potentially
17410    reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */
17411 
17412 static bool
17413 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17414 {
17415   unsigned i, which, nelt = d->nelt;
17416   struct expand_vec_perm_d dcopy, dcopy1;
17417   machine_mode vmode = d->vmode;
17418   bool ok;
17419 
17420   /* Use the same checks as in expand_vec_perm_blend.  */
17421   if (d->one_operand_p)
17422     return false;
17423   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17424     ;
17425   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17426     ;
17427   else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17428     ;
17429   else
17430     return false;
17431 
17432   /* Figure out which permutation elements do not stay in their
17433      respective lanes.  */
17434   for (i = 0, which = 0; i < nelt; ++i)
17435     {
17436       unsigned e = d->perm[i];
17437       if (e != i)
17438 	which |= (e < nelt ? 1 : 2);
17439     }
17440   /* We can pblend the part whose elements do not stay in their
17441      respective lanes only when these elements all come from the
17442      same half of the permutation.
17443      {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
17444      lanes, but both 8 and 9 are >= 8.
17445      {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
17446      respective lanes, and 8 >= 8 but 2 is not.  */
17447   if (which != 1 && which != 2)
17448     return false;
17449   if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17450     return true;
17451 
17452   /* First we apply a one-operand permutation to the part whose
17453      elements do not stay in their respective lanes.  */
17454   dcopy = *d;
17455   if (which == 2)
17456     dcopy.op0 = dcopy.op1 = d->op1;
17457   else
17458     dcopy.op0 = dcopy.op1 = d->op0;
17459   if (!d->testing_p)
17460     dcopy.target = gen_reg_rtx (vmode);
17461   dcopy.one_operand_p = true;
17462 
17463   for (i = 0; i < nelt; ++i)
17464     dcopy.perm[i] = d->perm[i] & (nelt - 1);
17465 
17466   ok = expand_vec_perm_1 (&dcopy);
17467   if (GET_MODE_SIZE (vmode) != 16 && !ok)
17468     return false;
17469   else
17470     gcc_assert (ok);
17471   if (d->testing_p)
17472     return true;
17473 
17474   /* Next we put permuted elements into their positions.  */
17475   dcopy1 = *d;
17476   if (which == 2)
17477     dcopy1.op1 = dcopy.target;
17478   else
17479     dcopy1.op0 = dcopy.target;
17480 
17481   for (i = 0; i < nelt; ++i)
17482     dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17483 
17484   ok = expand_vec_perm_blend (&dcopy1);
17485   gcc_assert (ok);
17486 
17487   return true;
17488 }
17489 
17490 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17491 
17492 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17493    a two vector permutation into a single vector permutation by using
17494    an interleave operation to merge the vectors.  */
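/* E.g. the V8HImode permutation {0,8,1,9,3,11,2,10} uses only the low
   halves of both operands: punpcklwd first produces {0,8,1,9,2,10,3,11},
   after which a single one-operand shuffle yields the requested order.  */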
17495 
17496 static bool
17497 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17498 {
17499   struct expand_vec_perm_d dremap, dfinal;
17500   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17501   unsigned HOST_WIDE_INT contents;
17502   unsigned char remap[2 * MAX_VECT_LEN];
17503   rtx_insn *seq;
17504   bool ok, same_halves = false;
17505 
17506   if (GET_MODE_SIZE (d->vmode) == 16)
17507     {
17508       if (d->one_operand_p)
17509 	return false;
17510     }
17511   else if (GET_MODE_SIZE (d->vmode) == 32)
17512     {
17513       if (!TARGET_AVX)
17514 	return false;
17515       /* For 32-byte modes allow even d->one_operand_p.
17516 	 The lack of cross-lane shuffling in some instructions
17517 	 might prevent a single insn shuffle.  */
17518       dfinal = *d;
17519       dfinal.testing_p = true;
17520       /* If expand_vec_perm_interleave3 can expand this into
17521	 a 3 insn sequence, give up and let it be expanded as
17522	 a 3 insn sequence.  While that is one insn longer,
17523	 it doesn't need a memory operand, and in the common
17524	 case where the interleave low and interleave high
17525	 permutations with the same operands are adjacent, the
17526	 pair needs only 4 insns after CSE.  */
17527       if (expand_vec_perm_interleave3 (&dfinal))
17528 	return false;
17529     }
17530   else
17531     return false;
17532 
17533   /* Examine from whence the elements come.  */
17534   contents = 0;
17535   for (i = 0; i < nelt; ++i)
17536     contents |= HOST_WIDE_INT_1U << d->perm[i];
17537 
17538   memset (remap, 0xff, sizeof (remap));
17539   dremap = *d;
17540 
17541   if (GET_MODE_SIZE (d->vmode) == 16)
17542     {
17543       unsigned HOST_WIDE_INT h1, h2, h3, h4;
17544 
17545       /* Split the two input vectors into 4 halves.  */
17546       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17547       h2 = h1 << nelt2;
17548       h3 = h2 << nelt2;
17549       h4 = h3 << nelt2;
17550 
17551       /* If all the elements come from the low halves, use interleave low;
17552	 similarly for interleave high.  If the elements are from mis-matched
17553	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
17554       if ((contents & (h1 | h3)) == contents)
17555 	{
17556 	  /* punpckl* */
17557 	  for (i = 0; i < nelt2; ++i)
17558 	    {
17559 	      remap[i] = i * 2;
17560 	      remap[i + nelt] = i * 2 + 1;
17561 	      dremap.perm[i * 2] = i;
17562 	      dremap.perm[i * 2 + 1] = i + nelt;
17563 	    }
17564 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
17565 	    dremap.vmode = V4SFmode;
17566 	}
17567       else if ((contents & (h2 | h4)) == contents)
17568 	{
17569 	  /* punpckh* */
17570 	  for (i = 0; i < nelt2; ++i)
17571 	    {
17572 	      remap[i + nelt2] = i * 2;
17573 	      remap[i + nelt + nelt2] = i * 2 + 1;
17574 	      dremap.perm[i * 2] = i + nelt2;
17575 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17576 	    }
17577 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
17578 	    dremap.vmode = V4SFmode;
17579 	}
17580       else if ((contents & (h1 | h4)) == contents)
17581 	{
17582 	  /* shufps */
17583 	  for (i = 0; i < nelt2; ++i)
17584 	    {
17585 	      remap[i] = i;
17586 	      remap[i + nelt + nelt2] = i + nelt2;
17587 	      dremap.perm[i] = i;
17588 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
17589 	    }
17590 	  if (nelt != 4)
17591 	    {
17592 	      /* shufpd */
17593 	      dremap.vmode = V2DImode;
17594 	      dremap.nelt = 2;
17595 	      dremap.perm[0] = 0;
17596 	      dremap.perm[1] = 3;
17597 	    }
17598 	}
17599       else if ((contents & (h2 | h3)) == contents)
17600 	{
17601 	  /* shufps */
17602 	  for (i = 0; i < nelt2; ++i)
17603 	    {
17604 	      remap[i + nelt2] = i;
17605 	      remap[i + nelt] = i + nelt2;
17606 	      dremap.perm[i] = i + nelt2;
17607 	      dremap.perm[i + nelt2] = i + nelt;
17608 	    }
17609 	  if (nelt != 4)
17610 	    {
17611 	      /* shufpd */
17612 	      dremap.vmode = V2DImode;
17613 	      dremap.nelt = 2;
17614 	      dremap.perm[0] = 1;
17615 	      dremap.perm[1] = 2;
17616 	    }
17617 	}
17618       else
17619 	return false;
17620     }
17621   else
17622     {
17623       unsigned int nelt4 = nelt / 4, nzcnt = 0;
17624       unsigned HOST_WIDE_INT q[8];
17625       unsigned int nonzero_halves[4];
17626 
17627       /* Split the two input vectors into 8 quarters.  */
17628       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17629       for (i = 1; i < 8; ++i)
17630 	q[i] = q[0] << (nelt4 * i);
17631       for (i = 0; i < 4; ++i)
17632 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17633 	  {
17634 	    nonzero_halves[nzcnt] = i;
17635 	    ++nzcnt;
17636 	  }
17637 
17638       if (nzcnt == 1)
17639 	{
17640 	  gcc_assert (d->one_operand_p);
17641 	  nonzero_halves[1] = nonzero_halves[0];
17642 	  same_halves = true;
17643 	}
17644       else if (d->one_operand_p)
17645 	{
17646 	  gcc_assert (nonzero_halves[0] == 0);
17647 	  gcc_assert (nonzero_halves[1] == 1);
17648 	}
17649 
17650       if (nzcnt <= 2)
17651 	{
17652 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
17653 	    {
17654 	      /* Attempt to increase the likelihood that dfinal
17655 		 shuffle will be intra-lane.  */
17656 	      std::swap (nonzero_halves[0], nonzero_halves[1]);
17657 	    }
17658 
17659 	  /* vperm2f128 or vperm2i128.  */
17660 	  for (i = 0; i < nelt2; ++i)
17661 	    {
17662 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17663 	      remap[i + nonzero_halves[0] * nelt2] = i;
17664 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17665 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17666 	    }
17667 
17668 	  if (d->vmode != V8SFmode
17669 	      && d->vmode != V4DFmode
17670 	      && d->vmode != V8SImode)
17671 	    {
17672 	      dremap.vmode = V8SImode;
17673 	      dremap.nelt = 8;
17674 	      for (i = 0; i < 4; ++i)
17675 		{
17676 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
17677 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17678 		}
17679 	    }
17680 	}
17681       else if (d->one_operand_p)
17682 	return false;
17683       else if (TARGET_AVX2
17684 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17685 	{
17686 	  /* vpunpckl* */
17687 	  for (i = 0; i < nelt4; ++i)
17688 	    {
17689 	      remap[i] = i * 2;
17690 	      remap[i + nelt] = i * 2 + 1;
17691 	      remap[i + nelt2] = i * 2 + nelt2;
17692 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17693 	      dremap.perm[i * 2] = i;
17694 	      dremap.perm[i * 2 + 1] = i + nelt;
17695 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
17696 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17697 	    }
17698 	}
17699       else if (TARGET_AVX2
17700 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17701 	{
17702 	  /* vpunpckh* */
17703 	  for (i = 0; i < nelt4; ++i)
17704 	    {
17705 	      remap[i + nelt4] = i * 2;
17706 	      remap[i + nelt + nelt4] = i * 2 + 1;
17707 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17708 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17709 	      dremap.perm[i * 2] = i + nelt4;
17710 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17711 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17712 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17713 	    }
17714 	}
17715       else
17716 	return false;
17717     }
17718 
17719   /* Use the remapping array set up above to move the elements from their
17720      swizzled locations into their final destinations.  */
17721   dfinal = *d;
17722   for (i = 0; i < nelt; ++i)
17723     {
17724       unsigned e = remap[d->perm[i]];
17725       gcc_assert (e < nelt);
17726       /* If same_halves is true, both halves of the remapped vector are the
17727 	 same.  Avoid cross-lane accesses if possible.  */
17728       if (same_halves && i >= nelt2)
17729 	{
17730 	  gcc_assert (e < nelt2);
17731 	  dfinal.perm[i] = e + nelt2;
17732 	}
17733       else
17734 	dfinal.perm[i] = e;
17735     }
17736   if (!d->testing_p)
17737     {
17738       dremap.target = gen_reg_rtx (dremap.vmode);
17739       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17740     }
17741   dfinal.op1 = dfinal.op0;
17742   dfinal.one_operand_p = true;
17743 
17744   /* Test if the final remap can be done with a single insn.  For V4SFmode or
17745      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
17746   start_sequence ();
17747   ok = expand_vec_perm_1 (&dfinal);
17748   seq = get_insns ();
17749   end_sequence ();
17750 
17751   if (!ok)
17752     return false;
17753 
17754   if (d->testing_p)
17755     return true;
17756 
17757   if (dremap.vmode != dfinal.vmode)
17758     {
17759       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17760       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17761     }
17762 
17763   ok = expand_vec_perm_1 (&dremap);
17764   gcc_assert (ok);
17765 
17766   emit_insn (seq);
17767   return true;
17768 }
17769 
17770 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17771    a single vector cross-lane permutation into vpermq followed
17772    by any of the single insn permutations.  */
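/* For illustration: if the low half of the result uses only source
   quadwords 0 and 3 and the high half does likewise, a vpermq with
   selector {0,3,0,3} gathers those quadwords first and the remaining
   intra-lane shuffle is left to expand_vec_perm_1.  */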
17773 
17774 static bool
17775 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17776 {
17777   struct expand_vec_perm_d dremap, dfinal;
17778   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17779   unsigned contents[2];
17780   bool ok;
17781 
17782   if (!(TARGET_AVX2
17783 	&& (d->vmode == V32QImode || d->vmode == V16HImode)
17784 	&& d->one_operand_p))
17785     return false;
17786 
17787   contents[0] = 0;
17788   contents[1] = 0;
17789   for (i = 0; i < nelt2; ++i)
17790     {
17791       contents[0] |= 1u << (d->perm[i] / nelt4);
17792       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17793     }
17794 
17795   for (i = 0; i < 2; ++i)
17796     {
17797       unsigned int cnt = 0;
17798       for (j = 0; j < 4; ++j)
17799 	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17800 	  return false;
17801     }
17802 
17803   if (d->testing_p)
17804     return true;
17805 
17806   dremap = *d;
17807   dremap.vmode = V4DImode;
17808   dremap.nelt = 4;
17809   dremap.target = gen_reg_rtx (V4DImode);
17810   dremap.op0 = gen_lowpart (V4DImode, d->op0);
17811   dremap.op1 = dremap.op0;
17812   dremap.one_operand_p = true;
17813   for (i = 0; i < 2; ++i)
17814     {
17815       unsigned int cnt = 0;
17816       for (j = 0; j < 4; ++j)
17817 	if ((contents[i] & (1u << j)) != 0)
17818 	  dremap.perm[2 * i + cnt++] = j;
17819       for (; cnt < 2; ++cnt)
17820 	dremap.perm[2 * i + cnt] = 0;
17821     }
17822 
17823   dfinal = *d;
17824   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17825   dfinal.op1 = dfinal.op0;
17826   dfinal.one_operand_p = true;
17827   for (i = 0, j = 0; i < nelt; ++i)
17828     {
17829       if (i == nelt2)
17830 	j = 2;
17831       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17832       if ((d->perm[i] / nelt4) == dremap.perm[j])
17833 	;
17834       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17835 	dfinal.perm[i] |= nelt4;
17836       else
17837 	gcc_unreachable ();
17838     }
17839 
17840   ok = expand_vec_perm_1 (&dremap);
17841   gcc_assert (ok);
17842 
17843   ok = expand_vec_perm_1 (&dfinal);
17844   gcc_assert (ok);
17845 
17846   return true;
17847 }
17848 
17849 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17850 
17851 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
17852    a vector permutation using two instructions, vperm2f128 resp.
17853    vperm2i128 followed by any single in-lane permutation.  */
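/* E.g. for the V4DFmode permutation {3,2,5,4}, vperm2f128 first forms
   {2,3,4,5} from the high lane of op0 and the low lane of op1, and an
   in-lane swap of that result then yields {3,2,5,4}.  */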
17854 
17855 static bool
17856 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17857 {
17858   struct expand_vec_perm_d dfirst, dsecond;
17859   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17860   bool ok;
17861 
17862   if (!TARGET_AVX
17863       || GET_MODE_SIZE (d->vmode) != 32
17864       || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17865     return false;
17866 
17867   dsecond = *d;
17868   dsecond.one_operand_p = false;
17869   dsecond.testing_p = true;
17870 
17871   /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17872      immediate.  For perm < 16 the second permutation uses
17873      d->op0 as first operand, for perm >= 16 it uses d->op1
17874      as first operand.  The second operand is the result of
17875      vperm2[fi]128.  */
17876   for (perm = 0; perm < 32; perm++)
17877     {
17878       /* Ignore permutations which do not move anything cross-lane.  */
17879       if (perm < 16)
17880 	{
17881 	  /* The second shuffle for e.g. V4DFmode has
17882 	     0123 and ABCD operands.
17883 	     Ignore AB23, as 23 is already in the second lane
17884 	     of the first operand.  */
17885 	  if ((perm & 0xc) == (1 << 2)) continue;
17886 	  /* And 01CD, as 01 is in the first lane of the first
17887 	     operand.  */
17888 	  if ((perm & 3) == 0) continue;
17889 	  /* And 4567, as then the vperm2[fi]128 doesn't change
17890 	     anything on the original 4567 second operand.  */
17891 	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17892 	}
17893       else
17894 	{
17895 	  /* The second shuffle for e.g. V4DFmode has
17896 	     4567 and ABCD operands.
17897 	     Ignore AB67, as 67 is already in the second lane
17898 	     of the first operand.  */
17899 	  if ((perm & 0xc) == (3 << 2)) continue;
17900 	  /* And 45CD, as 45 is in the first lane of the first
17901 	     operand.  */
17902 	  if ((perm & 3) == 2) continue;
17903 	  /* And 0123, as then the vperm2[fi]128 doesn't change
17904 	     anything on the original 0123 first operand.  */
17905 	  if ((perm & 0xf) == (1 << 2)) continue;
17906 	}
17907 
17908       for (i = 0; i < nelt; i++)
17909 	{
17910 	  j = d->perm[i] / nelt2;
17911 	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17912 	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17913 	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17914 	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
17915 	  else
17916 	    break;
17917 	}
17918 
17919       if (i == nelt)
17920 	{
17921 	  start_sequence ();
17922 	  ok = expand_vec_perm_1 (&dsecond);
17923 	  end_sequence ();
17924 	}
17925       else
17926 	ok = false;
17927 
17928       if (ok)
17929 	{
17930 	  if (d->testing_p)
17931 	    return true;
17932 
17933 	  /* Found a usable second shuffle.  dfirst will be
17934 	     vperm2f128 on d->op0 and d->op1.  */
17935 	  dsecond.testing_p = false;
17936 	  dfirst = *d;
17937 	  dfirst.target = gen_reg_rtx (d->vmode);
17938 	  for (i = 0; i < nelt; i++)
17939 	    dfirst.perm[i] = (i & (nelt2 - 1))
17940 			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17941 
17942 	  canonicalize_perm (&dfirst);
17943 	  ok = expand_vec_perm_1 (&dfirst);
17944 	  gcc_assert (ok);
17945 
17946 	  /* And dsecond is some single insn shuffle, taking
17947 	     d->op0 and result of vperm2f128 (if perm < 16) or
17948 	     d->op1 and result of vperm2f128 (otherwise).  */
17949 	  if (perm >= 16)
17950 	    dsecond.op0 = dsecond.op1;
17951 	  dsecond.op1 = dfirst.target;
17952 
17953 	  ok = expand_vec_perm_1 (&dsecond);
17954 	  gcc_assert (ok);
17955 
17956 	  return true;
17957 	}
17958 
17959       /* For one operand, the only useful vperm2f128 permutation is 0x01
17960 	 aka lanes swap.  */
17961       if (d->one_operand_p)
17962 	return false;
17963     }
17964 
17965   return false;
17966 }
17967 
17968 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17969    a two vector permutation using 2 intra-lane interleave insns
17970    and cross-lane shuffle for 32-byte vectors.  */
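/* E.g. for V8SImode, {0,8,1,9,2,10,3,11} is the interleave-low pattern
   and {4,12,5,13,6,14,7,15} the interleave-high pattern recognized
   below.  */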
17971 
17972 static bool
17973 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17974 {
17975   unsigned i, nelt;
17976   rtx (*gen) (rtx, rtx, rtx);
17977 
17978   if (d->one_operand_p)
17979     return false;
17980   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17981     ;
17982   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17983     ;
17984   else
17985     return false;
17986 
17987   nelt = d->nelt;
17988   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17989     return false;
17990   for (i = 0; i < nelt; i += 2)
17991     if (d->perm[i] != d->perm[0] + i / 2
17992 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17993       return false;
17994 
17995   if (d->testing_p)
17996     return true;
17997 
17998   switch (d->vmode)
17999     {
18000     case E_V32QImode:
18001       if (d->perm[0])
18002 	gen = gen_vec_interleave_highv32qi;
18003       else
18004 	gen = gen_vec_interleave_lowv32qi;
18005       break;
18006     case E_V16HImode:
18007       if (d->perm[0])
18008 	gen = gen_vec_interleave_highv16hi;
18009       else
18010 	gen = gen_vec_interleave_lowv16hi;
18011       break;
18012     case E_V8SImode:
18013       if (d->perm[0])
18014 	gen = gen_vec_interleave_highv8si;
18015       else
18016 	gen = gen_vec_interleave_lowv8si;
18017       break;
18018     case E_V4DImode:
18019       if (d->perm[0])
18020 	gen = gen_vec_interleave_highv4di;
18021       else
18022 	gen = gen_vec_interleave_lowv4di;
18023       break;
18024     case E_V8SFmode:
18025       if (d->perm[0])
18026 	gen = gen_vec_interleave_highv8sf;
18027       else
18028 	gen = gen_vec_interleave_lowv8sf;
18029       break;
18030     case E_V4DFmode:
18031       if (d->perm[0])
18032 	gen = gen_vec_interleave_highv4df;
18033       else
18034 	gen = gen_vec_interleave_lowv4df;
18035       break;
18036     default:
18037       gcc_unreachable ();
18038     }
18039 
18040   emit_insn (gen (d->target, d->op0, d->op1));
18041   return true;
18042 }
18043 
18044 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
18045    a single vector permutation using a single intra-lane vector
18046    permutation, vperm2f128 swapping the lanes and vblend* insn blending
18047    the non-swapped and swapped vectors together.  */
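/* E.g. the one-operand V4DFmode permutation {2,1,0,3}: elements 1 and 3
   are already in the correct lane, so dfirst reduces to the identity,
   dsecond swaps the lanes via vperm2f128, and vblendpd with immediate
   0x5 picks elements 0 and 2 from the swapped copy.  */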
18048 
18049 static bool
18050 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18051 {
18052   struct expand_vec_perm_d dfirst, dsecond;
18053   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18054   rtx_insn *seq;
18055   bool ok;
18056   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18057 
18058   if (!TARGET_AVX
18059       || TARGET_AVX2
18060       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18061       || !d->one_operand_p)
18062     return false;
18063 
18064   dfirst = *d;
18065   for (i = 0; i < nelt; i++)
18066     dfirst.perm[i] = 0xff;
18067   for (i = 0, msk = 0; i < nelt; i++)
18068     {
18069       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18070       if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18071 	return false;
18072       dfirst.perm[j] = d->perm[i];
18073       if (j != i)
18074 	msk |= (1 << i);
18075     }
18076   for (i = 0; i < nelt; i++)
18077     if (dfirst.perm[i] == 0xff)
18078       dfirst.perm[i] = i;
18079 
18080   if (!d->testing_p)
18081     dfirst.target = gen_reg_rtx (dfirst.vmode);
18082 
18083   start_sequence ();
18084   ok = expand_vec_perm_1 (&dfirst);
18085   seq = get_insns ();
18086   end_sequence ();
18087 
18088   if (!ok)
18089     return false;
18090 
18091   if (d->testing_p)
18092     return true;
18093 
18094   emit_insn (seq);
18095 
18096   dsecond = *d;
18097   dsecond.op0 = dfirst.target;
18098   dsecond.op1 = dfirst.target;
18099   dsecond.one_operand_p = true;
18100   dsecond.target = gen_reg_rtx (dsecond.vmode);
18101   for (i = 0; i < nelt; i++)
18102     dsecond.perm[i] = i ^ nelt2;
18103 
18104   ok = expand_vec_perm_1 (&dsecond);
18105   gcc_assert (ok);
18106 
18107   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18108   emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18109   return true;
18110 }
18111 
18112 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
18113    permutation using two vperm2f128, followed by a vshufpd insn blending
18114    the two vectors together.  */
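/* E.g. for {1,6,3,4}: dfirst selects {0,1,2,3} (op0 unchanged), dsecond
   selects {6,7,4,5} (op1 with its lanes swapped by vperm2f128), and the
   final vshufpd picks the odd elements of dfirst and the even elements
   of dsecond.  */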
18115 
18116 static bool
18117 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18118 {
18119   struct expand_vec_perm_d dfirst, dsecond, dthird;
18120   bool ok;
18121 
18122   if (!TARGET_AVX || (d->vmode != V4DFmode))
18123     return false;
18124 
18125   if (d->testing_p)
18126     return true;
18127 
18128   dfirst = *d;
18129   dsecond = *d;
18130   dthird = *d;
18131 
18132   dfirst.perm[0] = (d->perm[0] & ~1);
18133   dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18134   dfirst.perm[2] = (d->perm[2] & ~1);
18135   dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18136   dsecond.perm[0] = (d->perm[1] & ~1);
18137   dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18138   dsecond.perm[2] = (d->perm[3] & ~1);
18139   dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18140   dthird.perm[0] = (d->perm[0] % 2);
18141   dthird.perm[1] = (d->perm[1] % 2) + 4;
18142   dthird.perm[2] = (d->perm[2] % 2) + 2;
18143   dthird.perm[3] = (d->perm[3] % 2) + 6;
18144 
18145   dfirst.target = gen_reg_rtx (dfirst.vmode);
18146   dsecond.target = gen_reg_rtx (dsecond.vmode);
18147   dthird.op0 = dfirst.target;
18148   dthird.op1 = dsecond.target;
18149   dthird.one_operand_p = false;
18150 
18151   canonicalize_perm (&dfirst);
18152   canonicalize_perm (&dsecond);
18153 
18154   ok = expand_vec_perm_1 (&dfirst)
18155        && expand_vec_perm_1 (&dsecond)
18156        && expand_vec_perm_1 (&dthird);
18157 
18158   gcc_assert (ok);
18159 
18160   return true;
18161 }
18162 
18163 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18164 
18165 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
18166    a two vector permutation using two intra-lane vector
18167    permutations, vperm2f128 swapping the lanes and vblend* insn blending
18168    the non-swapped and swapped vectors together.  */
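/* E.g. the two-operand V4DFmode permutation {2,5,0,7}: dfirst collects
   the in-lane elements and reduces to op1 ({4,5,6,7}), dsecond collects
   the cross-lane elements as op0 ({0,1,2,3}), dthird swaps dsecond's
   lanes, and vblendpd with immediate 0x5 merges the two.  */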
18169 
18170 static bool
18171 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18172 {
18173   struct expand_vec_perm_d dfirst, dsecond, dthird;
18174   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18175   rtx_insn *seq1, *seq2;
18176   bool ok;
18177   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18178 
18179   if (!TARGET_AVX
18180       || TARGET_AVX2
18181       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18182       || d->one_operand_p)
18183     return false;
18184 
18185   dfirst = *d;
18186   dsecond = *d;
18187   for (i = 0; i < nelt; i++)
18188     {
18189       dfirst.perm[i] = 0xff;
18190       dsecond.perm[i] = 0xff;
18191     }
18192   for (i = 0, msk = 0; i < nelt; i++)
18193     {
18194       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18195       if (j == i)
18196 	{
18197 	  dfirst.perm[j] = d->perm[i];
18198 	  which1 |= (d->perm[i] < nelt ? 1 : 2);
18199 	}
18200       else
18201 	{
18202 	  dsecond.perm[j] = d->perm[i];
18203 	  which2 |= (d->perm[i] < nelt ? 1 : 2);
18204 	  msk |= (1U << i);
18205 	}
18206     }
18207   if (msk == 0 || msk == (1U << nelt) - 1)
18208     return false;
18209 
18210   if (!d->testing_p)
18211     {
18212       dfirst.target = gen_reg_rtx (dfirst.vmode);
18213       dsecond.target = gen_reg_rtx (dsecond.vmode);
18214     }
18215 
18216   for (i = 0; i < nelt; i++)
18217     {
18218       if (dfirst.perm[i] == 0xff)
18219 	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18220       if (dsecond.perm[i] == 0xff)
18221 	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18222     }
18223   canonicalize_perm (&dfirst);
18224   start_sequence ();
18225   ok = ix86_expand_vec_perm_const_1 (&dfirst);
18226   seq1 = get_insns ();
18227   end_sequence ();
18228 
18229   if (!ok)
18230     return false;
18231 
18232   canonicalize_perm (&dsecond);
18233   start_sequence ();
18234   ok = ix86_expand_vec_perm_const_1 (&dsecond);
18235   seq2 = get_insns ();
18236   end_sequence ();
18237 
18238   if (!ok)
18239     return false;
18240 
18241   if (d->testing_p)
18242     return true;
18243 
18244   emit_insn (seq1);
18245   emit_insn (seq2);
18246 
18247   dthird = *d;
18248   dthird.op0 = dsecond.target;
18249   dthird.op1 = dsecond.target;
18250   dthird.one_operand_p = true;
18251   dthird.target = gen_reg_rtx (dthird.vmode);
18252   for (i = 0; i < nelt; i++)
18253     dthird.perm[i] = i ^ nelt2;
18254 
18255   ok = expand_vec_perm_1 (&dthird);
18256   gcc_assert (ok);
18257 
18258   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18259   emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18260   return true;
18261 }
18262 
18263 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
18264    permutation with two pshufb insns and an ior.  We should have already
18265    failed all two instruction sequences.  */
18266 
18267 static bool
18268 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18269 {
18270   rtx rperm[2][16], vperm, l, h, op, m128;
18271   unsigned int i, nelt, eltsz;
18272 
18273   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18274     return false;
18275   gcc_assert (!d->one_operand_p);
18276 
18277   if (d->testing_p)
18278     return true;
18279 
18280   nelt = d->nelt;
18281   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18282 
18283   /* Generate two permutation masks.  If the required element is within
18284      the given vector it is shuffled into the proper lane.  If the required
18285      element is in the other vector, force a zero into the lane by setting
18286      bit 7 in the permutation mask.  */
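  /* Illustrative example (values chosen for exposition): on V16QImode
     with d->perm[0] == 3 and d->perm[1] == 17, element 0 comes from
     op0, so rperm[0][0] = 3 and rperm[1][0] = -128, while element 1
     comes from op1, so rperm[1][1] = 1 and rperm[0][1] = -128; the
     final ior of the two pshufb results merges the halves.  */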
18287   m128 = GEN_INT (-128);
18288   for (i = 0; i < nelt; ++i)
18289     {
18290       unsigned j, e = d->perm[i];
18291       unsigned which = (e >= nelt);
18292       if (e >= nelt)
18293 	e -= nelt;
18294 
18295       for (j = 0; j < eltsz; ++j)
18296 	{
18297 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18298 	  rperm[1-which][i*eltsz + j] = m128;
18299 	}
18300     }
18301 
18302   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18303   vperm = force_reg (V16QImode, vperm);
18304 
18305   l = gen_reg_rtx (V16QImode);
18306   op = gen_lowpart (V16QImode, d->op0);
18307   emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18308 
18309   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18310   vperm = force_reg (V16QImode, vperm);
18311 
18312   h = gen_reg_rtx (V16QImode);
18313   op = gen_lowpart (V16QImode, d->op1);
18314   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18315 
18316   op = d->target;
18317   if (d->vmode != V16QImode)
18318     op = gen_reg_rtx (V16QImode);
18319   emit_insn (gen_iorv16qi3 (op, l, h));
18320   if (op != d->target)
18321     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18322 
18323   return true;
18324 }
18325 
18326 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
18327    with two vpshufb insns, vpermq and vpor.  We should have already failed
18328    all two or three instruction sequences.  */
18329 
18330 static bool
18331 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18332 {
18333   rtx rperm[2][32], vperm, l, h, hp, op, m128;
18334   unsigned int i, nelt, eltsz;
18335 
18336   if (!TARGET_AVX2
18337       || !d->one_operand_p
18338       || (d->vmode != V32QImode && d->vmode != V16HImode))
18339     return false;
18340 
18341   if (d->testing_p)
18342     return true;
18343 
18344   nelt = d->nelt;
18345   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18346 
18347   /* Generate two permutation masks.  If the required element is within
18348      the same lane, it is shuffled in.  If the required element is from
18349      the other lane, force a zero by setting bit 7 in the permutation mask.
18350      The other mask has non-negative entries for the elements that are
18351      requested from the other lane; those entries are also moved to the
18352      other lane, so that the result of vpshufb can have its two V2TImode
18353      halves swapped.  */
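  /* Illustrative example (values chosen for exposition): on V32QImode
     with d->perm[0] == 16 (a byte from the other 128-bit lane), e == 0
     and which == 16, so rperm[1][16] = 0 and rperm[0][0] = -128.  The
     first vpshufb therefore places the requested byte at position 16 of
     h, the lane swap below moves it to position 0 of hp, and the final
     ior picks it up because the corresponding byte of l is zero.  */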
18354   m128 = GEN_INT (-128);
18355   for (i = 0; i < nelt; ++i)
18356     {
18357       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18358       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18359 
18360       for (j = 0; j < eltsz; ++j)
18361 	{
18362 	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18363 	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18364 	}
18365     }
18366 
18367   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18368   vperm = force_reg (V32QImode, vperm);
18369 
18370   h = gen_reg_rtx (V32QImode);
18371   op = gen_lowpart (V32QImode, d->op0);
18372   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18373 
18374   /* Swap the 128-bit lanes of h into hp.  */
18375   hp = gen_reg_rtx (V4DImode);
18376   op = gen_lowpart (V4DImode, h);
18377   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18378 				  const1_rtx));
18379 
18380   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18381   vperm = force_reg (V32QImode, vperm);
18382 
18383   l = gen_reg_rtx (V32QImode);
18384   op = gen_lowpart (V32QImode, d->op0);
18385   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18386 
18387   op = d->target;
18388   if (d->vmode != V32QImode)
18389     op = gen_reg_rtx (V32QImode);
18390   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18391   if (op != d->target)
18392     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18393 
18394   return true;
18395 }
18396 
18397 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
18398    and extract-odd permutations of two V32QImode or V16HImode operands
18399    with two vpshufb insns, vpor and vpermq.  We should have already
18400    failed all two or three instruction sequences.  */
18401 
18402 static bool
18403 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18404 {
18405   rtx rperm[2][32], vperm, l, h, ior, op, m128;
18406   unsigned int i, nelt, eltsz;
18407 
18408   if (!TARGET_AVX2
18409       || d->one_operand_p
18410       || (d->vmode != V32QImode && d->vmode != V16HImode))
18411     return false;
18412 
18413   for (i = 0; i < d->nelt; ++i)
18414     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18415       return false;
18416 
18417   if (d->testing_p)
18418     return true;
18419 
18420   nelt = d->nelt;
18421   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18422 
18423   /* Generate two permutation masks.  In the first permutation mask
18424      the first quarter will contain indexes for the first half
18425      of the op0, the second quarter will contain bit 7 set, third quarter
18426      will contain indexes for the second half of the op0 and the
18427      last quarter bit 7 set.  In the second permutation mask
18428      the first quarter will contain bit 7 set, the second quarter
18429      indexes for the first half of the op1, the third quarter bit 7 set
18430      and last quarter indexes for the second half of the op1.
18431      I.e. the first mask e.g. for V32QImode extract even will be:
18432      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18433      (all values masked with 0xf except for -128) and second mask
18434      for extract even will be
18435      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
18436   m128 = GEN_INT (-128);
18437   for (i = 0; i < nelt; ++i)
18438     {
18439       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18440       unsigned which = d->perm[i] >= nelt;
18441       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18442 
18443       for (j = 0; j < eltsz; ++j)
18444 	{
18445 	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18446 	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18447 	}
18448     }
18449 
18450   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18451   vperm = force_reg (V32QImode, vperm);
18452 
18453   l = gen_reg_rtx (V32QImode);
18454   op = gen_lowpart (V32QImode, d->op0);
18455   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18456 
18457   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18458   vperm = force_reg (V32QImode, vperm);
18459 
18460   h = gen_reg_rtx (V32QImode);
18461   op = gen_lowpart (V32QImode, d->op1);
18462   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18463 
18464   ior = gen_reg_rtx (V32QImode);
18465   emit_insn (gen_iorv32qi3 (ior, l, h));
18466 
18467   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
18468   op = gen_reg_rtx (V4DImode);
18469   ior = gen_lowpart (V4DImode, ior);
18470   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18471 				  const1_rtx, GEN_INT (3)));
18472   emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18473 
18474   return true;
18475 }
18476 
18477 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
18478    and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18479    with two "and" and "pack" or two "shift" and "pack" insns.  We should
18480    have already failed all two instruction sequences.  */
18481 
18482 static bool
18483 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18484 {
18485   rtx op, dop0, dop1, t;
18486   unsigned i, odd, c, s, nelt = d->nelt;
18487   bool end_perm = false;
18488   machine_mode half_mode;
18489   rtx (*gen_and) (rtx, rtx, rtx);
18490   rtx (*gen_pack) (rtx, rtx, rtx);
18491   rtx (*gen_shift) (rtx, rtx, rtx);
18492 
18493   if (d->one_operand_p)
18494     return false;
18495 
18496   switch (d->vmode)
18497     {
18498     case E_V8HImode:
18499       /* Required for "pack".  */
18500       if (!TARGET_SSE4_1)
18501         return false;
18502       c = 0xffff;
18503       s = 16;
18504       half_mode = V4SImode;
18505       gen_and = gen_andv4si3;
18506       gen_pack = gen_sse4_1_packusdw;
18507       gen_shift = gen_lshrv4si3;
18508       break;
18509     case E_V16QImode:
18510       /* No check as all instructions are SSE2.  */
18511       c = 0xff;
18512       s = 8;
18513       half_mode = V8HImode;
18514       gen_and = gen_andv8hi3;
18515       gen_pack = gen_sse2_packuswb;
18516       gen_shift = gen_lshrv8hi3;
18517       break;
18518     case E_V16HImode:
18519       if (!TARGET_AVX2)
18520         return false;
18521       c = 0xffff;
18522       s = 16;
18523       half_mode = V8SImode;
18524       gen_and = gen_andv8si3;
18525       gen_pack = gen_avx2_packusdw;
18526       gen_shift = gen_lshrv8si3;
18527       end_perm = true;
18528       break;
18529     case E_V32QImode:
18530       if (!TARGET_AVX2)
18531         return false;
18532       c = 0xff;
18533       s = 8;
18534       half_mode = V16HImode;
18535       gen_and = gen_andv16hi3;
18536       gen_pack = gen_avx2_packuswb;
18537       gen_shift = gen_lshrv16hi3;
18538       end_perm = true;
18539       break;
18540     default:
18541       /* Only for V8HI, V16QI, V16HI and V32QI modes is this more
18542 	 profitable than general shuffles.  */
18543       return false;
18544     }
18545 
18546   /* Check that permutation is even or odd.  */
18547   odd = d->perm[0];
18548   if (odd > 1)
18549     return false;
18550 
18551   for (i = 1; i < nelt; ++i)
18552     if (d->perm[i] != 2 * i + odd)
18553       return false;
18554 
18555   if (d->testing_p)
18556     return true;
18557 
18558   dop0 = gen_reg_rtx (half_mode);
18559   dop1 = gen_reg_rtx (half_mode);
18560   if (odd == 0)
18561     {
18562       t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18563       t = force_reg (half_mode, t);
18564       emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18565       emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18566     }
18567   else
18568     {
18569       emit_insn (gen_shift (dop0,
18570 			    gen_lowpart (half_mode, d->op0),
18571 			    GEN_INT (s)));
18572       emit_insn (gen_shift (dop1,
18573 			    gen_lowpart (half_mode, d->op1),
18574 			    GEN_INT (s)));
18575     }
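  /* Illustrative example: for V16QImode extract-even the and-mask 0x00ff
     keeps the even byte of every 16-bit word of each operand, and the
     pack then concatenates the low bytes of dop0's words with the low
     bytes of dop1's words, i.e. bytes 0, 2, ..., 14 of op0 followed by
     bytes 0, 2, ..., 14 of op1.  For extract-odd the logical shift right
     by 8 moves the odd bytes into the low positions before the pack.  */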
18576   /* In the AVX2 256-bit case we need to permute the pack result.  */
18577   if (TARGET_AVX2 && end_perm)
18578     {
18579       op = gen_reg_rtx (d->vmode);
18580       t = gen_reg_rtx (V4DImode);
18581       emit_insn (gen_pack (op, dop0, dop1));
18582       emit_insn (gen_avx2_permv4di_1 (t,
18583 				      gen_lowpart (V4DImode, op),
18584 				      const0_rtx,
18585 				      const2_rtx,
18586 				      const1_rtx,
18587 				      GEN_INT (3)));
18588       emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18589     }
18590   else
18591     emit_insn (gen_pack (d->target, dop0, dop1));
18592 
18593   return true;
18594 }
18595 
18596 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
18597    and extract-odd permutations of two V64QI operands
18598    with two "shifts", two "truncs" and one "concat" insns for "odd"
18599    and two "truncs" and one "concat" insn for "even".
18600    We should have already failed all two instruction sequences.  */
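/* Illustrative sketch: for the "odd" case each 16-bit word of the
   V32HImode views is shifted right by 8 so that the odd byte lands in
   the low byte of its word; the truncations then keep only the low byte
   of every word, and the concat of the two 32-byte halves yields the 64
   odd (resp. even) bytes.  */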
18601 
18602 static bool
18603 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18604 {
18605   rtx t1, t2, t3, t4;
18606   unsigned i, odd, nelt = d->nelt;
18607 
18608   if (!TARGET_AVX512BW
18609       || d->one_operand_p
18610       || d->vmode != V64QImode)
18611     return false;
18612 
18613   /* Check that permutation is even or odd.  */
18614   odd = d->perm[0];
18615   if (odd > 1)
18616     return false;
18617 
18618   for (i = 1; i < nelt; ++i)
18619     if (d->perm[i] != 2 * i + odd)
18620       return false;
18621 
18622   if (d->testing_p)
18623     return true;
18624 
18625 
18626   if (odd)
18627     {
18628       t1 = gen_reg_rtx (V32HImode);
18629       t2 = gen_reg_rtx (V32HImode);
18630       emit_insn (gen_lshrv32hi3 (t1,
18631 				 gen_lowpart (V32HImode, d->op0),
18632 				 GEN_INT (8)));
18633       emit_insn (gen_lshrv32hi3 (t2,
18634 				 gen_lowpart (V32HImode, d->op1),
18635 				 GEN_INT (8)));
18636     }
18637   else
18638     {
18639       t1 = gen_lowpart (V32HImode, d->op0);
18640       t2 = gen_lowpart (V32HImode, d->op1);
18641     }
18642 
18643   t3 = gen_reg_rtx (V32QImode);
18644   t4 = gen_reg_rtx (V32QImode);
18645   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18646   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18647   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18648 
18649   return true;
18650 }
18651 
18652 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
18653    and extract-odd permutations.  */
18654 
18655 static bool
18656 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18657 {
18658   rtx t1, t2, t3, t4, t5;
18659 
18660   switch (d->vmode)
18661     {
18662     case E_V4DFmode:
18663       if (d->testing_p)
18664 	break;
18665       t1 = gen_reg_rtx (V4DFmode);
18666       t2 = gen_reg_rtx (V4DFmode);
18667 
18668       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
18669       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18670       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18671 
18672       /* Now an unpck[lh]pd will produce the result required.  */
18673       if (odd)
18674 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18675       else
18676 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18677       emit_insn (t3);
18678       break;
18679 
18680     case E_V8SFmode:
18681       {
18682 	int mask = odd ? 0xdd : 0x88;
18683 
18684 	if (d->testing_p)
18685 	  break;
18686 	t1 = gen_reg_rtx (V8SFmode);
18687 	t2 = gen_reg_rtx (V8SFmode);
18688 	t3 = gen_reg_rtx (V8SFmode);
18689 
18690 	/* Shuffle within the 128-bit lanes to produce:
18691 	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
18692 	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18693 				      GEN_INT (mask)));
18694 
18695 	/* Shuffle the lanes around to produce:
18696 	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
18697 	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18698 					    GEN_INT (0x3)));
18699 
18700 	/* Shuffle within the 128-bit lanes to produce:
18701 	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
18702 	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18703 
18704 	/* Shuffle within the 128-bit lanes to produce:
18705 	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
18706 	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18707 
18708 	/* Shuffle the lanes around to produce:
18709 	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
18710 	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18711 					    GEN_INT (0x20)));
18712       }
18713       break;
18714 
18715     case E_V2DFmode:
18716     case E_V4SFmode:
18717     case E_V2DImode:
18718     case E_V4SImode:
18719       /* These are always directly implementable by expand_vec_perm_1.  */
18720       gcc_unreachable ();
18721 
18722     case E_V8HImode:
18723       if (TARGET_SSE4_1)
18724 	return expand_vec_perm_even_odd_pack (d);
18725       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18726 	return expand_vec_perm_pshufb2 (d);
18727       else
18728 	{
18729 	  if (d->testing_p)
18730 	    break;
18731 	  /* We need 2*log2(N)-1 operations to achieve odd/even
18732 	     with interleave. */
18733 	  t1 = gen_reg_rtx (V8HImode);
18734 	  t2 = gen_reg_rtx (V8HImode);
18735 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18736 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18737 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18738 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18739 	  if (odd)
18740 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18741 	  else
18742 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18743 	  emit_insn (t3);
18744 	}
18745       break;
18746 
18747     case E_V16QImode:
18748       return expand_vec_perm_even_odd_pack (d);
18749 
18750     case E_V16HImode:
18751     case E_V32QImode:
18752       return expand_vec_perm_even_odd_pack (d);
18753 
18754     case E_V64QImode:
18755       return expand_vec_perm_even_odd_trunc (d);
18756 
18757     case E_V4DImode:
18758       if (!TARGET_AVX2)
18759 	{
18760 	  struct expand_vec_perm_d d_copy = *d;
18761 	  d_copy.vmode = V4DFmode;
18762 	  if (d->testing_p)
18763 	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18764 	  else
18765 	    d_copy.target = gen_reg_rtx (V4DFmode);
18766 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18767 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18768 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18769 	    {
18770 	      if (!d->testing_p)
18771 		emit_move_insn (d->target,
18772 				gen_lowpart (V4DImode, d_copy.target));
18773 	      return true;
18774 	    }
18775 	  return false;
18776 	}
18777 
18778       if (d->testing_p)
18779 	break;
18780 
18781       t1 = gen_reg_rtx (V4DImode);
18782       t2 = gen_reg_rtx (V4DImode);
18783 
18784       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
18785       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18786       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18787 
18788       /* Now a vpunpck[lh]qdq will produce the result required.  */
18789       if (odd)
18790 	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18791       else
18792 	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18793       emit_insn (t3);
18794       break;
18795 
18796     case E_V8SImode:
18797       if (!TARGET_AVX2)
18798 	{
18799 	  struct expand_vec_perm_d d_copy = *d;
18800 	  d_copy.vmode = V8SFmode;
18801 	  if (d->testing_p)
18802 	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18803 	  else
18804 	    d_copy.target = gen_reg_rtx (V8SFmode);
18805 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18806 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18807 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18808 	    {
18809 	      if (!d->testing_p)
18810 		emit_move_insn (d->target,
18811 				gen_lowpart (V8SImode, d_copy.target));
18812 	      return true;
18813 	    }
18814 	  return false;
18815 	}
18816 
18817       if (d->testing_p)
18818 	break;
18819 
18820       t1 = gen_reg_rtx (V8SImode);
18821       t2 = gen_reg_rtx (V8SImode);
18822       t3 = gen_reg_rtx (V4DImode);
18823       t4 = gen_reg_rtx (V4DImode);
18824       t5 = gen_reg_rtx (V4DImode);
18825 
18826       /* Shuffle the lanes around into
18827 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
18828       emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18829 				    gen_lowpart (V4DImode, d->op1),
18830 				    GEN_INT (0x20)));
18831       emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18832 				    gen_lowpart (V4DImode, d->op1),
18833 				    GEN_INT (0x31)));
18834 
18835       /* Swap the 2nd and 3rd position in each lane into
18836 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
18837       emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18838 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18839       emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18840 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18841 
18842       /* Now a vpunpck[lh]qdq will produce
18843 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
18844       if (odd)
18845 	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18846 					   gen_lowpart (V4DImode, t2));
18847       else
18848 	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18849 					  gen_lowpart (V4DImode, t2));
18850       emit_insn (t3);
18851       emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18852       break;
18853 
18854     default:
18855       gcc_unreachable ();
18856     }
18857 
18858   return true;
18859 }
18860 
18861 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
18862    extract-even and extract-odd permutations.  */
18863 
18864 static bool
18865 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18866 {
18867   unsigned i, odd, nelt = d->nelt;
18868 
18869   odd = d->perm[0];
18870   if (odd != 0 && odd != 1)
18871     return false;
18872 
18873   for (i = 1; i < nelt; ++i)
18874     if (d->perm[i] != 2 * i + odd)
18875       return false;
18876 
18877   return expand_vec_perm_even_odd_1 (d, odd);
18878 }
18879 
18880 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
18881    permutations.  We assume that expand_vec_perm_1 has already failed.  */
18882 
18883 static bool
18884 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18885 {
18886   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18887   machine_mode vmode = d->vmode;
18888   unsigned char perm2[4];
18889   rtx op0 = d->op0, dest;
18890   bool ok;
18891 
18892   switch (vmode)
18893     {
18894     case E_V4DFmode:
18895     case E_V8SFmode:
18896       /* These are special-cased in sse.md so that we can optionally
18897 	 use the vbroadcast instruction.  They expand to two insns
18898 	 if the input happens to be in a register.  */
18899       gcc_unreachable ();
18900 
18901     case E_V2DFmode:
18902     case E_V2DImode:
18903     case E_V4SFmode:
18904     case E_V4SImode:
18905       /* These are always implementable using standard shuffle patterns.  */
18906       gcc_unreachable ();
18907 
18908     case E_V8HImode:
18909     case E_V16QImode:
18910       /* These can be implemented via interleave.  We save one insn by
18911 	 stopping once we have promoted to V4SImode and then using pshufd.  */
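      /* Illustrative example: broadcasting byte 5 of a V16QImode vector
	 first interleaves the low halves (elt stays 5, now a V8HImode
	 word), then interleaves the high halves (elt becomes 1, now a
	 V4SImode element holding four copies of the byte), and a final
	 pshufd with { 1, 1, 1, 1 } replicates it across the vector.  */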
18912       if (d->testing_p)
18913 	return true;
18914       do
18915 	{
18916 	  rtx dest;
18917 	  rtx (*gen) (rtx, rtx, rtx)
18918 	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18919 				 : gen_vec_interleave_lowv8hi;
18920 
18921 	  if (elt >= nelt2)
18922 	    {
18923 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18924 				       : gen_vec_interleave_highv8hi;
18925 	      elt -= nelt2;
18926 	    }
18927 	  nelt2 /= 2;
18928 
18929 	  dest = gen_reg_rtx (vmode);
18930 	  emit_insn (gen (dest, op0, op0));
18931 	  vmode = get_mode_wider_vector (vmode);
18932 	  op0 = gen_lowpart (vmode, dest);
18933 	}
18934       while (vmode != V4SImode);
18935 
18936       memset (perm2, elt, 4);
18937       dest = gen_reg_rtx (V4SImode);
18938       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18939       gcc_assert (ok);
18940       if (!d->testing_p)
18941 	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18942       return true;
18943 
18944     case E_V64QImode:
18945     case E_V32QImode:
18946     case E_V16HImode:
18947     case E_V8SImode:
18948     case E_V4DImode:
18949       /* For AVX2 broadcasts of the first element vpbroadcast* or
18950 	 vpermq should be used by expand_vec_perm_1.  */
18951       gcc_assert (!TARGET_AVX2 || d->perm[0]);
18952       return false;
18953 
18954     default:
18955       gcc_unreachable ();
18956     }
18957 }
18958 
18959 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
18960    broadcast permutations.  */
18961 
18962 static bool
18963 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18964 {
18965   unsigned i, elt, nelt = d->nelt;
18966 
18967   if (!d->one_operand_p)
18968     return false;
18969 
18970   elt = d->perm[0];
18971   for (i = 1; i < nelt; ++i)
18972     if (d->perm[i] != elt)
18973       return false;
18974 
18975   return expand_vec_perm_broadcast_1 (d);
18976 }
18977 
18978 /* Implement arbitrary permutations of two V64QImode operands
18979    with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
18980 static bool
18981 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18982 {
18983   if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
18984     return false;
18985 
18986   if (d->testing_p)
18987     return true;
18988 
18989   struct expand_vec_perm_d ds[2];
18990   rtx rperm[128], vperm, target0, target1;
18991   unsigned int i, nelt;
18992   machine_mode vmode;
18993 
18994   nelt = d->nelt;
18995   vmode = V64QImode;
18996 
18997   for (i = 0; i < 2; i++)
18998     {
18999       ds[i] = *d;
19000       ds[i].vmode = V32HImode;
19001       ds[i].nelt = 32;
19002       ds[i].target = gen_reg_rtx (V32HImode);
19003       ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19004       ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19005     }
19006 
19007   /* Prepare permutations such that the first one takes care of
19008      putting the even bytes into the right positions or one position
19009      higher (ds[0]) and the second one takes care of
19010      putting the odd bytes into the right positions or one position
19011      lower (ds[1]).  */
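  /* Illustrative example (values chosen for exposition): if
     d->perm[0] == 5, then ds[0].perm[0] == 2, so the ds[0] word
     permutation places word 2 of op0 (bytes 4-5) into word 0;
     rperm[0] == 1 then makes the vpshufb pick the high byte of that
     word, i.e. the original byte 5, while the corresponding byte of the
     second shuffle is zeroed (rperm[64] == -1) so the final vpor keeps
     it.  */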
19012 
19013   for (i = 0; i < nelt; i++)
19014     {
19015       ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19016       if (i & 1)
19017 	{
19018 	  rperm[i] = constm1_rtx;
19019 	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19020 	}
19021       else
19022 	{
19023 	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19024 	  rperm[i + 64] = constm1_rtx;
19025 	}
19026     }
19027 
19028   bool ok = expand_vec_perm_1 (&ds[0]);
19029   gcc_assert (ok);
19030   ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19031 
19032   ok = expand_vec_perm_1 (&ds[1]);
19033   gcc_assert (ok);
19034   ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19035 
19036   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19037   vperm = force_reg (vmode, vperm);
19038   target0 = gen_reg_rtx (V64QImode);
19039   emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19040 
19041   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19042   vperm = force_reg (vmode, vperm);
19043   target1 = gen_reg_rtx (V64QImode);
19044   emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19045 
19046   emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19047   return true;
19048 }
19049 
19050 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
19051    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
19052    all the shorter instruction sequences.  */
19053 
19054 static bool
19055 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19056 {
19057   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19058   unsigned int i, nelt, eltsz;
19059   bool used[4];
19060 
19061   if (!TARGET_AVX2
19062       || d->one_operand_p
19063       || (d->vmode != V32QImode && d->vmode != V16HImode))
19064     return false;
19065 
19066   if (d->testing_p)
19067     return true;
19068 
19069   nelt = d->nelt;
19070   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19071 
19072   /* Generate 4 permutation masks.  If the required element is within
19073      the same lane, it is shuffled in.  If the required element is from
19074      the other lane, force a zero by setting bit 7 in the permutation mask.
19075      The other masks have non-negative entries for the elements that are
19076      requested from the other lane; those entries are also moved to the
19077      other lane, so that the result of vpshufb can have its two V2TImode
19078      halves swapped.  */
19079   m128 = GEN_INT (-128);
19080   for (i = 0; i < 32; ++i)
19081     {
19082       rperm[0][i] = m128;
19083       rperm[1][i] = m128;
19084       rperm[2][i] = m128;
19085       rperm[3][i] = m128;
19086     }
19087   used[0] = false;
19088   used[1] = false;
19089   used[2] = false;
19090   used[3] = false;
19091   for (i = 0; i < nelt; ++i)
19092     {
19093       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19094       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19095       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19096 
19097       for (j = 0; j < eltsz; ++j)
19098 	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19099       used[which] = true;
19100     }
19101 
19102   for (i = 0; i < 2; ++i)
19103     {
19104       if (!used[2 * i + 1])
19105 	{
19106 	  h[i] = NULL_RTX;
19107 	  continue;
19108 	}
19109       vperm = gen_rtx_CONST_VECTOR (V32QImode,
19110 				    gen_rtvec_v (32, rperm[2 * i + 1]));
19111       vperm = force_reg (V32QImode, vperm);
19112       h[i] = gen_reg_rtx (V32QImode);
19113       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19114       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19115     }
19116 
19117   /* Swap the 128-bit lanes of h[X].  */
19118   for (i = 0; i < 2; ++i)
19119    {
19120      if (h[i] == NULL_RTX)
19121        continue;
19122      op = gen_reg_rtx (V4DImode);
19123      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19124 				     const2_rtx, GEN_INT (3), const0_rtx,
19125 				     const1_rtx));
19126      h[i] = gen_lowpart (V32QImode, op);
19127    }
19128 
19129   for (i = 0; i < 2; ++i)
19130     {
19131       if (!used[2 * i])
19132 	{
19133 	  l[i] = NULL_RTX;
19134 	  continue;
19135 	}
19136       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19137       vperm = force_reg (V32QImode, vperm);
19138       l[i] = gen_reg_rtx (V32QImode);
19139       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19140       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19141     }
19142 
19143   for (i = 0; i < 2; ++i)
19144     {
19145       if (h[i] && l[i])
19146 	{
19147 	  op = gen_reg_rtx (V32QImode);
19148 	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19149 	  l[i] = op;
19150 	}
19151       else if (h[i])
19152 	l[i] = h[i];
19153     }
19154 
19155   gcc_assert (l[0] && l[1]);
19156   op = d->target;
19157   if (d->vmode != V32QImode)
19158     op = gen_reg_rtx (V32QImode);
19159   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19160   if (op != d->target)
19161     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19162   return true;
19163 }
19164 
19165 /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
19166    taken care of, perform the expansion in D and return true on success.  */
19167 
19168 static bool
19169 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19170 {
19171   /* Try a single instruction expansion.  */
19172   if (expand_vec_perm_1 (d))
19173     return true;
19174 
19175   /* Try sequences of two instructions.  */
19176 
19177   if (expand_vec_perm_pshuflw_pshufhw (d))
19178     return true;
19179 
19180   if (expand_vec_perm_palignr (d, false))
19181     return true;
19182 
19183   if (expand_vec_perm_interleave2 (d))
19184     return true;
19185 
19186   if (expand_vec_perm_broadcast (d))
19187     return true;
19188 
19189   if (expand_vec_perm_vpermq_perm_1 (d))
19190     return true;
19191 
19192   if (expand_vec_perm_vperm2f128 (d))
19193     return true;
19194 
19195   if (expand_vec_perm_pblendv (d))
19196     return true;
19197 
19198   /* Try sequences of three instructions.  */
19199 
19200   if (expand_vec_perm_even_odd_pack (d))
19201     return true;
19202 
19203   if (expand_vec_perm_2vperm2f128_vshuf (d))
19204     return true;
19205 
19206   if (expand_vec_perm_pshufb2 (d))
19207     return true;
19208 
19209   if (expand_vec_perm_interleave3 (d))
19210     return true;
19211 
19212   if (expand_vec_perm_vperm2f128_vblend (d))
19213     return true;
19214 
19215   /* Try sequences of four instructions.  */
19216 
19217   if (expand_vec_perm_even_odd_trunc (d))
19218     return true;
19219   if (expand_vec_perm_vpshufb2_vpermq (d))
19220     return true;
19221 
19222   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19223     return true;
19224 
19225   if (expand_vec_perm_vpermt2_vpshub2 (d))
19226     return true;
19227 
19228   /* ??? Look for narrow permutations whose element orderings would
19229      allow the promotion to a wider mode.  */
19230 
19231   /* ??? Look for sequences of interleave or a wider permute that place
19232      the data into the correct lanes for a half-vector shuffle like
19233      pshuf[lh]w or vpermilps.  */
19234 
19235   /* ??? Look for sequences of interleave that produce the desired results.
19236      The combinatorics of punpck[lh] get pretty ugly... */
19237 
19238   if (expand_vec_perm_even_odd (d))
19239     return true;
19240 
19241   /* Even longer sequences.  */
19242   if (expand_vec_perm_vpshufb4_vpermq2 (d))
19243     return true;
19244 
19245   /* See if we can get the same permutation in different vector integer
19246      mode.  */
19247   struct expand_vec_perm_d nd;
19248   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19249     {
19250       if (!d->testing_p)
19251 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19252       return true;
19253     }
19254 
19255   /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
19256   if (expand_vec_perm2_vperm2f128_vblend (d))
19257     return true;
19258 
19259   return false;
19260 }
19261 
19262 /* If a permutation only uses one operand, make it clear. Returns true
19263    if the permutation references both operands.  */
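/* Illustrative example: with nelt == 4 and perm == { 5, 6, 4, 7 } every
   element comes from the second operand, so the permutation is folded to
   { 1, 2, 0, 3 } with op0 = op1 and false is returned; a selector such
   as { 0, 5, 1, 4 } keeps both operands and returns true.  */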
19264 
19265 static bool
19266 canonicalize_perm (struct expand_vec_perm_d *d)
19267 {
19268   int i, which, nelt = d->nelt;
19269 
19270   for (i = which = 0; i < nelt; ++i)
19271     which |= (d->perm[i] < nelt ? 1 : 2);
19272 
19273   d->one_operand_p = true;
19274   switch (which)
19275     {
19276     default:
19277       gcc_unreachable();
19278 
19279     case 3:
19280       if (!rtx_equal_p (d->op0, d->op1))
19281         {
19282 	  d->one_operand_p = false;
19283 	  break;
19284         }
19285       /* The elements of PERM do not suggest that only the first operand
19286 	 is used, but both operands are identical.  Allow easier matching
19287 	 of the permutation by folding the permutation into the single
19288 	 input vector.  */
19289       /* FALLTHRU */
19290 
19291     case 2:
19292       for (i = 0; i < nelt; ++i)
19293         d->perm[i] &= nelt - 1;
19294       d->op0 = d->op1;
19295       break;
19296 
19297     case 1:
19298       d->op1 = d->op0;
19299       break;
19300     }
19301 
19302   return (which == 3);
19303 }
19304 
19305 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
19306 
19307 bool
19308 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19309 			       rtx op1, const vec_perm_indices &sel)
19310 {
19311   struct expand_vec_perm_d d;
19312   unsigned char perm[MAX_VECT_LEN];
19313   unsigned int i, nelt, which;
19314   bool two_args;
19315 
19316   d.target = target;
19317   d.op0 = op0;
19318   d.op1 = op1;
19319 
19320   d.vmode = vmode;
19321   gcc_assert (VECTOR_MODE_P (d.vmode));
19322   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19323   d.testing_p = !target;
19324 
19325   gcc_assert (sel.length () == nelt);
19326   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19327 
19328   /* Given sufficient ISA support we can just return true here
19329      for selected vector modes.  */
19330   switch (d.vmode)
19331     {
19332     case E_V16SFmode:
19333     case E_V16SImode:
19334     case E_V8DImode:
19335     case E_V8DFmode:
19336       if (!TARGET_AVX512F)
19337 	return false;
19338       /* All implementable with a single vperm[it]2 insn.  */
19339       if (d.testing_p)
19340 	return true;
19341       break;
19342     case E_V32HImode:
19343       if (!TARGET_AVX512BW)
19344 	return false;
19345       if (d.testing_p)
19346 	/* All implementable with a single vperm[it]2 insn.  */
19347 	return true;
19348       break;
19349     case E_V64QImode:
19350       if (!TARGET_AVX512BW)
19351 	return false;
19352       if (d.testing_p)
19353 	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
19354 	return true;
19355       break;
19356     case E_V8SImode:
19357     case E_V8SFmode:
19358     case E_V4DFmode:
19359     case E_V4DImode:
19360       if (!TARGET_AVX)
19361 	return false;
19362       if (d.testing_p && TARGET_AVX512VL)
19363 	/* All implementable with a single vperm[it]2 insn.  */
19364 	return true;
19365       break;
19366     case E_V16HImode:
19367       if (!TARGET_SSE2)
19368 	return false;
19369       if (d.testing_p && TARGET_AVX2)
19370 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
19371 	return true;
19372       break;
19373     case E_V32QImode:
19374       if (!TARGET_SSE2)
19375 	return false;
19376       if (d.testing_p && TARGET_AVX2)
19377 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
19378 	return true;
19379       break;
19380     case E_V8HImode:
19381     case E_V16QImode:
19382       if (!TARGET_SSE2)
19383 	return false;
19384       /* Fall through.  */
19385     case E_V4SImode:
19386     case E_V4SFmode:
19387       if (!TARGET_SSE)
19388 	return false;
19389       /* All implementable with a single vpperm insn.  */
19390       if (d.testing_p && TARGET_XOP)
19391 	return true;
19392       /* All implementable with 2 pshufb + 1 ior.  */
19393       if (d.testing_p && TARGET_SSSE3)
19394 	return true;
19395       break;
19396     case E_V2DImode:
19397     case E_V2DFmode:
19398       if (!TARGET_SSE)
19399 	return false;
19400       /* All implementable with shufpd or unpck[lh]pd.  */
19401       if (d.testing_p)
19402 	return true;
19403       break;
19404     default:
19405       return false;
19406     }
19407 
19408   for (i = which = 0; i < nelt; ++i)
19409     {
19410       unsigned char e = sel[i];
19411       gcc_assert (e < 2 * nelt);
19412       d.perm[i] = e;
19413       perm[i] = e;
19414       which |= (e < nelt ? 1 : 2);
19415     }
19416 
19417   if (d.testing_p)
19418     {
19419       /* If all elements come from the second vector, fold them to the first.  */
19420       if (which == 2)
19421 	for (i = 0; i < nelt; ++i)
19422 	  d.perm[i] -= nelt;
19423 
19424       /* Check whether the mask can be applied to the vector type.  */
19425       d.one_operand_p = (which != 3);
19426 
19427       /* Implementable with shufps or pshufd.  */
19428       if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
19429 	return true;
19430 
19431       /* Otherwise we have to go through the motions and see if we can
19432 	 figure out how to generate the requested permutation.  */
19433       d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19434       d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19435       if (!d.one_operand_p)
19436 	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19437 
19438       start_sequence ();
19439       bool ret = ix86_expand_vec_perm_const_1 (&d);
19440       end_sequence ();
19441 
19442       return ret;
19443     }
19444 
19445   two_args = canonicalize_perm (&d);
19446 
19447   if (ix86_expand_vec_perm_const_1 (&d))
19448     return true;
19449 
19450   /* If the selector says both arguments are needed, but the operands are the
19451      same, the above tried to expand with one_operand_p and flattened selector.
19452      If that didn't work, retry without one_operand_p; we succeeded with that
19453      during testing.  */
19454   if (two_args && d.one_operand_p)
19455     {
19456       d.one_operand_p = false;
19457       memcpy (d.perm, perm, sizeof (perm));
19458       return ix86_expand_vec_perm_const_1 (&d);
19459     }
19460 
19461   return false;
19462 }
19463 
19464 void
19465 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19466 {
19467   struct expand_vec_perm_d d;
19468   unsigned i, nelt;
19469 
19470   d.target = targ;
19471   d.op0 = op0;
19472   d.op1 = op1;
19473   d.vmode = GET_MODE (targ);
19474   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19475   d.one_operand_p = false;
19476   d.testing_p = false;
19477 
19478   for (i = 0; i < nelt; ++i)
19479     d.perm[i] = i * 2 + odd;
19480 
19481   /* We'll either be able to implement the permutation directly...  */
19482   if (expand_vec_perm_1 (&d))
19483     return;
19484 
19485   /* ... or we use the special-case patterns.  */
19486   expand_vec_perm_even_odd_1 (&d, odd);
19487 }
19488 
19489 static void
19490 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19491 {
19492   struct expand_vec_perm_d d;
19493   unsigned i, nelt, base;
19494   bool ok;
19495 
19496   d.target = targ;
19497   d.op0 = op0;
19498   d.op1 = op1;
19499   d.vmode = GET_MODE (targ);
19500   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19501   d.one_operand_p = false;
19502   d.testing_p = false;
19503 
19504   base = high_p ? nelt / 2 : 0;
19505   for (i = 0; i < nelt / 2; ++i)
19506     {
19507       d.perm[i * 2] = i + base;
19508       d.perm[i * 2 + 1] = i + base + nelt;
19509     }
19510 
19511   /* Note that for AVX this isn't one instruction.  */
19512   ok = ix86_expand_vec_perm_const_1 (&d);
19513   gcc_assert (ok);
19514 }
19515 
19516 
19517 /* Expand a vector operation CODE for a V*QImode in terms of the
19518    same operation on V*HImode.  */
19519 
19520 void
19521 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19522 {
19523   machine_mode qimode = GET_MODE (dest);
19524   machine_mode himode;
19525   rtx (*gen_il) (rtx, rtx, rtx);
19526   rtx (*gen_ih) (rtx, rtx, rtx);
19527   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19528   struct expand_vec_perm_d d;
19529   bool ok, full_interleave;
19530   bool uns_p = false;
19531   int i;
19532 
19533   switch (qimode)
19534     {
19535     case E_V16QImode:
19536       himode = V8HImode;
19537       gen_il = gen_vec_interleave_lowv16qi;
19538       gen_ih = gen_vec_interleave_highv16qi;
19539       break;
19540     case E_V32QImode:
19541       himode = V16HImode;
19542       gen_il = gen_avx2_interleave_lowv32qi;
19543       gen_ih = gen_avx2_interleave_highv32qi;
19544       break;
19545     case E_V64QImode:
19546       himode = V32HImode;
19547       gen_il = gen_avx512bw_interleave_lowv64qi;
19548       gen_ih = gen_avx512bw_interleave_highv64qi;
19549       break;
19550     default:
19551       gcc_unreachable ();
19552     }
19553 
19554   op2_l = op2_h = op2;
19555   switch (code)
19556     {
19557     case MULT:
19558       /* Unpack data such that we've got a source byte in each low byte of
19559 	 each word.  We don't care what goes into the high byte of each word.
19560 	 Rather than trying to get zero in there, most convenient is to let
19561 	 Rather than trying to get zero in there, it is most convenient to let
19562       op2_l = gen_reg_rtx (qimode);
19563       op2_h = gen_reg_rtx (qimode);
19564       emit_insn (gen_il (op2_l, op2, op2));
19565       emit_insn (gen_ih (op2_h, op2, op2));
19566 
19567       op1_l = gen_reg_rtx (qimode);
19568       op1_h = gen_reg_rtx (qimode);
19569       emit_insn (gen_il (op1_l, op1, op1));
19570       emit_insn (gen_ih (op1_h, op1, op1));
19571       full_interleave = qimode == V16QImode;
19572       break;
19573 
19574     case ASHIFT:
19575     case LSHIFTRT:
19576       uns_p = true;
19577       /* FALLTHRU */
19578     case ASHIFTRT:
19579       op1_l = gen_reg_rtx (himode);
19580       op1_h = gen_reg_rtx (himode);
19581       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19582       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19583       full_interleave = true;
19584       break;
19585     default:
19586       gcc_unreachable ();
19587     }
19588 
19589   /* Perform the operation.  */
19590   res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19591 			       1, OPTAB_DIRECT);
19592   res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19593 			       1, OPTAB_DIRECT);
19594   gcc_assert (res_l && res_h);
19595 
19596   /* Merge the data back into the right place.  */
19597   d.target = dest;
19598   d.op0 = gen_lowpart (qimode, res_l);
19599   d.op1 = gen_lowpart (qimode, res_h);
19600   d.vmode = qimode;
19601   d.nelt = GET_MODE_NUNITS (qimode);
19602   d.one_operand_p = false;
19603   d.testing_p = false;
19604 
19605   if (full_interleave)
19606     {
19607       /* For SSE2, we used a full interleave, so the desired
19608 	 results are in the even elements.  */
19609       for (i = 0; i < d.nelt; ++i)
19610 	d.perm[i] = i * 2;
19611     }
19612   else
19613     {
19614       /* For AVX, the interleave used above was not cross-lane.  So the
19615 	 extraction is evens but with the second and third quarter swapped.
19616 	 Happily, that is even one insn shorter than even extraction.
19617 	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
19618 	 always first from the first and then from the second source operand,
19619 	 the index bits above the low 4 bits remains the same.
19620 	 the index bits above the low 4 bits remain the same.
19621 	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19622 	 and for d.nelt == 64 we want permutation
19623 	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19624 	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
19625       for (i = 0; i < d.nelt; ++i)
19626 	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19627     }
19628 
19629   ok = ix86_expand_vec_perm_const_1 (&d);
19630   gcc_assert (ok);
19631 
19632   set_unique_reg_note (get_last_insn (), REG_EQUAL,
19633 		       gen_rtx_fmt_ee (code, qimode, op1, op2));
19634 }
19635 
19636 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
19637    if op is CONST_VECTOR with all odd elements equal to their
19638    preceding element.  */
19639 
19640 static bool
19641 const_vector_equal_evenodd_p (rtx op)
19642 {
19643   machine_mode mode = GET_MODE (op);
19644   int i, nunits = GET_MODE_NUNITS (mode);
19645   if (GET_CODE (op) != CONST_VECTOR
19646       || nunits != CONST_VECTOR_NUNITS (op))
19647     return false;
19648   for (i = 0; i < nunits; i += 2)
19649     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19650       return false;
19651   return true;
19652 }
19653 
19654 void
19655 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19656 			       bool uns_p, bool odd_p)
19657 {
19658   machine_mode mode = GET_MODE (op1);
19659   machine_mode wmode = GET_MODE (dest);
19660   rtx x;
19661   rtx orig_op1 = op1, orig_op2 = op2;
19662 
19663   if (!nonimmediate_operand (op1, mode))
19664     op1 = force_reg (mode, op1);
19665   if (!nonimmediate_operand (op2, mode))
19666     op2 = force_reg (mode, op2);
19667 
19668   /* We only play even/odd games with vectors of SImode.  */
19669   gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19670 
19671   /* If we're looking for the odd results, shift those members down to
19672      the even slots.  For some cpus this is faster than a PSHUFD.  */
19673   if (odd_p)
19674     {
19675       /* For XOP use vpmacsdqh, but only for smult, as it is only
19676 	 signed.  */
19677       if (TARGET_XOP && mode == V4SImode && !uns_p)
19678 	{
19679 	  x = force_reg (wmode, CONST0_RTX (wmode));
19680 	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19681 	  return;
19682 	}
19683 
19684       x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19685       if (!const_vector_equal_evenodd_p (orig_op1))
19686 	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19687 			    x, NULL, 1, OPTAB_DIRECT);
19688       if (!const_vector_equal_evenodd_p (orig_op2))
19689 	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19690 			    x, NULL, 1, OPTAB_DIRECT);
19691       op1 = gen_lowpart (mode, op1);
19692       op2 = gen_lowpart (mode, op2);
19693     }
19694 
19695   if (mode == V16SImode)
19696     {
19697       if (uns_p)
19698 	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19699       else
19700 	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19701     }
19702   else if (mode == V8SImode)
19703     {
19704       if (uns_p)
19705 	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19706       else
19707 	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19708     }
19709   else if (uns_p)
19710     x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19711   else if (TARGET_SSE4_1)
19712     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19713   else
19714     {
19715       rtx s1, s2, t0, t1, t2;
19716 
19717       /* The easiest way to implement this without PMULDQ is to go through
19718 	 the motions as if we are performing a full 64-bit multiply.  With
19719 	 the exception that we need to do less shuffling of the elements.  */
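      /* Sketch of the identity used (added for illustration): with sign
	 masks s1 = (a < 0 ? -1 : 0) and s2 = (b < 0 ? -1 : 0), the signed
	 product satisfies, modulo 2^64,
	   a * b == umull (a, b) + ((umull (s1, b) + umull (s2, a)) << 32)
	 where umull is the unsigned 32x32->64 widening multiply; the code
	 below computes exactly these three products.  */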
19720 
19721       /* Compute the sign-extension, aka highparts, of the two operands.  */
19722       s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19723 				op1, pc_rtx, pc_rtx);
19724       s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19725 				op2, pc_rtx, pc_rtx);
19726 
19727       /* Multiply LO(A) * HI(B), and vice-versa.  */
19728       t1 = gen_reg_rtx (wmode);
19729       t2 = gen_reg_rtx (wmode);
19730       emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19731       emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19732 
19733       /* Multiply LO(A) * LO(B).  */
19734       t0 = gen_reg_rtx (wmode);
19735       emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19736 
19737       /* Combine and shift the highparts into place.  */
19738       t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19739       t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19740 			 1, OPTAB_DIRECT);
19741 
19742       /* Combine high and low parts.  */
19743       force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19744       return;
19745     }
19746   emit_insn (x);
19747 }
19748 
19749 void
19750 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19751 			    bool uns_p, bool high_p)
19752 {
19753   machine_mode wmode = GET_MODE (dest);
19754   machine_mode mode = GET_MODE (op1);
19755   rtx t1, t2, t3, t4, mask;
19756 
19757   switch (mode)
19758     {
19759     case E_V4SImode:
19760       t1 = gen_reg_rtx (mode);
19761       t2 = gen_reg_rtx (mode);
19762       if (TARGET_XOP && !uns_p)
19763 	{
19764 	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
19765 	     shuffle the elements once so that all elements are in the right
19766 	     place for immediate use: { A C B D }.  */
19767 	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19768 					const1_rtx, GEN_INT (3)));
19769 	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19770 					const1_rtx, GEN_INT (3)));
19771 	}
19772       else
19773 	{
19774 	  /* Put the elements into place for the multiply.  */
19775 	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
19776 	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
19777 	  high_p = false;
19778 	}
19779       ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19780       break;
19781 
19782     case E_V8SImode:
19783       /* Shuffle the elements between the lanes.  After this we
19784 	 have { A B E F | C D G H } for each operand.  */
19785       t1 = gen_reg_rtx (V4DImode);
19786       t2 = gen_reg_rtx (V4DImode);
19787       emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19788 				      const0_rtx, const2_rtx,
19789 				      const1_rtx, GEN_INT (3)));
19790       emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19791 				      const0_rtx, const2_rtx,
19792 				      const1_rtx, GEN_INT (3)));
19793 
19794       /* Shuffle the elements within the lanes.  After this we
19795 	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
19796       t3 = gen_reg_rtx (V8SImode);
19797       t4 = gen_reg_rtx (V8SImode);
19798       mask = GEN_INT (high_p
19799 		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19800 		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19801       emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19802       emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19803 
19804       ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19805       break;
19806 
19807     case E_V8HImode:
19808     case E_V16HImode:
19809       t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19810 			 uns_p, OPTAB_DIRECT);
19811       t2 = expand_binop (mode,
19812 			 uns_p ? umul_highpart_optab : smul_highpart_optab,
19813 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19814       gcc_assert (t1 && t2);
19815 
19816       t3 = gen_reg_rtx (mode);
19817       ix86_expand_vec_interleave (t3, t1, t2, high_p);
19818       emit_move_insn (dest, gen_lowpart (wmode, t3));
19819       break;
19820 
19821     case E_V16QImode:
19822     case E_V32QImode:
19823     case E_V32HImode:
19824     case E_V16SImode:
19825     case E_V64QImode:
19826       t1 = gen_reg_rtx (wmode);
19827       t2 = gen_reg_rtx (wmode);
19828       ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19829       ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19830 
19831       emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19832       break;
19833 
19834     default:
19835       gcc_unreachable ();
19836     }
19837 }
19838 
19839 void
19840 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19841 {
19842   rtx res_1, res_2, res_3, res_4;
19843 
19844   res_1 = gen_reg_rtx (V4SImode);
19845   res_2 = gen_reg_rtx (V4SImode);
19846   res_3 = gen_reg_rtx (V2DImode);
19847   res_4 = gen_reg_rtx (V2DImode);
19848   ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19849   ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19850 
19851   /* Move the results in element 2 down to element 1; we don't care
19852      what goes in elements 2 and 3.  Then we can merge the parts
19853      back together with an interleave.
19854 
19855      Note that two other sequences were tried:
19856      (1) Use interleaves at the start instead of psrldq, which allows
19857      us to use a single shufps to merge things back at the end.
19858      (2) Use shufps here to combine the two vectors, then pshufd to
19859      put the elements in the correct order.
19860      In both cases the cost of the reformatting stall was too high
19861      and the overall sequence slower.  */
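  /* Illustrative scalar model of what the sequence below reconstructs per
     element (assumption: a plain C restatement, not code that is emitted):

	uint64_t p = (uint64_t) (uint32_t) a * (uint32_t) b;
	uint32_t lo = (uint32_t) p;

     The two widening multiplies above produced the full 64-bit products of
     the even and odd elements; the pshufd/interleave pair below merely
     gathers their low 32-bit halves back into one V4SI vector.  */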
19862 
19863   emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19864 				const0_rtx, const2_rtx,
19865 				const0_rtx, const0_rtx));
19866   emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19867 				const0_rtx, const2_rtx,
19868 				const0_rtx, const0_rtx));
19869   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19870 
19871   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19872 }
19873 
19874 void
19875 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19876 {
19877   machine_mode mode = GET_MODE (op0);
19878   rtx t1, t2, t3, t4, t5, t6;
19879 
19880   if (TARGET_AVX512DQ && mode == V8DImode)
19881     emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19882   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19883     emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19884   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19885     emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19886   else if (TARGET_XOP && mode == V2DImode)
19887     {
19888       /* op1: A,B,C,D, op2: E,F,G,H */
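      /* Illustrative identity, with A/E the low and B/F the high 32-bit
	 halves of the first qwords (and C/G, D/H likewise for the second):
	 the low 64 bits of (A + (B << 32)) * (E + (F << 32)) equal
	 A*E + ((A*F + B*E) << 32), which is what t5 + t4 computes below.  */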
19889       op1 = gen_lowpart (V4SImode, op1);
19890       op2 = gen_lowpart (V4SImode, op2);
19891 
19892       t1 = gen_reg_rtx (V4SImode);
19893       t2 = gen_reg_rtx (V4SImode);
19894       t3 = gen_reg_rtx (V2DImode);
19895       t4 = gen_reg_rtx (V2DImode);
19896 
19897       /* t1: B,A,D,C */
19898       emit_insn (gen_sse2_pshufd_1 (t1, op1,
19899 				    GEN_INT (1),
19900 				    GEN_INT (0),
19901 				    GEN_INT (3),
19902 				    GEN_INT (2)));
19903 
19904       /* t2: (B*E),(A*F),(D*G),(C*H) */
19905       emit_insn (gen_mulv4si3 (t2, t1, op2));
19906 
19907       /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19908       emit_insn (gen_xop_phadddq (t3, t2));
19909 
19910       /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19911       emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19912 
19913       /* Multiply the low parts and add everything together.  */
19914       t5 = gen_reg_rtx (V2DImode);
19915       emit_insn (gen_vec_widen_umult_even_v4si (t5,
19916 					gen_lowpart (V4SImode, op1),
19917 					gen_lowpart (V4SImode, op2)));
19918       force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19919     }
19920   else
19921     {
19922       machine_mode nmode;
19923       rtx (*umul) (rtx, rtx, rtx);
19924 
19925       if (mode == V2DImode)
19926 	{
19927 	  umul = gen_vec_widen_umult_even_v4si;
19928 	  nmode = V4SImode;
19929 	}
19930       else if (mode == V4DImode)
19931 	{
19932 	  umul = gen_vec_widen_umult_even_v8si;
19933 	  nmode = V8SImode;
19934 	}
19935       else if (mode == V8DImode)
19936 	{
19937 	  umul = gen_vec_widen_umult_even_v16si;
19938 	  nmode = V16SImode;
19939 	}
19940       else
19941 	gcc_unreachable ();
19942 
19943 
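      /* Illustrative per-lane identity: writing each input qword as
	 xl + (xh << 32) and yl + (yh << 32), the low 64 bits of the product
	 are xl*yl + ((xl*yh + xh*yl) << 32); t1 below holds xl*yl and t4/t5
	 hold the two cross products.  */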
19944       /* Multiply low parts.  */
19945       t1 = gen_reg_rtx (mode);
19946       emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19947 
19948       /* Shift input vectors right 32 bits so we can multiply high parts.  */
19949       t6 = GEN_INT (32);
19950       t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19951       t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19952 
19953       /* Multiply high parts by low parts.  */
19954       t4 = gen_reg_rtx (mode);
19955       t5 = gen_reg_rtx (mode);
19956       emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19957       emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19958 
19959       /* Combine the cross products and shift them back into the high half.  */
19960       t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19961       t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19962 
19963       /* Combine high and low parts.  */
19964       force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19965     }
19966 
19967   set_unique_reg_note (get_last_insn (), REG_EQUAL,
19968 		       gen_rtx_MULT (mode, op1, op2));
19969 }
19970 
19971 /* Return true if the control transfer instruction INSN
19972    should be encoded with the notrack prefix.  */
19973 
19974 bool
19975 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
19976 {
19977   if (!insn || !((flag_cf_protection & CF_BRANCH)))
19978     return false;
19979 
19980   if (CALL_P (insn))
19981     {
19982       rtx call = get_call_rtx_from (insn);
19983       gcc_assert (call != NULL_RTX);
19984       rtx addr = XEXP (call, 0);
19985 
19986       /* Do not emit 'notrack' if it's not an indirect call.  */
19987       if (MEM_P (addr)
19988 	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19989 	return false;
19990       else
19991 	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19992     }
19993 
19994   if (JUMP_P (insn) && !flag_cet_switch)
19995     {
19996       rtx target = JUMP_LABEL (insn);
19997       if (target == NULL_RTX || ANY_RETURN_P (target))
19998 	return false;
19999 
20000       /* Check whether the jump is a switch-table jump.  */
20001       rtx_insn *label = as_a<rtx_insn *> (target);
20002       rtx_insn *table = next_insn (label);
20003       if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20004 	return false;
20005       else
20006 	return true;
20007     }
20008   return false;
20009 }
20010 
20011 /* Calculate integer vector abs() using SSE2 (or newer) instructions.  */
20012 
20013 void
20014 ix86_expand_sse2_abs (rtx target, rtx input)
20015 {
20016   machine_mode mode = GET_MODE (target);
20017   rtx tmp0, tmp1, x;
20018 
20019   switch (mode)
20020     {
20021     case E_V2DImode:
20022     case E_V4DImode:
20023       /* For 64-bit signed integer X, with SSE4.2 use
20024 	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20025 	 Otherwise handle it as for V4SImode, except that W is 64 and the
20026 	 sign mask is formed with a logical right shift by W-1 followed by a
20027 	 negation, because a 64-bit arithmetic right shift is unavailable.  */
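      /* Illustrative scalar equivalent of both variants, where m is the
	 per-element sign mask (all ones for negative x, zero otherwise,
	 i.e. pcmpgtq against zero, or -(x >> 63) in the fallback):

	   int64_t m = x < 0 ? -1 : 0;
	   int64_t a = (x ^ m) - m;

	 which matches the XOR/MINUS pair emitted below.  */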
20028       if (TARGET_SSE4_2)
20029 	{
20030 	  tmp0 = gen_reg_rtx (mode);
20031 	  tmp1 = gen_reg_rtx (mode);
20032 	  emit_move_insn (tmp1, CONST0_RTX (mode));
20033 	  if (mode == E_V2DImode)
20034 	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20035 	  else
20036 	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20037 	}
20038       else
20039 	{
20040 	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20041 				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20042 					       - 1), NULL, 0, OPTAB_DIRECT);
20043 	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20044 	}
20045 
20046       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20047 				  NULL, 0, OPTAB_DIRECT);
20048       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20049 			       target, 0, OPTAB_DIRECT);
20050       break;
20051 
20052     case E_V4SImode:
20053       /* For 32-bit signed integer X, the best way to calculate the absolute
20054 	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
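      /* Worked example (illustrative): for X = -5, X >> 31 is -1,
	 (-5 ^ -1) is 4, and 4 - (-1) = 5 = |X|.  */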
20055       tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20056 				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20057 				  NULL, 0, OPTAB_DIRECT);
20058       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20059 				  NULL, 0, OPTAB_DIRECT);
20060       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20061 			       target, 0, OPTAB_DIRECT);
20062       break;
20063 
20064     case E_V8HImode:
20065       /* For 16-bit signed integer X, the best way to calculate the absolute
20066 	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
20067       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20068 
20069       x = expand_simple_binop (mode, SMAX, tmp0, input,
20070 			       target, 0, OPTAB_DIRECT);
20071       break;
20072 
20073     case E_V16QImode:
20074       /* For 8-bit signed integer X, the best way to calculate the absolute
20075 	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20076 	 as SSE2 provides the PMINUB insn.  */
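      /* Why UMIN works (illustrative): for X >= 0, (unsigned char) -X is
	 either 0 or at least 129, so the minimum picks X; for X < 0,
	 (unsigned char) X is at least 128 while (unsigned char) -X is at
	 most 128, so the minimum picks -X.  */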
20077       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20078 
20079       x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20080 			       target, 0, OPTAB_DIRECT);
20081       break;
20082 
20083     default:
20084       gcc_unreachable ();
20085     }
20086 
20087   if (x != target)
20088     emit_move_insn (target, x);
20089 }
20090 
20091 /* Expand an extract from a vector register through pextr insn.
20092    Return true if successful.  */
20093 
20094 bool
20095 ix86_expand_pextr (rtx *operands)
20096 {
20097   rtx dst = operands[0];
20098   rtx src = operands[1];
20099 
20100   unsigned int size = INTVAL (operands[2]);
20101   unsigned int pos = INTVAL (operands[3]);
20102 
20103   if (SUBREG_P (dst))
20104     {
20105       /* Reject non-lowpart subregs.  */
20106       if (SUBREG_BYTE (dst) > 0)
20107 	return false;
20108       dst = SUBREG_REG (dst);
20109     }
20110 
20111   if (SUBREG_P (src))
20112     {
20113       pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20114       src = SUBREG_REG (src);
20115     }
20116 
20117   switch (GET_MODE (src))
20118     {
20119     case E_V16QImode:
20120     case E_V8HImode:
20121     case E_V4SImode:
20122     case E_V2DImode:
20123     case E_V1TImode:
20124       {
20125 	machine_mode srcmode, dstmode;
20126 	rtx d, pat;
20127 
20128 	if (!int_mode_for_size (size, 0).exists (&dstmode))
20129 	  return false;
20130 
20131 	switch (dstmode)
20132 	  {
20133 	  case E_QImode:
20134 	    if (!TARGET_SSE4_1)
20135 	      return false;
20136 	    srcmode = V16QImode;
20137 	    break;
20138 
20139 	  case E_HImode:
20140 	    if (!TARGET_SSE2)
20141 	      return false;
20142 	    srcmode = V8HImode;
20143 	    break;
20144 
20145 	  case E_SImode:
20146 	    if (!TARGET_SSE4_1)
20147 	      return false;
20148 	    srcmode = V4SImode;
20149 	    break;
20150 
20151 	  case E_DImode:
20152 	    gcc_assert (TARGET_64BIT);
20153 	    if (!TARGET_SSE4_1)
20154 	      return false;
20155 	    srcmode = V2DImode;
20156 	    break;
20157 
20158 	  default:
20159 	    return false;
20160 	  }
20161 
20162 	/* Reject extractions from misaligned positions.  */
20163 	if (pos & (size-1))
20164 	  return false;
20165 
20166 	if (GET_MODE (dst) == dstmode)
20167 	  d = dst;
20168 	else
20169 	  d = gen_reg_rtx (dstmode);
20170 
20171 	/* Construct insn pattern.  */
20172 	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20173 	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20174 
20175 	/* Let the rtl optimizers know about the zero extension performed.  */
20176 	if (dstmode == QImode || dstmode == HImode)
20177 	  {
20178 	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20179 	    d = gen_lowpart (SImode, d);
20180 	  }
20181 
20182 	emit_insn (gen_rtx_SET (d, pat));
20183 
20184 	if (d != dst)
20185 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20186 	return true;
20187       }
20188 
20189     default:
20190       return false;
20191     }
20192 }
20193 
20194 /* Expand an insert into a vector register through pinsr insn.
20195    Return true if successful.  */
20196 
20197 bool
20198 ix86_expand_pinsr (rtx *operands)
20199 {
20200   rtx dst = operands[0];
20201   rtx src = operands[3];
20202 
20203   unsigned int size = INTVAL (operands[1]);
20204   unsigned int pos = INTVAL (operands[2]);
20205 
20206   if (SUBREG_P (dst))
20207     {
20208       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20209       dst = SUBREG_REG (dst);
20210     }
20211 
20212   switch (GET_MODE (dst))
20213     {
20214     case E_V16QImode:
20215     case E_V8HImode:
20216     case E_V4SImode:
20217     case E_V2DImode:
20218     case E_V1TImode:
20219       {
20220 	machine_mode srcmode, dstmode;
20221 	rtx (*pinsr)(rtx, rtx, rtx, rtx);
20222 	rtx d;
20223 
20224 	if (!int_mode_for_size (size, 0).exists (&srcmode))
20225 	  return false;
20226 
20227 	switch (srcmode)
20228 	  {
20229 	  case E_QImode:
20230 	    if (!TARGET_SSE4_1)
20231 	      return false;
20232 	    dstmode = V16QImode;
20233 	    pinsr = gen_sse4_1_pinsrb;
20234 	    break;
20235 
20236 	  case E_HImode:
20237 	    if (!TARGET_SSE2)
20238 	      return false;
20239 	    dstmode = V8HImode;
20240 	    pinsr = gen_sse2_pinsrw;
20241 	    break;
20242 
20243 	  case E_SImode:
20244 	    if (!TARGET_SSE4_1)
20245 	      return false;
20246 	    dstmode = V4SImode;
20247 	    pinsr = gen_sse4_1_pinsrd;
20248 	    break;
20249 
20250 	  case E_DImode:
20251 	    gcc_assert (TARGET_64BIT);
20252 	    if (!TARGET_SSE4_1)
20253 	      return false;
20254 	    dstmode = V2DImode;
20255 	    pinsr = gen_sse4_1_pinsrq;
20256 	    break;
20257 
20258 	  default:
20259 	    return false;
20260 	  }
20261 
20262 	/* Reject insertions to misaligned positions.  */
20263 	if (pos & (size-1))
20264 	  return false;
20265 
20266 	if (SUBREG_P (src))
20267 	  {
20268 	    unsigned int srcpos = SUBREG_BYTE (src);
20269 
20270 	    if (srcpos > 0)
20271 	      {
20272 		rtx extr_ops[4];
20273 
20274 		extr_ops[0] = gen_reg_rtx (srcmode);
20275 		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20276 		extr_ops[2] = GEN_INT (size);
20277 		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20278 
20279 		if (!ix86_expand_pextr (extr_ops))
20280 		  return false;
20281 
20282 		src = extr_ops[0];
20283 	      }
20284 	    else
20285 	      src = gen_lowpart (srcmode, SUBREG_REG (src));
20286 	  }
20287 
20288 	if (GET_MODE (dst) == dstmode)
20289 	  d = dst;
20290 	else
20291 	  d = gen_reg_rtx (dstmode);
20292 
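	/* Note (illustrative): unlike the pextr expansion above, which uses
	   the element index directly, these pinsr expanders take a one-hot
	   element mask, hence the 1 << (pos / size) immediate below.  */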
20293 	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20294 			  gen_lowpart (srcmode, src),
20295 			  GEN_INT (1 << (pos / size))));
20296 	if (d != dst)
20297 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20298 	return true;
20299       }
20300 
20301     default:
20302       return false;
20303     }
20304 }
20305 
20306 /* All CPUs prefer to avoid cross-lane operations, so reduce the upper
20307    halves against the lower halves, narrowing down to SSE register width.  */
20308 
20309 machine_mode
20310 ix86_split_reduction (machine_mode mode)
20311 {
20312   /* Reduce lowpart against highpart until we reach SSE reg width to
20313      avoid cross-lane operations.  */
20314   switch (mode)
20315     {
20316     case E_V8DImode:
20317     case E_V4DImode:
20318       return V2DImode;
20319     case E_V16SImode:
20320     case E_V8SImode:
20321       return V4SImode;
20322     case E_V32HImode:
20323     case E_V16HImode:
20324       return V8HImode;
20325     case E_V64QImode:
20326     case E_V32QImode:
20327       return V16QImode;
20328     case E_V16SFmode:
20329     case E_V8SFmode:
20330       return V4SFmode;
20331     case E_V8DFmode:
20332     case E_V4DFmode:
20333       return V2DFmode;
20334     default:
20335       return mode;
20336     }
20337 }
20338 
20339 /* Generate call to __divmoddi4.  */
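/* Illustrative prototype of the helper being called, assuming libgcc's
   __divmoddi4:

     long long __divmoddi4 (long long num, long long den, long long *rem);

   The quotient is the libcall's return value; the remainder is stored
   through the third argument, for which the address of a stack temporary
   is passed.  */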
20340 
20341 void
20342 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20343 			    rtx op0, rtx op1,
20344 			    rtx *quot_p, rtx *rem_p)
20345 {
20346   rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20347 
20348   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20349 				      mode, op0, mode, op1, mode,
20350 				      XEXP (rem, 0), Pmode);
20351   *quot_p = quot;
20352   *rem_p = rem;
20353 }
20354 
20355 #include "gt-i386-expand.h"
20356