/* Copyright (C) 1988-2022 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"

/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */

void
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    case E_P2HImode:
      half_mode = HImode;
      break;
    case E_P2QImode:
      half_mode = QImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle them.  */
      if (MEM_P (op))
        {
          if (mem_op && rtx_equal_p (op, mem_op))
            {
              lo_half[num] = lo_half[mem_num];
              hi_half[num] = hi_half[mem_num];
            }
          else
            {
              mem_op = op;
              mem_num = num;
              lo_half[num] = adjust_address (op, half_mode, 0);
              hi_half[num] = adjust_address (op, half_mode, byte);
            }
        }
      else
        {
          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);

          rtx tmp = simplify_gen_subreg (half_mode, op,
                                         GET_MODE (op) == VOIDmode
                                         ? mode : GET_MODE (op), byte);
          /* simplify_gen_subreg will return NULL RTX for the
             high half of the paradoxical subreg. */
          hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
        }
    }
}
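
/* For example, splitting a DImode register operand on 32-bit x86,
   say (reg:DI 100), yields lo_half = (subreg:SI (reg:DI 100) 0) and
   hi_half = (subreg:SI (reg:DI 100) 4), while an offsettable MEM is
   split with adjust_address into two half-mode MEMs at offsets 0 and
   GET_MODE_SIZE (half_mode).  (Illustrative note, not exhaustive.)  */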

/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
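
/* For instance, clearing a word-size register typically ends up as
   "xorl %eax, %eax" (two bytes, clobbering the flags) rather than an
   explicit move of zero; the mov form is only kept when TARGET_USE_MOV0
   is set and we are not optimizing for size.  (Illustrative example.)  */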

/* Return true if V can be broadcast from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */

static bool
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
                HOST_WIDE_INT &val_broadcast)
{
  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
    {
      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)
        return false;
    }
  val_broadcast = sext_hwi (val_broadcast, width);
  return true;
}
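
/* For example, v = 0x1234123412341234 with WIDTH == 16 sets
   VAL_BROADCAST to the sign-extended 16-bit chunk 0x1234 and returns
   true, whereas v = 0x1234123412341235 returns false because the
   chunks differ.  (Illustrative example.)  */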

/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */

static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
          != GET_MODE_BITSIZE (mode)))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  if (TARGET_AVX2
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
                         val_broadcast))
    broadcast_mode = QImode;
  else if (TARGET_AVX2
           && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
                              val_broadcast))
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
                           val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT
           && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
                              val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcast from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
                         / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
                                               target,
                                               GEN_INT (val_broadcast));
  gcc_assert (ok);
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
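
/* For example, given AVX2, an OImode CONST_WIDE_INT whose four 64-bit
   elements are all 0x0101010101010101 can be materialized as a
   V32QImode broadcast of the QImode value 1 and then taken as an
   OImode lowpart subreg.  (Illustrative example.)  */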

void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
        break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
        {
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
                                (TARGET_64BIT
                                 ? UNSPEC_GOTPCREL
                                 : UNSPEC_GOT));
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());
        }
      else
        {
          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
          if (tmp)
            {
              op1 = tmp;
              if (!addend)
                break;
            }
          else
            {
              op1 = operands[1];
              break;
            }
        }

      if (addend)
        {
          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);
        }
      else
        op1 = force_operand (op1, op0);

      if (op1 == op0)
        return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;

    case SUBREG:
      /* As not all values in XFmode are representable in real_value,
         we might be called with unfoldable SUBREGs of constants.  */
      if (mode == XFmode
          && CONSTANT_P (SUBREG_REG (op1))
          && can_create_pseudo_p ())
        {
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
          if (r)
            r = validize_mem (r);
          else
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
        {
#if TARGET_MACHO
          /* dynamic-no-pic */
          if (MACHOPIC_INDIRECT)
            {
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              if (MACHOPIC_PURE)
                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);
            }
          if (op0 != op1 && GET_CODE (op0) != MEM)
            {
              rtx insn = gen_rtx_SET (op0, op1);
              emit_insn (insn);
              return;
            }
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else
            {
              rtx temp = op0;
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
              if (temp == op0)
                return;
              op1 = temp;
            }
          /* dynamic-no-pic */
#endif
        }
      else
        {
          if (MEM_P (op0))
            op1 = force_reg (mode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
            {
              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);
              if (op0 == op1)
                return;
              op1 = convert_to_mode (mode, op1, 1);
            }
        }
    }
  else
    {
      if (MEM_P (op0)
          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
          && MEM_P (op1))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64-bit compilation into a register
         to get them CSEd.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
          && optimize)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
        {
          if (CONST_DOUBLE_P (op1))
            {
              /* If we are loading a floating point constant to a
                 register, force the value to memory now, since we'll
                 get better code out of the back end.  */

              op1 = validize_mem (force_const_mem (mode, op1));
              if (!register_operand (op0, mode))
                {
                  rtx temp = gen_reg_rtx (mode);
                  emit_insn (gen_rtx_SET (temp, op1));
                  emit_move_insn (op0, temp);
                  return;
                }
            }
          else if (GET_MODE_SIZE (mode) >= 16)
            {
              rtx tmp = ix86_convert_const_wide_int_to_broadcast
                (GET_MODE (op0), op1);
              if (tmp != nullptr)
                op1 = tmp;
            }
        }
    }

  emit_insn (gen_rtx_SET (op0, op1));
}

/* OP is a memref of a CONST_VECTOR; return the scalar constant mem
   if the CONST_VECTOR is a vec_duplicate, else return NULL.  */
static rtx
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
  int nunits = GET_MODE_NUNITS (mode);
  if (nunits < 2)
    return nullptr;

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))
    return nullptr;

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!(TARGET_AVX2
        || (TARGET_AVX
            && (GET_MODE_INNER (mode) == SImode
                || GET_MODE_INNER (mode) == DImode))
        || FLOAT_MODE_P (mode))
      || standard_sse_constant_p (op, mode))
    return nullptr;

  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
     We can still put a 64-bit integer constant in memory when
     AVX512 embedded broadcast is available.  */
  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
      && (!TARGET_AVX512F
          || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
    return nullptr;

  if (GET_MODE_INNER (mode) == TImode)
    return nullptr;

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)
    return nullptr;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant),
                                  0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
        return nullptr;
    }

  rtx first = XVECEXP (constant, 0, 0);

  for (int i = 1; i < nunits; ++i)
    {
      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
        return nullptr;
    }

  return first;
}
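
/* For example, a constant-pool load of the V8SImode vector
   { 5, 5, 5, 5, 5, 5, 5, 5 } returns the scalar (const_int 5), which
   ix86_expand_vector_move then turns into a vector broadcast
   (e.g. vpbroadcastd) instead of a full 32-byte constant-pool load.
   (Illustrative example.)  */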

void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 bytes for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register; once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
          || (SUBREG_P (op1)
              && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_P (op0)
              && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
        {
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
          if (r)
            r = validize_mem (r);
          else
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
      else
        {
          machine_mode mode = GET_MODE (op0);
          rtx tmp = ix86_convert_const_wide_int_to_broadcast
            (mode, op1);
          if (tmp == nullptr)
            op1 = validize_mem (force_const_mem (mode, op1));
          else
            op1 = tmp;
        }
    }

  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
          && SYMBOL_REF_P (XEXP (op1, 0))
          && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
        {
          /* Broadcast to XMM/YMM/ZMM register from an integer
             constant or scalar mem.  */
          op1 = gen_reg_rtx (mode);
          if (FLOAT_MODE_P (mode)
              || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
            first = force_const_mem (GET_MODE_INNER (mode), first);
          bool ok = ix86_expand_vector_init_duplicate (false, mode,
                                                       op1, first);
          gcc_assert (ok);
          emit_move_insn (op0, op1);
          return;
        }
    }

  /* We need to check memory alignment for SSE mode since an attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        {
          rtx scratch = ix86_gen_scratch_sse_rtx (mode);
          emit_move_insn (scratch, op1);
          op1 = scratch;
        }

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to V1TImode conversions, via V2DI.  */
  if (mode == V1TImode
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())
    {
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}

/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
        {
          if (!MEM_P (op0))
            {
              orig_op0 = op0;
              op0 = gen_reg_rtx (V32QImode);
            }
          else
            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
          mode = V32QImode;
        }
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}

/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         movhpd mem+8, reg
       }
 */

void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
      else
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
            {
              emit_clobber (op0);
              zero = op0;
            }
          else
            {
              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);
            }

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));
        }
      else
        {
          rtx t;

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);
          else
            t = op0;

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));
          else
            emit_clobber (t);

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
        }
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));
        }
      else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
        }
    }
  else
    gcc_unreachable ();
}

/* Move bits 64:95 to bits 32:63.  */

void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
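
/* The vec_select above corresponds to an SSE2 pshufd; with selector
   { 0, 2, 0, 0 } the 32-bit element from lane 2 (bits 64:95) lands in
   lane 1 (bits 32:63), so the low 64 bits hold the packed result that
   the MMX-style caller expects.  (Illustrative note.)  */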

/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}

/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V4QImode:
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
        {
          mask = gen_rtx_PARALLEL (VOIDmode,
                                   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                              GEN_INT (4), GEN_INT (5)));
          op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
          op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
        }
      else
        {
          int sz = GET_MODE_SIZE (mode);

          if (sz == 4)
            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
                                                GEN_INT (0), GEN_INT (1)));
          else if (sz == 8)
            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                                GEN_INT (0), GEN_INT (1)));
          else
            gcc_unreachable ();

          dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
          op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
        }

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}

/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
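
/* For example, for the commutative PLUS in "dst = src1 + src2" with
   src2 identical to dst, swapping gives "dst = dst + src1", which maps
   directly onto the two-operand form of addl; for a non-commutative
   code such as MINUS no swap is attempted.  (Illustrative example.)  */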


/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
        {
          src2 = force_reg (mode, src2);
          src1 = src2;
        }
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
      else
        src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
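
/* A typical fixup: for "mem1 = mem2 + mem3" the sources are forced out
   of memory as needed and, since the destination then has no matching
   source operand, the result is redirected into a fresh pseudo; the
   caller performs the operation in registers and stores the pseudo
   back to mem1.  (Illustrative example.)  */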

/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}

/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine than just general_operand, which would allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}

/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
        {
        case E_V4SFmode:
        case E_V8SFmode:
        case E_V16SFmode:
        case E_V2DFmode:
        case E_V4DFmode:
        case E_V8DFmode:
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (GET_CODE (op2) == CONST_VECTOR)
            {
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
            }
          else
            {
              op1 = operands[1];
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
            }
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
                                                  op1, op2)));
          emit_move_insn (operands[0], gen_lowpart (mode, dst));
          return;
        default:
          break;
        }
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
}

/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
                         rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
            && (mode == HImode
                || mode == SImode
                || (TARGET_64BIT && mode == DImode))
            && satisfies_constraint_L (src2));

  return true;
}

/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine than just general_operand, which would allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
        matching_memory = true;
      else
        dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}

/* Predict the just-emitted jump instruction to be taken with probability
   PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}

/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
        {
          if (GET_MODE (operands[1]) == SImode)
            gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          else
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
        }
      else
        gen_divmod4_1
          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for the result of the 8bit divide since not
     all registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
        div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
        mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    (operands[0], tmp1,
                     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
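
/* The resulting sequence is roughly (32-bit unsigned case):
       mov   dividend, scratch
       or    divisor, scratch
       test  $-0x100, scratch
       je    .Lqimode              ; both values fit in 8 bits
       ...full divl...
       jmp   .Lend
   .Lqimode:
       ...divb, quotient in AL, remainder in AH...
   .Lend:
   (Illustrative sketch; exact register choices vary.)  */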

/* Emit the x86 binary operator CODE in mode MODE, where the first
   operand matches the destination.  The emitted pattern includes a
   clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
                 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
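
/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits
     (parallel [(set (dst) (plus:SI (dst) (src)))
                (clobber (reg:CC flags))])
   which matches the two-operand addl pattern.  (Illustrative example.)  */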

/* Return true if the definition of regno1 is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
        {
          prev = PREV_INSN (prev);
          continue;
        }
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
        return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
        return false;
      prev = PREV_INSN (prev);
    }

  /* Neither of the regs is defined in the bb.  */
  return false;
}

/* INSN_UID of the last insn emitted by zero store peephole2s.  */
int ix86_last_zero_store_uid;

/* Split lea instructions into a sequence of instructions
   which are executed on the ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber the flags register
   at the lea position.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
        {
          /* If we have a case r1 = r1 + C * r2 then we
             would have to use multiplication, which is very
             expensive.  Assume the cost model is wrong if we
             end up with such a case here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);
        }
      else
        {
          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling, but emit it as MULT instead
             to avoid it being immediately peephole2 optimized back
             into lea.  */
          ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

          if (parts.base)
            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
        }
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert (parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
        {
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));
        }
      else if (!parts.index)
        {
          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));
        }
      else
        {
          if (regno0 == regno1)
            tmp = parts.index;
          else if (regno0 == regno2)
            tmp = parts.base;
          else
            {
              rtx tmp1;

              /* Find the better operand for the SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;
              else
                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

              if (parts.disp && parts.disp != const0_rtx)
                ix86_emit_binop (PLUS, mode, target, parts.disp);

              ix86_emit_binop (PLUS, mode, target, tmp1);
              return;
            }

          ix86_emit_binop (PLUS, mode, target, tmp);
        }

      if (parts.disp && parts.disp != const0_rtx)
        ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
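
/* For example, "lea 0x8(%rbx,%rcx,2), %rax" can be split into
       mov  %rcx, %rax
       shl  $1, %rax          ; the *2 scale, emitted as MULT in RTL
       add  %rbx, %rax
       add  $0x8, %rax
   so every step runs on the ALU instead of the AGU.
   (Illustrative sketch; the exact sequence depends on which operands
   already share registers.)  */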

/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss (value, value, input));
      else
        emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
1753 
1754 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1755 						 machine_mode mode, rtx target,
1756 						 rtx var, int one_var);
1757 
1758 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1759    Expects the 64-bit DImode to be supplied in a pair of integral
1760    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
1761    -mfpmath=sse, !optimize_size only.  */
1762 
1763 void
1764 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1765 {
1766   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1767   rtx int_xmm, fp_xmm;
1768   rtx biases, exponents;
1769   rtx x;
1770 
1771   int_xmm = gen_reg_rtx (V4SImode);
1772   if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1773     emit_insn (gen_movdi_to_sse (int_xmm, input));
1774   else if (TARGET_SSE_SPLIT_REGS)
1775     {
1776       emit_clobber (int_xmm);
1777       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1778     }
1779   else
1780     {
1781       x = gen_reg_rtx (V2DImode);
1782       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1783       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1784     }
1785 
1786   x = gen_rtx_CONST_VECTOR (V4SImode,
1787 			    gen_rtvec (4, GEN_INT (0x43300000UL),
1788 				       GEN_INT (0x45300000UL),
1789 				       const0_rtx, const0_rtx));
1790   exponents = validize_mem (force_const_mem (V4SImode, x));
1791 
1792   /* int_xmm = { 0x45300000UL, fp_xmm/hi, 0x43300000UL, fp_xmm/lo } */
1793   emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1794 
1795   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1796      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1797      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1798      (0x1.0p84 + double(fp_value_hi_xmm)).
1799      Note these exponents differ by 32.  */
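  /* For exposition only: a scalar C sketch of the same bias trick.  The
     helper name and the memcpy-based type punning are assumptions of the
     illustration and nothing here is emitted by GCC:

       double uns_di_to_df (uint64_t v)
       {
         uint64_t lo_bits = 0x4330000000000000ULL | (v & 0xffffffffULL);
         uint64_t hi_bits = 0x4530000000000000ULL | (v >> 32);
         double d_lo, d_hi;
         memcpy (&d_lo, &lo_bits, sizeof d_lo);  // == 0x1.0p52 + low 32 bits
         memcpy (&d_hi, &hi_bits, sizeof d_hi);  // == 0x1.0p84 + high 32 bits * 2^32
         return (d_hi - 0x1.0p84) + (d_lo - 0x1.0p52);
       }

     Both bias subtractions are exact; only the final addition rounds.  */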
1800 
1801   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1802 
1803   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1804      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
1805   real_ldexp (&bias_lo_rvt, &dconst1, 52);
1806   real_ldexp (&bias_hi_rvt, &dconst1, 84);
1807   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1808   x = const_double_from_real_value (bias_hi_rvt, DFmode);
1809   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1810   biases = validize_mem (force_const_mem (V2DFmode, biases));
1811   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1812 
1813   /* Add the upper and lower DFmode values together.  */
1814   if (TARGET_SSE3)
1815     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1816   else
1817     {
1818       x = copy_to_mode_reg (V2DFmode, fp_xmm);
1819       emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1820       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1821     }
1822 
1823   ix86_expand_vector_extract (false, target, fp_xmm, 0);
1824 }
1825 
1826 /* Not used, but eases macroization of patterns.  */
1827 void
1828 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1829 {
1830   gcc_unreachable ();
1831 }
1832 
1833 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1834 
1835 /* Convert an unsigned SImode value into a DFmode.  Only currently used
1836    for SSE, but applicable anywhere.  */
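/* For exposition, a scalar sketch of the expansion below; the helper name
   and the <stdint.h> types are assumptions of the illustration:

     double uns_si_to_df (uint32_t u)
     {
       int32_t biased = (int32_t) (u + 0x80000000u); // u - 2^31 (two's-complement wrap)
       return (double) biased + 0x1.0p31;            // undo the bias in FP
     }

   i.e. bias the input into signed range, use the signed SI->DF conversion,
   then add 2^31 back in the floating-point domain.  */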
1837 
1838 void
1839 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1840 {
1841   REAL_VALUE_TYPE TWO31r;
1842   rtx x, fp;
1843 
1844   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1845 			   NULL, 1, OPTAB_DIRECT);
1846 
1847   fp = gen_reg_rtx (DFmode);
1848   emit_insn (gen_floatsidf2 (fp, x));
1849 
1850   real_ldexp (&TWO31r, &dconst1, 31);
1851   x = const_double_from_real_value (TWO31r, DFmode);
1852 
1853   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1854 
1855   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
1856   if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1857     x = ix86_expand_sse_fabs (x, NULL);
1858 
1859   if (x != target)
1860     emit_move_insn (target, x);
1861 }
1862 
1863 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
1864    32-bit mode; otherwise we have a direct convert instruction.  */
1865 
1866 void
1867 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1868 {
1869   REAL_VALUE_TYPE TWO32r;
1870   rtx fp_lo, fp_hi, x;
1871 
1872   fp_lo = gen_reg_rtx (DFmode);
1873   fp_hi = gen_reg_rtx (DFmode);
1874 
1875   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1876 
1877   real_ldexp (&TWO32r, &dconst1, 32);
1878   x = const_double_from_real_value (TWO32r, DFmode);
1879   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1880 
1881   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1882 
1883   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1884 			   0, OPTAB_DIRECT);
1885   if (x != target)
1886     emit_move_insn (target, x);
1887 }
1888 
1889 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1890    For x86_32, -mfpmath=sse, !optimize_size only.  */
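/* Roughly, the expansion below computes the following (scalar sketch; the
   helper name is an assumption of the illustration):

     float uns_si_to_sf (uint32_t u)
     {
       return (float) (u >> 16) * 0x1.0p16f + (float) (u & 0xffff);
     }

   Each 16-bit half converts to float exactly and the scale by 2^16 is
   exact, so only the final addition (or FMA) rounds.  */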
1891 void
1892 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1893 {
1894   REAL_VALUE_TYPE ONE16r;
1895   rtx fp_hi, fp_lo, int_hi, int_lo, x;
1896 
1897   real_ldexp (&ONE16r, &dconst1, 16);
1898   x = const_double_from_real_value (ONE16r, SFmode);
1899   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
1900 				      NULL, 0, OPTAB_DIRECT);
1901   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
1902 				      NULL, 0, OPTAB_DIRECT);
1903   fp_hi = gen_reg_rtx (SFmode);
1904   fp_lo = gen_reg_rtx (SFmode);
1905   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1906   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1907   if (TARGET_FMA)
1908     {
1909       x = validize_mem (force_const_mem (SFmode, x));
1910       fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1911       emit_move_insn (target, fp_hi);
1912     }
1913   else
1914     {
1915       fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1916 				   0, OPTAB_DIRECT);
1917       fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1918 				   0, OPTAB_DIRECT);
1919       if (!rtx_equal_p (target, fp_hi))
1920 	emit_move_insn (target, fp_hi);
1921     }
1922 }
1923 
1924 /* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
1925    a vector of unsigned ints VAL to vector of floats TARGET.  */
1926 
1927 void
1928 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1929 {
1930   rtx tmp[8];
1931   REAL_VALUE_TYPE TWO16r;
1932   machine_mode intmode = GET_MODE (val);
1933   machine_mode fltmode = GET_MODE (target);
1934   rtx (*cvt) (rtx, rtx);
1935 
1936   if (intmode == V4SImode)
1937     cvt = gen_floatv4siv4sf2;
1938   else
1939     cvt = gen_floatv8siv8sf2;
1940   tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1941   tmp[0] = force_reg (intmode, tmp[0]);
1942   tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1943 				OPTAB_DIRECT);
1944   tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1945 				NULL_RTX, 1, OPTAB_DIRECT);
1946   tmp[3] = gen_reg_rtx (fltmode);
1947   emit_insn (cvt (tmp[3], tmp[1]));
1948   tmp[4] = gen_reg_rtx (fltmode);
1949   emit_insn (cvt (tmp[4], tmp[2]));
1950   real_ldexp (&TWO16r, &dconst1, 16);
1951   tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1952   tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1953   if (TARGET_FMA)
1954     {
1955       tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
1956       emit_move_insn (target, tmp[6]);
1957     }
1958   else
1959     {
1960       tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
1961 				    NULL_RTX, 1, OPTAB_DIRECT);
1962       tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
1963 				    target, 1, OPTAB_DIRECT);
1964       if (tmp[7] != target)
1965 	emit_move_insn (target, tmp[7]);
1966     }
1967 }
1968 
1969 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1970    pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1971    This is done by doing just signed conversion if < 0x1p31, and otherwise by
1972    subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
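/* Scalar picture of the adjustment, for exposition only; the helper name is
   an assumption of the sketch:

     uint32_t fp_to_uns_si (double d)
     {
       if (d < 0x1.0p31)
         return (uint32_t) (int32_t) d;                  // plain signed conversion
       return ((uint32_t) (int32_t) (d - 0x1.0p31)) ^ 0x80000000u;
     }

   The vector version computes a lane mask for VAL >= 2^31, subtracts 2^31
   from those lanes only, and hands the matching per-lane 0x80000000
   correction back through *XORP.  */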
1973 
1974 rtx
1975 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1976 {
1977   REAL_VALUE_TYPE TWO31r;
1978   rtx two31r, tmp[4];
1979   machine_mode mode = GET_MODE (val);
1980   machine_mode scalarmode = GET_MODE_INNER (mode);
1981   machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1982   rtx (*cmp) (rtx, rtx, rtx, rtx);
1983   int i;
1984 
1985   for (i = 0; i < 3; i++)
1986     tmp[i] = gen_reg_rtx (mode);
1987   real_ldexp (&TWO31r, &dconst1, 31);
1988   two31r = const_double_from_real_value (TWO31r, scalarmode);
1989   two31r = ix86_build_const_vector (mode, 1, two31r);
1990   two31r = force_reg (mode, two31r);
1991   switch (mode)
1992     {
1993     case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1994     case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1995     case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1996     case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1997     default: gcc_unreachable ();
1998     }
1999   tmp[3] = gen_rtx_LE (mode, two31r, val);
2000   emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2001   tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2002 				0, OPTAB_DIRECT);
2003   if (intmode == V4SImode || TARGET_AVX2)
2004     *xorp = expand_simple_binop (intmode, ASHIFT,
2005 				 gen_lowpart (intmode, tmp[0]),
2006 				 GEN_INT (31), NULL_RTX, 0,
2007 				 OPTAB_DIRECT);
2008   else
2009     {
2010       rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2011       two31 = ix86_build_const_vector (intmode, 1, two31);
2012       *xorp = expand_simple_binop (intmode, AND,
2013 				   gen_lowpart (intmode, tmp[0]),
2014 				   two31, NULL_RTX, 0,
2015 				   OPTAB_DIRECT);
2016     }
2017   return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2018 			      0, OPTAB_DIRECT);
2019 }
2020 
2021 /* Generate code for floating point ABS or NEG.  */
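/* The underlying bit manipulation, in scalar form (illustrative only; the
   helper names and memcpy-based type punning are assumptions):

     float fabs_bits (float x)   // ABS: clear the sign bit
     {
       uint32_t b;  memcpy (&b, &x, 4);  b &= 0x7fffffffu;
       memcpy (&x, &b, 4);  return x;
     }

     float fneg_bits (float x)   // NEG: flip the sign bit
     {
       uint32_t b;  memcpy (&b, &x, 4);  b ^= 0x80000000u;
       memcpy (&x, &b, 4);  return x;
     }

   The SSE path applies the same masks vector-wide; the integer-unit path
   (see ix86_split_fp_absneg_operator below) applies them word by word.  */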
2022 
2023 void
2024 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2025 				rtx operands[])
2026 {
2027   rtx set, dst, src;
2028   bool use_sse = false;
2029   bool vector_mode = VECTOR_MODE_P (mode);
2030   machine_mode vmode = mode;
2031   rtvec par;
2032 
2033   if (vector_mode || mode == TFmode || mode == HFmode)
2034     {
2035       use_sse = true;
2036       if (mode == HFmode)
2037 	vmode = V8HFmode;
2038     }
2039   else if (TARGET_SSE_MATH)
2040     {
2041       use_sse = SSE_FLOAT_MODE_P (mode);
2042       if (mode == SFmode)
2043 	vmode = V4SFmode;
2044       else if (mode == DFmode)
2045 	vmode = V2DFmode;
2046     }
2047 
2048   dst = operands[0];
2049   src = operands[1];
2050 
2051   set = gen_rtx_fmt_e (code, mode, src);
2052   set = gen_rtx_SET (dst, set);
2053 
2054   if (use_sse)
2055     {
2056       rtx mask, use, clob;
2057 
2058       /* NEG and ABS performed with SSE use bitwise mask operations.
2059 	 Create the appropriate mask now.  */
2060       mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2061       use = gen_rtx_USE (VOIDmode, mask);
2062       if (vector_mode || mode == TFmode)
2063 	par = gen_rtvec (2, set, use);
2064       else
2065 	{
2066           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2067 	  par = gen_rtvec (3, set, use, clob);
2068         }
2069     }
2070   else
2071     {
2072       rtx clob;
2073 
2074       /* The sign of an FP value can also be changed using the integer unit.  */
2075       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2076       par = gen_rtvec (2, set, clob);
2077     }
2078 
2079   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2080 }
2081 
2082 /* Deconstruct a floating point ABS or NEG operation
2083    with integer registers into integer operations.  */
2084 
2085 void
2086 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2087 			       rtx operands[])
2088 {
2089   enum rtx_code absneg_op;
2090   rtx dst, set;
2091 
2092   gcc_assert (operands_match_p (operands[0], operands[1]));
2093 
2094   switch (mode)
2095     {
2096     case E_SFmode:
2097       dst = gen_lowpart (SImode, operands[0]);
2098 
2099       if (code == ABS)
2100 	{
2101 	  set = gen_int_mode (0x7fffffff, SImode);
2102 	  absneg_op = AND;
2103 	}
2104       else
2105 	{
2106 	  set = gen_int_mode (0x80000000, SImode);
2107 	  absneg_op = XOR;
2108 	}
2109       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2110       break;
2111 
2112     case E_DFmode:
2113       if (TARGET_64BIT)
2114 	{
2115 	  dst = gen_lowpart (DImode, operands[0]);
2116 	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2117 
2118 	  if (code == ABS)
2119 	    set = const0_rtx;
2120 	  else
2121 	    set = gen_rtx_NOT (DImode, dst);
2122 	}
2123       else
2124 	{
2125 	  dst = gen_highpart (SImode, operands[0]);
2126 
2127 	  if (code == ABS)
2128 	    {
2129 	      set = gen_int_mode (0x7fffffff, SImode);
2130 	      absneg_op = AND;
2131 	    }
2132 	  else
2133 	    {
2134 	      set = gen_int_mode (0x80000000, SImode);
2135 	      absneg_op = XOR;
2136 	    }
2137 	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2138 	}
2139       break;
2140 
2141     case E_XFmode:
2142       dst = gen_rtx_REG (SImode,
2143 			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2144       if (code == ABS)
2145 	{
2146 	  set = GEN_INT (0x7fff);
2147 	  absneg_op = AND;
2148 	}
2149       else
2150 	{
2151 	  set = GEN_INT (0x8000);
2152 	  absneg_op = XOR;
2153 	}
2154       set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2155       break;
2156 
2157     default:
2158       gcc_unreachable ();
2159     }
2160 
2161   set = gen_rtx_SET (dst, set);
2162 
2163   rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2164   rtvec par = gen_rtvec (2, set, clob);
2165 
2166   emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2167 }
2168 
2169 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
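/* Bit-level identity used below, shown for a scalar float (a sketch; the
   helper name and memcpy punning are assumptions of the illustration):

     float copysign_bits (float x, float y)
     {
       uint32_t bx, by;
       memcpy (&bx, &x, 4);  memcpy (&by, &y, 4);
       bx = (bx & 0x7fffffffu) | (by & 0x80000000u);   // |x| with the sign of y
       memcpy (&x, &bx, 4);  return x;
     }
*/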
2170 
2171 void
2172 ix86_expand_copysign (rtx operands[])
2173 {
2174   machine_mode mode, vmode;
2175   rtx dest, vdest, op0, op1, mask, op2, op3;
2176 
2177   mode = GET_MODE (operands[0]);
2178 
2179   if (mode == HFmode)
2180     vmode = V8HFmode;
2181   else if (mode == SFmode)
2182     vmode = V4SFmode;
2183   else if (mode == DFmode)
2184     vmode = V2DFmode;
2185   else if (mode == TFmode)
2186     vmode = mode;
2187   else
2188     gcc_unreachable ();
2189 
2190   if (rtx_equal_p (operands[1], operands[2]))
2191     {
2192       emit_move_insn (operands[0], operands[1]);
2193       return;
2194     }
2195 
2196   dest = operands[0];
2197   vdest = lowpart_subreg (vmode, dest, mode);
2198   if (vdest == NULL_RTX)
2199     vdest = gen_reg_rtx (vmode);
2200   else
2201     dest = NULL_RTX;
2202   op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2203   mask = ix86_build_signbit_mask (vmode, 0, 0);
2204 
2205   if (CONST_DOUBLE_P (operands[1]))
2206     {
2207       op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2208       /* Optimize for 0: simplify b = copysignf (0.0f, a) to b = mask & a.  */
2209       if (op0 == CONST0_RTX (mode))
2210 	{
2211 	  emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2212 	  if (dest)
2213 	    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2214 	  return;
2215 	}
2216 
2217       if (GET_MODE_SIZE (mode) < 16)
2218 	op0 = ix86_build_const_vector (vmode, false, op0);
2219       op0 = force_reg (vmode, op0);
2220     }
2221   else
2222     op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2223 
2224   op2 = gen_reg_rtx (vmode);
2225   op3 = gen_reg_rtx (vmode);
2226   emit_move_insn (op2, gen_rtx_AND (vmode,
2227 				    gen_rtx_NOT (vmode, mask),
2228 				    op0));
2229   emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2230   emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2231   if (dest)
2232     emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2233 }
2234 
2235 /* Expand an xorsign operation.  */
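/* xorsign (x, y) stands for x * copysign (1.0, y); bitwise it is just x with
   its sign flipped when y is negative (scalar sketch; the helper name is an
   assumption of the illustration):

     float xorsign_bits (float x, float y)
     {
       uint32_t bx, by;
       memcpy (&bx, &x, 4);  memcpy (&by, &y, 4);
       bx ^= by & 0x80000000u;
       memcpy (&x, &bx, 4);  return x;
     }
*/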
2236 
2237 void
2238 ix86_expand_xorsign (rtx operands[])
2239 {
2240   machine_mode mode, vmode;
2241   rtx dest, vdest, op0, op1, mask, x, temp;
2242 
2243   dest = operands[0];
2244   op0 = operands[1];
2245   op1 = operands[2];
2246 
2247   mode = GET_MODE (dest);
2248 
2249   if (mode == HFmode)
2250     vmode = V8HFmode;
2251   else if (mode == SFmode)
2252     vmode = V4SFmode;
2253   else if (mode == DFmode)
2254     vmode = V2DFmode;
2255   else
2256     gcc_unreachable ();
2257 
2258   temp = gen_reg_rtx (vmode);
2259   mask = ix86_build_signbit_mask (vmode, 0, 0);
2260 
2261   op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2262   x = gen_rtx_AND (vmode, op1, mask);
2263   emit_insn (gen_rtx_SET (temp, x));
2264 
2265   op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2266   x = gen_rtx_XOR (vmode, temp, op0);
2267 
2268   vdest = lowpart_subreg (vmode, dest, mode);
2269   if (vdest == NULL_RTX)
2270     vdest = gen_reg_rtx (vmode);
2271   else
2272     dest = NULL_RTX;
2273   emit_insn (gen_rtx_SET (vdest, x));
2274 
2275   if (dest)
2276     emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2277 }
2278 
2279 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2280 
2281 void
2282 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2283 {
2284   machine_mode mode = GET_MODE (op0);
2285   rtx tmp;
2286 
2287   /* Handle the special case of a vector comparison with a boolean result;
2288      transform it using the ptest instruction.  */
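  /* A whole-vector equality test needs no per-lane compare: with
     T = OP0 ^ OP1, PTEST (T, T) sets ZF exactly when T is all zeros,
     i.e. when OP0 and OP1 are bitwise equal.  */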
2289   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2290     {
2291       rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2292       machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2293 
2294       gcc_assert (code == EQ || code == NE);
2295       /* Generate XOR since we can't check that one operand is zero vector.  */
2296       tmp = gen_reg_rtx (mode);
2297       emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2298       tmp = gen_lowpart (p_mode, tmp);
2299       emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2300 			      gen_rtx_UNSPEC (CCmode,
2301 					      gen_rtvec (2, tmp, tmp),
2302 					      UNSPEC_PTEST)));
2303       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2304       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2305 				  gen_rtx_LABEL_REF (VOIDmode, label),
2306 				  pc_rtx);
2307       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2308       return;
2309     }
2310 
2311   switch (mode)
2312     {
2313     case E_HFmode:
2314     case E_SFmode:
2315     case E_DFmode:
2316     case E_XFmode:
2317     case E_QImode:
2318     case E_HImode:
2319     case E_SImode:
2320       simple:
2321       tmp = ix86_expand_compare (code, op0, op1);
2322       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2323 				  gen_rtx_LABEL_REF (VOIDmode, label),
2324 				  pc_rtx);
2325       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2326       return;
2327 
2328     case E_DImode:
2329       if (TARGET_64BIT)
2330 	goto simple;
2331       /* For 32-bit target DI comparison may be performed on
2332 	 SSE registers.  To allow this we should avoid split
2333 	 to SI mode which is achieved by doing xor in DI mode
2334 	 and then comparing with zero (which is recognized by
2335 	 STV pass).  We don't compare using xor when optimizing
2336 	 for size.  */
2337       if (!optimize_insn_for_size_p ()
2338 	  && TARGET_STV
2339 	  && (code == EQ || code == NE))
2340 	{
2341 	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2342 	  op1 = const0_rtx;
2343 	}
2344       /* FALLTHRU */
2345     case E_TImode:
2346       /* Expand DImode/TImode branch into multiple compare+branch.  */
2347       {
2348 	rtx lo[2], hi[2];
2349 	rtx_code_label *label2;
2350 	enum rtx_code code1, code2, code3;
2351 	machine_mode submode;
2352 
2353 	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2354 	  {
2355 	    std::swap (op0, op1);
2356 	    code = swap_condition (code);
2357 	  }
2358 
2359 	split_double_mode (mode, &op0, 1, lo+0, hi+0);
2360 	split_double_mode (mode, &op1, 1, lo+1, hi+1);
2361 
2362 	submode = mode == DImode ? SImode : DImode;
2363 
2364 	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2365 	   avoid two branches.  This costs one extra insn, so disable when
2366 	   optimizing for size.  */
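	/* That is, (hi0:lo0) == (hi1:lo1) iff ((hi0 ^ hi1) | (lo0 ^ lo1)) == 0,
	   so a single test of the IOR result against zero decides EQ/NE.  */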
2367 
2368 	if ((code == EQ || code == NE)
2369 	    && (!optimize_insn_for_size_p ()
2370 	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
2371 	  {
2372 	    rtx xor0, xor1;
2373 
2374 	    xor1 = hi[0];
2375 	    if (hi[1] != const0_rtx)
2376 	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2377 				   NULL_RTX, 0, OPTAB_WIDEN);
2378 
2379 	    xor0 = lo[0];
2380 	    if (lo[1] != const0_rtx)
2381 	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2382 				   NULL_RTX, 0, OPTAB_WIDEN);
2383 
2384 	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
2385 				NULL_RTX, 0, OPTAB_WIDEN);
2386 
2387 	    ix86_expand_branch (code, tmp, const0_rtx, label);
2388 	    return;
2389 	  }
2390 
2391 	/* Otherwise, if we are doing less-than or greater-or-equal-than,
2392 	   op1 is a constant, and the low word is zero, then we can just
2393 	   examine the high word.  Similarly for low word -1 and
2394 	   less-or-equal-than or greater-than.  */
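	/* Example: unsigned x < 0x500000000 (low word 0) holds exactly when
	   hi(x) < 5, and x <= 0x4ffffffff (low word all-ones) exactly when
	   hi(x) <= 4, so the low-word compare can be dropped.  */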
2395 
2396 	if (CONST_INT_P (hi[1]))
2397 	  switch (code)
2398 	    {
2399 	    case LT: case LTU: case GE: case GEU:
2400 	      if (lo[1] == const0_rtx)
2401 		{
2402 		  ix86_expand_branch (code, hi[0], hi[1], label);
2403 		  return;
2404 		}
2405 	      break;
2406 	    case LE: case LEU: case GT: case GTU:
2407 	      if (lo[1] == constm1_rtx)
2408 		{
2409 		  ix86_expand_branch (code, hi[0], hi[1], label);
2410 		  return;
2411 		}
2412 	      break;
2413 	    default:
2414 	      break;
2415 	    }
2416 
2417 	/* Emulate comparisons that do not depend on Zero flag with
2418 	   double-word subtraction.  Note that only Overflow, Sign
2419 	   and Carry flags are valid, so swap arguments and condition
2420 	   of comparisons that would otherwise test Zero flag.  */
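	/* The sequence emitted for these cases is essentially

	     cmp  lo0, lo1   ; CF = borrow out of lo0 - lo1
	     sbb  hi0, hi1   ; hi0 - hi1 - CF, result discarded, flags kept

	   whose carry (unsigned) resp. sign/overflow (signed) flags describe
	   the full double-word subtraction; only ZF is meaningless here.  */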
2421 
2422 	switch (code)
2423 	  {
2424 	  case LE: case LEU: case GT: case GTU:
2425 	    std::swap (lo[0], lo[1]);
2426 	    std::swap (hi[0], hi[1]);
2427 	    code = swap_condition (code);
2428 	    /* FALLTHRU */
2429 
2430 	  case LT: case LTU: case GE: case GEU:
2431 	    {
2432 	      bool uns = (code == LTU || code == GEU);
2433 	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2434 		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2435 
2436 	      if (!nonimmediate_operand (lo[0], submode))
2437 		lo[0] = force_reg (submode, lo[0]);
2438 	      if (!x86_64_general_operand (lo[1], submode))
2439 		lo[1] = force_reg (submode, lo[1]);
2440 
2441 	      if (!register_operand (hi[0], submode))
2442 		hi[0] = force_reg (submode, hi[0]);
2443 	      if ((uns && !nonimmediate_operand (hi[1], submode))
2444 		  || (!uns && !x86_64_general_operand (hi[1], submode)))
2445 		hi[1] = force_reg (submode, hi[1]);
2446 
2447 	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2448 
2449 	      tmp = gen_rtx_SCRATCH (submode);
2450 	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2451 
2452 	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2453 	      ix86_expand_branch (code, tmp, const0_rtx, label);
2454 	      return;
2455 	    }
2456 
2457 	  default:
2458 	    break;
2459 	  }
2460 
2461 	/* Otherwise, we need two or three jumps.  */
2462 
2463 	label2 = gen_label_rtx ();
2464 
2465 	code1 = code;
2466 	code2 = swap_condition (code);
2467 	code3 = unsigned_condition (code);
2468 
2469 	switch (code)
2470 	  {
2471 	  case LT: case GT: case LTU: case GTU:
2472 	    break;
2473 
2474 	  case LE:   code1 = LT;  code2 = GT;  break;
2475 	  case GE:   code1 = GT;  code2 = LT;  break;
2476 	  case LEU:  code1 = LTU; code2 = GTU; break;
2477 	  case GEU:  code1 = GTU; code2 = LTU; break;
2478 
2479 	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
2480 	  case NE:   code2 = UNKNOWN; break;
2481 
2482 	  default:
2483 	    gcc_unreachable ();
2484 	  }
2485 
2486 	/*
2487 	 * a < b =>
2488 	 *    if (hi(a) < hi(b)) goto true;
2489 	 *    if (hi(a) > hi(b)) goto false;
2490 	 *    if (lo(a) < lo(b)) goto true;
2491 	 *  false:
2492 	 */
2493 
2494 	if (code1 != UNKNOWN)
2495 	  ix86_expand_branch (code1, hi[0], hi[1], label);
2496 	if (code2 != UNKNOWN)
2497 	  ix86_expand_branch (code2, hi[0], hi[1], label2);
2498 
2499 	ix86_expand_branch (code3, lo[0], lo[1], label);
2500 
2501 	if (code2 != UNKNOWN)
2502 	  emit_label (label2);
2503 	return;
2504       }
2505 
2506     default:
2507       gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2508       goto simple;
2509     }
2510 }
2511 
2512 /* Figure out whether to use unordered fp comparisons.  */
2513 
2514 static bool
2515 ix86_unordered_fp_compare (enum rtx_code code)
2516 {
2517   if (!TARGET_IEEE_FP)
2518     return false;
2519 
2520   switch (code)
2521     {
2522     case LT:
2523     case LE:
2524     case GT:
2525     case GE:
2526     case LTGT:
2527       return false;
2528 
2529     case EQ:
2530     case NE:
2531 
2532     case UNORDERED:
2533     case ORDERED:
2534     case UNLT:
2535     case UNLE:
2536     case UNGT:
2537     case UNGE:
2538     case UNEQ:
2539       return true;
2540 
2541     default:
2542       gcc_unreachable ();
2543     }
2544 }
2545 
2546 /* Return a comparison that we can do and that is equivalent to
2547    swap_condition (code), apart possibly from orderedness.
2548    But never change orderedness if TARGET_IEEE_FP, returning
2549    UNKNOWN in that case if necessary.  */
2550 
2551 static enum rtx_code
2552 ix86_fp_swap_condition (enum rtx_code code)
2553 {
2554   switch (code)
2555     {
2556     case GT:                   /* GTU - CF=0 & ZF=0 */
2557       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2558     case GE:                   /* GEU - CF=0 */
2559       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2560     case UNLT:                 /* LTU - CF=1 */
2561       return TARGET_IEEE_FP ? UNKNOWN : GT;
2562     case UNLE:                 /* LEU - CF=1 | ZF=1 */
2563       return TARGET_IEEE_FP ? UNKNOWN : GE;
2564     default:
2565       return swap_condition (code);
2566     }
2567 }
2568 
2569 /* Return the cost of comparison CODE using the best strategy for performance.
2570    All of the following functions use the number of instructions as the cost
2571    metric.  In the future this should be tweaked to compute bytes for
2572    optimize_size and to take into account instruction timings on various CPUs.  */
2573 
2574 static int
2575 ix86_fp_comparison_cost (enum rtx_code code)
2576 {
2577   int arith_cost;
2578 
2579   /* The cost of code using bit-twiddling on %ah.  */
2580   switch (code)
2581     {
2582     case UNLE:
2583     case UNLT:
2584     case LTGT:
2585     case GT:
2586     case GE:
2587     case UNORDERED:
2588     case ORDERED:
2589     case UNEQ:
2590       arith_cost = 4;
2591       break;
2592     case LT:
2593     case NE:
2594     case EQ:
2595     case UNGE:
2596       arith_cost = TARGET_IEEE_FP ? 5 : 4;
2597       break;
2598     case LE:
2599     case UNGT:
2600       arith_cost = TARGET_IEEE_FP ? 6 : 4;
2601       break;
2602     default:
2603       gcc_unreachable ();
2604     }
2605 
2606   switch (ix86_fp_comparison_strategy (code))
2607     {
2608     case IX86_FPCMP_COMI:
2609       return arith_cost > 4 ? 3 : 2;
2610     case IX86_FPCMP_SAHF:
2611       return arith_cost > 4 ? 4 : 3;
2612     default:
2613       return arith_cost;
2614     }
2615 }
2616 
2617 /* Swap, force into registers, or otherwise massage the two operands
2618    to a fp comparison.  The operands are updated in place; the new
2619    comparison code is returned.  */
2620 
2621 static enum rtx_code
2622 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2623 {
2624   bool unordered_compare = ix86_unordered_fp_compare (code);
2625   rtx op0 = *pop0, op1 = *pop1;
2626   machine_mode op_mode = GET_MODE (op0);
2627   bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2628 
2629   /* All of the unordered compare instructions only work on registers.
2630      The same is true of the fcomi compare instructions.  The XFmode
2631      compare instructions require registers except when comparing
2632      against zero or when converting operand 1 from fixed point to
2633      floating point.  */
2634 
2635   if (!is_sse
2636       && (unordered_compare
2637 	  || (op_mode == XFmode
2638 	      && ! (standard_80387_constant_p (op0) == 1
2639 		    || standard_80387_constant_p (op1) == 1)
2640 	      && GET_CODE (op1) != FLOAT)
2641 	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2642     {
2643       op0 = force_reg (op_mode, op0);
2644       op1 = force_reg (op_mode, op1);
2645     }
2646   else
2647     {
2648       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
2649 	 things around if they appear profitable, otherwise force op0
2650 	 into a register.  */
2651 
2652       if (standard_80387_constant_p (op0) == 0
2653 	  || (MEM_P (op0)
2654 	      && ! (standard_80387_constant_p (op1) == 0
2655 		    || MEM_P (op1))))
2656 	{
2657 	  enum rtx_code new_code = ix86_fp_swap_condition (code);
2658 	  if (new_code != UNKNOWN)
2659 	    {
2660 	      std::swap (op0, op1);
2661 	      code = new_code;
2662 	    }
2663 	}
2664 
2665       if (!REG_P (op0))
2666 	op0 = force_reg (op_mode, op0);
2667 
2668       if (CONSTANT_P (op1))
2669 	{
2670 	  int tmp = standard_80387_constant_p (op1);
2671 	  if (tmp == 0)
2672 	    op1 = validize_mem (force_const_mem (op_mode, op1));
2673 	  else if (tmp == 1)
2674 	    {
2675 	      if (TARGET_CMOVE)
2676 		op1 = force_reg (op_mode, op1);
2677 	    }
2678 	  else
2679 	    op1 = force_reg (op_mode, op1);
2680 	}
2681     }
2682 
2683   /* Try to rearrange the comparison to make it cheaper.  */
2684   if (ix86_fp_comparison_cost (code)
2685       > ix86_fp_comparison_cost (swap_condition (code))
2686       && (REG_P (op1) || can_create_pseudo_p ()))
2687     {
2688       std::swap (op0, op1);
2689       code = swap_condition (code);
2690       if (!REG_P (op0))
2691 	op0 = force_reg (op_mode, op0);
2692     }
2693 
2694   *pop0 = op0;
2695   *pop1 = op1;
2696   return code;
2697 }
2698 
2699 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
2700 
2701 static rtx
2702 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2703 {
2704   bool unordered_compare = ix86_unordered_fp_compare (code);
2705   machine_mode cmp_mode;
2706   rtx tmp, scratch;
2707 
2708   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2709 
2710   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2711   if (unordered_compare)
2712     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2713 
2714   /* Do fcomi/sahf based test when profitable.  */
2715   switch (ix86_fp_comparison_strategy (code))
2716     {
2717     case IX86_FPCMP_COMI:
2718       cmp_mode = CCFPmode;
2719       emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2720       break;
2721 
2722     case IX86_FPCMP_SAHF:
2723       cmp_mode = CCFPmode;
2724       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2725       scratch = gen_reg_rtx (HImode);
2726       emit_insn (gen_rtx_SET (scratch, tmp));
2727       emit_insn (gen_x86_sahf_1 (scratch));
2728       break;
2729 
2730     case IX86_FPCMP_ARITH:
2731       cmp_mode = CCNOmode;
2732       tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2733       scratch = gen_reg_rtx (HImode);
2734       emit_insn (gen_rtx_SET (scratch, tmp));
2735 
2736       /* In the unordered case, we have to check C2 for NaNs, which
2737 	 doesn't happen to work out to anything nice combination-wise.
2738 	 So do some bit twiddling on the value we've got in AH to come
2739 	 up with an appropriate set of condition codes.  */
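      /* After fnstsw, AH holds C0 in bit 0 (0x01), C2 in bit 2 (0x04)
	 and C3 in bit 6 (0x40).  The FPU compare sets C3:C2:C0 to 0:0:0
	 for ST(0) > operand, 0:0:1 for <, 1:0:0 for equal and 1:1:1 for
	 unordered, which is what the 0x45/0x44/0x40/0x05/0x04/0x01
	 masks below pick apart.  */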
2740 
2741       switch (code)
2742 	{
2743 	case GT:
2744 	case UNGT:
2745 	  if (code == GT || !TARGET_IEEE_FP)
2746 	    {
2747 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2748 	      code = EQ;
2749 	    }
2750 	  else
2751 	    {
2752 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2753 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2754 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2755 	      cmp_mode = CCmode;
2756 	      code = GEU;
2757 	    }
2758 	  break;
2759 	case LT:
2760 	case UNLT:
2761 	  if (code == LT && TARGET_IEEE_FP)
2762 	    {
2763 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2764 	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2765 	      cmp_mode = CCmode;
2766 	      code = EQ;
2767 	    }
2768 	  else
2769 	    {
2770 	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2771 	      code = NE;
2772 	    }
2773 	  break;
2774 	case GE:
2775 	case UNGE:
2776 	  if (code == GE || !TARGET_IEEE_FP)
2777 	    {
2778 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2779 	      code = EQ;
2780 	    }
2781 	  else
2782 	    {
2783 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2784 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2785 	      code = NE;
2786 	    }
2787 	  break;
2788 	case LE:
2789 	case UNLE:
2790 	  if (code == LE && TARGET_IEEE_FP)
2791 	    {
2792 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2793 	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2794 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2795 	      cmp_mode = CCmode;
2796 	      code = LTU;
2797 	    }
2798 	  else
2799 	    {
2800 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2801 	      code = NE;
2802 	    }
2803 	  break;
2804 	case EQ:
2805 	case UNEQ:
2806 	  if (code == EQ && TARGET_IEEE_FP)
2807 	    {
2808 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2809 	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2810 	      cmp_mode = CCmode;
2811 	      code = EQ;
2812 	    }
2813 	  else
2814 	    {
2815 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2816 	      code = NE;
2817 	    }
2818 	  break;
2819 	case NE:
2820 	case LTGT:
2821 	  if (code == NE && TARGET_IEEE_FP)
2822 	    {
2823 	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2824 	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2825 					     GEN_INT (0x40)));
2826 	      code = NE;
2827 	    }
2828 	  else
2829 	    {
2830 	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2831 	      code = EQ;
2832 	    }
2833 	  break;
2834 
2835 	case UNORDERED:
2836 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2837 	  code = NE;
2838 	  break;
2839 	case ORDERED:
2840 	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2841 	  code = EQ;
2842 	  break;
2843 
2844 	default:
2845 	  gcc_unreachable ();
2846 	}
2847 	break;
2848 
2849     default:
2850       gcc_unreachable ();
2851     }
2852 
2853   /* Return the test that should be put into the flags user, i.e.
2854      the bcc, scc, or cmov instruction.  */
2855   return gen_rtx_fmt_ee (code, VOIDmode,
2856 			 gen_rtx_REG (cmp_mode, FLAGS_REG),
2857 			 const0_rtx);
2858 }
2859 
2860 /* Generate insn patterns to do an integer compare of OPERANDS.  */
2861 
2862 static rtx
2863 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2864 {
2865   machine_mode cmpmode;
2866   rtx tmp, flags;
2867 
2868   /* Swap operands to emit carry flag comparison.  */
2869   if ((code == GTU || code == LEU)
2870       && nonimmediate_operand (op1, VOIDmode))
2871     {
2872       std::swap (op0, op1);
2873       code = swap_condition (code);
2874     }
2875 
2876   cmpmode = SELECT_CC_MODE (code, op0, op1);
2877   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2878 
2879   /* This is very simple, but making the interface the same as in the
2880      FP case makes the rest of the code easier.  */
2881   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2882   emit_insn (gen_rtx_SET (flags, tmp));
2883 
2884   /* Return the test that should be put into the flags user, i.e.
2885      the bcc, scc, or cmov instruction.  */
2886   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2887 }
2888 
2889 static rtx
2890 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2891 {
2892   rtx ret;
2893 
2894   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2895     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2896 
2897   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2898     {
2899       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2900       ret = ix86_expand_fp_compare (code, op0, op1);
2901     }
2902   else
2903     ret = ix86_expand_int_compare (code, op0, op1);
2904 
2905   return ret;
2906 }
2907 
2908 void
2909 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2910 {
2911   rtx ret;
2912 
2913   gcc_assert (GET_MODE (dest) == QImode);
2914 
2915   ret = ix86_expand_compare (code, op0, op1);
2916   PUT_MODE (ret, QImode);
2917   emit_insn (gen_rtx_SET (dest, ret));
2918 }
2919 
2920 /* Expand floating point op0 <=> op1, i.e.
2921    dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */
2922 
2923 void
2924 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
2925 {
2926   gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
2927   rtx gt = ix86_expand_fp_compare (GT, op0, op1);
2928   rtx l0 = gen_label_rtx ();
2929   rtx l1 = gen_label_rtx ();
2930   rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
2931   rtx lend = gen_label_rtx ();
2932   rtx tmp;
2933   rtx_insn *jmp;
2934   if (l2)
2935     {
2936       rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
2937 			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2938       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
2939 				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
2940       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2941       add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
2942     }
2943   rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
2944 			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2945   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
2946 			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
2947   jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2948   add_reg_br_prob_note (jmp, profile_probability::unlikely ());
2949   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
2950 			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
2951   jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2952   add_reg_br_prob_note (jmp, profile_probability::even ());
2953   emit_move_insn (dest, constm1_rtx);
2954   emit_jump (lend);
2955   emit_label (l0);
2956   emit_move_insn (dest, const0_rtx);
2957   emit_jump (lend);
2958   emit_label (l1);
2959   emit_move_insn (dest, const1_rtx);
2960   emit_jump (lend);
2961   if (l2)
2962     {
2963       emit_label (l2);
2964       emit_move_insn (dest, const2_rtx);
2965     }
2966   emit_label (lend);
2967 }
2968 
2969 /* Expand comparison setting or clearing carry flag.  Return true when
2970    successful and set pop for the operation.  */
2971 static bool
2972 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2973 {
2974   machine_mode mode
2975     = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2976 
2977   /* Do not handle double-mode compares that go through special path.  */
2978   if (mode == (TARGET_64BIT ? TImode : DImode))
2979     return false;
2980 
2981   if (SCALAR_FLOAT_MODE_P (mode))
2982     {
2983       rtx compare_op;
2984       rtx_insn *compare_seq;
2985 
2986       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2987 
2988       /* Shortcut: the following common codes never translate
2989 	 into carry flag compares.  */
2990       if (code == EQ || code == NE || code == UNEQ || code == LTGT
2991 	  || code == ORDERED || code == UNORDERED)
2992 	return false;
2993 
2994       /* These comparisons require zero flag; swap operands so they won't.  */
2995       if ((code == GT || code == UNLE || code == LE || code == UNGT)
2996 	  && !TARGET_IEEE_FP)
2997 	{
2998 	  std::swap (op0, op1);
2999 	  code = swap_condition (code);
3000 	}
3001 
3002       /* Try to expand the comparison and verify that we end up with
3003 	 a carry-flag-based comparison.  This fails only when we decide
3004 	 to expand the comparison using arithmetic, which is not a very
3005 	 common scenario.  */
3006       start_sequence ();
3007       compare_op = ix86_expand_fp_compare (code, op0, op1);
3008       compare_seq = get_insns ();
3009       end_sequence ();
3010 
3011       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3012         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3013       else
3014 	code = GET_CODE (compare_op);
3015 
3016       if (code != LTU && code != GEU)
3017 	return false;
3018 
3019       emit_insn (compare_seq);
3020       *pop = compare_op;
3021       return true;
3022     }
3023 
3024   if (!INTEGRAL_MODE_P (mode))
3025     return false;
3026 
3027   switch (code)
3028     {
3029     case LTU:
3030     case GEU:
3031       break;
3032 
3033     /* Convert a==0 into (unsigned)a<1.  */
3034     case EQ:
3035     case NE:
3036       if (op1 != const0_rtx)
3037 	return false;
3038       op1 = const1_rtx;
3039       code = (code == EQ ? LTU : GEU);
3040       break;
3041 
3042     /* Convert a>b into b<a or a>=b-1.  */
3043     case GTU:
3044     case LEU:
3045       if (CONST_INT_P (op1))
3046 	{
3047 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3048 	  /* Bail out on overflow.  We could still swap the operands, but
3049 	     that would force loading the constant into a register.  */
3050 	  if (op1 == const0_rtx
3051 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3052 	    return false;
3053 	  code = (code == GTU ? GEU : LTU);
3054 	}
3055       else
3056 	{
3057 	  std::swap (op0, op1);
3058 	  code = (code == GTU ? LTU : GEU);
3059 	}
3060       break;
3061 
3062     /* Convert a>=0 into (unsigned)a<0x80000000.  */
3063     case LT:
3064     case GE:
3065       if (mode == DImode || op1 != const0_rtx)
3066 	return false;
3067       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3068       code = (code == LT ? GEU : LTU);
3069       break;
3070     case LE:
3071     case GT:
3072       if (mode == DImode || op1 != constm1_rtx)
3073 	return false;
3074       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3075       code = (code == LE ? GEU : LTU);
3076       break;
3077 
3078     default:
3079       return false;
3080     }
3081   /* Swapping operands may cause a constant to appear as the first operand.  */
3082   if (!nonimmediate_operand (op0, VOIDmode))
3083     {
3084       if (!can_create_pseudo_p ())
3085 	return false;
3086       op0 = force_reg (mode, op0);
3087     }
3088   *pop = ix86_expand_compare (code, op0, op1);
3089   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3090   return true;
3091 }
3092 
3093 /* Expand conditional increment or decrement using adc/sbb instructions.
3094    The default case using setcc followed by the conditional move can be
3095    done by generic code.  */
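/* For example, with unsigned operands,  x += (a < b)  can be emitted as

     cmp a, b      ; CF = (a < b)
     adc x, 0      ; x += CF

   and  x -= (a < b)  as the analogous cmp/sbb pair, instead of a setcc
   followed by an add or a conditional move.  */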
3096 bool
3097 ix86_expand_int_addcc (rtx operands[])
3098 {
3099   enum rtx_code code = GET_CODE (operands[1]);
3100   rtx flags;
3101   rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3102   rtx compare_op;
3103   rtx val = const0_rtx;
3104   bool fpcmp = false;
3105   machine_mode mode;
3106   rtx op0 = XEXP (operands[1], 0);
3107   rtx op1 = XEXP (operands[1], 1);
3108 
3109   if (operands[3] != const1_rtx
3110       && operands[3] != constm1_rtx)
3111     return false;
3112   if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3113      return false;
3114   code = GET_CODE (compare_op);
3115 
3116   flags = XEXP (compare_op, 0);
3117 
3118   if (GET_MODE (flags) == CCFPmode)
3119     {
3120       fpcmp = true;
3121       code = ix86_fp_compare_code_to_integer (code);
3122     }
3123 
3124   if (code != LTU)
3125     {
3126       val = constm1_rtx;
3127       if (fpcmp)
3128 	PUT_CODE (compare_op,
3129 		  reverse_condition_maybe_unordered
3130 		    (GET_CODE (compare_op)));
3131       else
3132 	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3133     }
3134 
3135   mode = GET_MODE (operands[0]);
3136 
3137   /* Construct either adc or sbb insn.  */
3138   if ((code == LTU) == (operands[3] == constm1_rtx))
3139     insn = gen_sub3_carry;
3140   else
3141     insn = gen_add3_carry;
3142 
3143   emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3144 
3145   return true;
3146 }
3147 
3148 bool
3149 ix86_expand_int_movcc (rtx operands[])
3150 {
3151   enum rtx_code code = GET_CODE (operands[1]), compare_code;
3152   rtx_insn *compare_seq;
3153   rtx compare_op;
3154   machine_mode mode = GET_MODE (operands[0]);
3155   bool sign_bit_compare_p = false;
3156   rtx op0 = XEXP (operands[1], 0);
3157   rtx op1 = XEXP (operands[1], 1);
3158   rtx op2 = operands[2];
3159   rtx op3 = operands[3];
3160 
3161   if (GET_MODE (op0) == TImode
3162       || (GET_MODE (op0) == DImode
3163 	  && !TARGET_64BIT))
3164     return false;
3165 
3166   start_sequence ();
3167   compare_op = ix86_expand_compare (code, op0, op1);
3168   compare_seq = get_insns ();
3169   end_sequence ();
3170 
3171   compare_code = GET_CODE (compare_op);
3172 
3173   if ((op1 == const0_rtx && (code == GE || code == LT))
3174       || (op1 == constm1_rtx && (code == GT || code == LE)))
3175     sign_bit_compare_p = true;
3176 
3177   /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3178      but if op1 is a constant, the latter form allows more optimizations,
3179      either through the handling of the case where the last two operands
3180      are both constants, or through the one-constant-one-variable cases.
3181      On the other hand, for cmov the former might be better, as we don't
3182      need to load the constant into another register.  */
3183   if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3184     op2 = op1;
3185   /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
3186   else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3187     op3 = op1;
3188 
3189   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3190      HImode insns, we'd be swallowed in word prefix ops.  */
3191 
3192   if ((mode != HImode || TARGET_FAST_PREFIX)
3193       && (mode != (TARGET_64BIT ? TImode : DImode))
3194       && CONST_INT_P (op2)
3195       && CONST_INT_P (op3))
3196     {
3197       rtx out = operands[0];
3198       HOST_WIDE_INT ct = INTVAL (op2);
3199       HOST_WIDE_INT cf = INTVAL (op3);
3200       HOST_WIDE_INT diff;
3201 
3202       diff = ct - cf;
3203       /* Sign bit compares are better done using shifts than by using
3204 	  sbb.  */
3205       if (sign_bit_compare_p
3206 	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3207 	{
3208 	  /* Detect overlap between destination and compare sources.  */
3209 	  rtx tmp = out;
3210 
3211           if (!sign_bit_compare_p)
3212 	    {
3213 	      rtx flags;
3214 	      bool fpcmp = false;
3215 
3216 	      compare_code = GET_CODE (compare_op);
3217 
3218 	      flags = XEXP (compare_op, 0);
3219 
3220 	      if (GET_MODE (flags) == CCFPmode)
3221 		{
3222 		  fpcmp = true;
3223 		  compare_code
3224 		    = ix86_fp_compare_code_to_integer (compare_code);
3225 		}
3226 
3227 	      /* To simplify rest of code, restrict to the GEU case.  */
3228 	      if (compare_code == LTU)
3229 		{
3230 		  std::swap (ct, cf);
3231 		  compare_code = reverse_condition (compare_code);
3232 		  code = reverse_condition (code);
3233 		}
3234 	      else
3235 		{
3236 		  if (fpcmp)
3237 		    PUT_CODE (compare_op,
3238 			      reverse_condition_maybe_unordered
3239 			        (GET_CODE (compare_op)));
3240 		  else
3241 		    PUT_CODE (compare_op,
3242 			      reverse_condition (GET_CODE (compare_op)));
3243 		}
3244 	      diff = ct - cf;
3245 
3246 	      if (reg_overlap_mentioned_p (out, op0)
3247 		  || reg_overlap_mentioned_p (out, op1))
3248 		tmp = gen_reg_rtx (mode);
3249 
3250 	      if (mode == DImode)
3251 		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3252 	      else
3253 		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
3254 						 flags, compare_op));
3255 	    }
3256 	  else
3257 	    {
3258 	      if (code == GT || code == GE)
3259 		code = reverse_condition (code);
3260 	      else
3261 		{
3262 		  std::swap (ct, cf);
3263 		  diff = ct - cf;
3264 		}
3265 	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3266 	    }
3267 
3268 	  if (diff == 1)
3269 	    {
3270 	      /*
3271 	       * cmpl op0,op1
3272 	       * sbbl dest,dest
3273 	       * [addl dest, ct]
3274 	       *
3275 	       * Size 5 - 8.
3276 	       */
3277 	      if (ct)
3278 		tmp = expand_simple_binop (mode, PLUS,
3279 					   tmp, GEN_INT (ct),
3280 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3281 	    }
3282 	  else if (cf == -1)
3283 	    {
3284 	      /*
3285 	       * cmpl op0,op1
3286 	       * sbbl dest,dest
3287 	       * orl $ct, dest
3288 	       *
3289 	       * Size 8.
3290 	       */
3291 	      tmp = expand_simple_binop (mode, IOR,
3292 					 tmp, GEN_INT (ct),
3293 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3294 	    }
3295 	  else if (diff == -1 && ct)
3296 	    {
3297 	      /*
3298 	       * cmpl op0,op1
3299 	       * sbbl dest,dest
3300 	       * notl dest
3301 	       * [addl dest, cf]
3302 	       *
3303 	       * Size 8 - 11.
3304 	       */
3305 	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3306 	      if (cf)
3307 		tmp = expand_simple_binop (mode, PLUS,
3308 					   copy_rtx (tmp), GEN_INT (cf),
3309 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3310 	    }
3311 	  else
3312 	    {
3313 	      /*
3314 	       * cmpl op0,op1
3315 	       * sbbl dest,dest
3316 	       * [notl dest]
3317 	       * andl cf - ct, dest
3318 	       * [addl dest, ct]
3319 	       *
3320 	       * Size 8 - 11.
3321 	       */
3322 
3323 	      if (cf == 0)
3324 		{
3325 		  cf = ct;
3326 		  ct = 0;
3327 		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3328 		}
3329 
3330 	      tmp = expand_simple_binop (mode, AND,
3331 					 copy_rtx (tmp),
3332 					 gen_int_mode (cf - ct, mode),
3333 					 copy_rtx (tmp), 1, OPTAB_DIRECT);
3334 	      if (ct)
3335 		tmp = expand_simple_binop (mode, PLUS,
3336 					   copy_rtx (tmp), GEN_INT (ct),
3337 					   copy_rtx (tmp), 1, OPTAB_DIRECT);
3338 	    }
3339 
3340 	  if (!rtx_equal_p (tmp, out))
3341 	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3342 
3343 	  return true;
3344 	}
3345 
3346       if (diff < 0)
3347 	{
3348 	  machine_mode cmp_mode = GET_MODE (op0);
3349 	  enum rtx_code new_code;
3350 
3351 	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
3352 	    {
3353 	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3354 
3355 	      /* We may be reversing a non-trapping
3356 		 comparison to a trapping comparison.  */
3357 	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
3358 		  && code != EQ && code != NE
3359 		  && code != ORDERED && code != UNORDERED)
3360 		new_code = UNKNOWN;
3361 	      else
3362 		new_code = reverse_condition_maybe_unordered (code);
3363 	    }
3364 	  else
3365 	    new_code = ix86_reverse_condition (code, cmp_mode);
3366 	  if (new_code != UNKNOWN)
3367 	    {
3368 	      std::swap (ct, cf);
3369 	      diff = -diff;
3370 	      code = new_code;
3371 	    }
3372 	}
3373 
3374       compare_code = UNKNOWN;
3375       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3376 	  && CONST_INT_P (op1))
3377 	{
3378 	  if (op1 == const0_rtx
3379 	      && (code == LT || code == GE))
3380 	    compare_code = code;
3381 	  else if (op1 == constm1_rtx)
3382 	    {
3383 	      if (code == LE)
3384 		compare_code = LT;
3385 	      else if (code == GT)
3386 		compare_code = GE;
3387 	    }
3388 	}
3389 
3390       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
3391       if (compare_code != UNKNOWN
3392 	  && GET_MODE (op0) == GET_MODE (out)
3393 	  && (cf == -1 || ct == -1))
3394 	{
3395 	  /* If lea code below could be used, only optimize
3396 	     if it results in a 2 insn sequence.  */
3397 
3398 	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3399 		 || diff == 3 || diff == 5 || diff == 9)
3400 	      || (compare_code == LT && ct == -1)
3401 	      || (compare_code == GE && cf == -1))
3402 	    {
3403 	      /*
3404 	       * notl op1	(if necessary)
3405 	       * sarl $31, op1
3406 	       * orl cf, op1
3407 	       */
3408 	      if (ct != -1)
3409 		{
3410 		  cf = ct;
3411 		  ct = -1;
3412 		  code = reverse_condition (code);
3413 		}
3414 
3415 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3416 
3417 	      out = expand_simple_binop (mode, IOR,
3418 					 out, GEN_INT (cf),
3419 					 out, 1, OPTAB_DIRECT);
3420 	      if (out != operands[0])
3421 		emit_move_insn (operands[0], out);
3422 
3423 	      return true;
3424 	    }
3425 	}
3426 
3427 
3428       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3429 	   || diff == 3 || diff == 5 || diff == 9)
3430 	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3431 	  && (mode != DImode
3432 	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3433 	{
3434 	  /*
3435 	   * xorl dest,dest
3436 	   * cmpl op1,op2
3437 	   * setcc dest
3438 	   * lea cf(dest*(ct-cf)),dest
3439 	   *
3440 	   * Size 14.
3441 	   *
3442 	   * This also catches the degenerate setcc-only case.
3443 	   */
3444 
3445 	  rtx tmp;
3446 	  int nops;
3447 
3448 	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3449 
3450 	  nops = 0;
3451 	  /* On x86_64 the lea instruction operates on Pmode, so we need
3452 	     to get the arithmetic done in the proper mode to match.  */
3453 	  if (diff == 1)
3454 	    tmp = copy_rtx (out);
3455 	  else
3456 	    {
3457 	      rtx out1;
3458 	      out1 = copy_rtx (out);
3459 	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3460 	      nops++;
3461 	      if (diff & 1)
3462 		{
3463 		  tmp = gen_rtx_PLUS (mode, tmp, out1);
3464 		  nops++;
3465 		}
3466 	    }
3467 	  if (cf != 0)
3468 	    {
3469 	      tmp = plus_constant (mode, tmp, cf);
3470 	      nops++;
3471 	    }
3472 	  if (!rtx_equal_p (tmp, out))
3473 	    {
3474 	      if (nops == 1)
3475 		out = force_operand (tmp, copy_rtx (out));
3476 	      else
3477 		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3478 	    }
3479 	  if (!rtx_equal_p (out, operands[0]))
3480 	    emit_move_insn (operands[0], copy_rtx (out));
3481 
3482 	  return true;
3483 	}
3484 
3485       /*
3486        * General case:			Jumpful:
3487        *   xorl dest,dest		cmpl op1, op2
3488        *   cmpl op1, op2		movl ct, dest
3489        *   setcc dest			jcc 1f
3490        *   decl dest			movl cf, dest
3491        *   andl (cf-ct),dest		1:
3492        *   addl ct,dest
3493        *
3494        * Size 20.			Size 14.
3495        *
3496        * This is reasonably steep, but branch mispredict costs are
3497        * high on modern cpus, so consider failing only if optimizing
3498        * for space.
3499        */
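      /* Added walk-through of the jumpless column: setcc gives 1/0 for a
	 true/false condition, decl turns that into 0/-1, andl (cf-ct)
	 leaves 0 or cf-ct, and addl ct finishes with ct for a true
	 condition and cf for a false one.  */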
3500 
3501       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3502 	  && BRANCH_COST (optimize_insn_for_speed_p (),
3503 		  	  false) >= 2)
3504 	{
3505 	  if (cf == 0)
3506 	    {
3507 	      machine_mode cmp_mode = GET_MODE (op0);
3508 	      enum rtx_code new_code;
3509 
3510 	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
3511 		{
3512 		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3513 
3514 		  /* We may be reversing a non-trapping
3515 		     comparison to a trapping comparison.  */
3516 		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
3517 		      && code != EQ && code != NE
3518 		      && code != ORDERED && code != UNORDERED)
3519 		    new_code = UNKNOWN;
3520 		  else
3521 		    new_code = reverse_condition_maybe_unordered (code);
3522 
3523 		}
3524 	      else
3525 		{
3526 		  new_code = ix86_reverse_condition (code, cmp_mode);
3527 		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
3528 		    compare_code = reverse_condition (compare_code);
3529 		}
3530 
3531 	      if (new_code != UNKNOWN)
3532 		{
3533 		  cf = ct;
3534 		  ct = 0;
3535 		  code = new_code;
3536 		}
3537 	    }
3538 
3539 	  if (compare_code != UNKNOWN)
3540 	    {
3541 	      /* notl op1	(if needed)
3542 		 sarl $31, op1
3543 		 andl (cf-ct), op1
3544 		 addl ct, op1
3545 
3546 		 For x < 0 (resp. x <= -1) there will be no notl,
3547 		 so if possible swap the constants to get rid of the
3548 		 complement.
3549 		 True/false will be -1/0 while code below (store flag
3550 		 followed by decrement) is 0/-1, so the constants need
3551 		 to be exchanged once more.  */
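	      /* Concrete example (added): dest = (x < 0) ? 7 : 12 becomes
		 sarl $31 (-1/0), andl $-5 (7 - 12), addl $12, i.e. 7 or
		 12, with the constants swapped below so that no notl is
		 required.  */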
3552 
3553 	      if (compare_code == GE || !cf)
3554 		{
3555 		  code = reverse_condition (code);
3556 		  compare_code = LT;
3557 		}
3558 	      else
3559 		std::swap (ct, cf);
3560 
3561 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3562 	    }
3563 	  else
3564 	    {
3565 	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3566 
3567 	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3568 					 constm1_rtx,
3569 					 copy_rtx (out), 1, OPTAB_DIRECT);
3570 	    }
3571 
3572 	  out = expand_simple_binop (mode, AND, copy_rtx (out),
3573 				     gen_int_mode (cf - ct, mode),
3574 				     copy_rtx (out), 1, OPTAB_DIRECT);
3575 	  if (ct)
3576 	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3577 				       copy_rtx (out), 1, OPTAB_DIRECT);
3578 	  if (!rtx_equal_p (out, operands[0]))
3579 	    emit_move_insn (operands[0], copy_rtx (out));
3580 
3581 	  return true;
3582 	}
3583     }
3584 
3585   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3586     {
3587       /* Try a few things more with specific constants and a variable.  */
3588 
3589       optab op;
3590       rtx var, orig_out, out, tmp;
3591 
3592       if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3593 	return false;
3594 
3595       operands[2] = op2;
3596       operands[3] = op3;
3597 
3598       /* If one of the two operands is an interesting constant, load a
3599 	 constant with the above and mask it in with a logical operation.  */
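      /* Sketch of the transformation (added): cond ? x : 0 is rewritten
	 as x & (cond ? -1 : 0) and cond ? x : -1 as x | (cond ? 0 : -1)
	 (and symmetrically when the constant is the "true" arm), so the
	 recursive call below only has to materialize the 0/-1 value.  */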
3600 
3601       if (CONST_INT_P (operands[2]))
3602 	{
3603 	  var = operands[3];
3604 	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3605 	    operands[3] = constm1_rtx, op = and_optab;
3606 	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3607 	    operands[3] = const0_rtx, op = ior_optab;
3608 	  else
3609 	    return false;
3610 	}
3611       else if (CONST_INT_P (operands[3]))
3612 	{
3613 	  var = operands[2];
3614 	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3615 	    {
3616 	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3617 		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
3618 	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3619 		operands[1] = simplify_gen_relational (LT, VOIDmode,
3620 						       GET_MODE (op0),
3621 						       op0, const0_rtx);
3622 
3623 	      operands[2] = constm1_rtx;
3624 	      op = and_optab;
3625 	    }
3626 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3627 	    operands[2] = const0_rtx, op = ior_optab;
3628 	  else
3629 	    return false;
3630 	}
3631       else
3632         return false;
3633 
3634       orig_out = operands[0];
3635       tmp = gen_reg_rtx (mode);
3636       operands[0] = tmp;
3637 
3638       /* Recurse to get the constant loaded.  */
3639       if (!ix86_expand_int_movcc (operands))
3640         return false;
3641 
3642       /* Mask in the interesting variable.  */
3643       out = expand_binop (mode, op, var, tmp, orig_out, 0,
3644 			  OPTAB_WIDEN);
3645       if (!rtx_equal_p (out, orig_out))
3646 	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3647 
3648       return true;
3649     }
3650 
3651   /*
3652    * For comparison with above,
3653    *
3654    * movl cf,dest
3655    * movl ct,tmp
3656    * cmpl op1,op2
3657    * cmovcc tmp,dest
3658    *
3659    * Size 15.
3660    */
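  /* Added note: cmovcc has no immediate form and no byte-sized variant,
     and the insn used here allows at most one memory arm, which is why
     constants and, for QImode, memory operands are forced into registers
     below.  */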
3661 
3662   if (! nonimmediate_operand (operands[2], mode))
3663     operands[2] = force_reg (mode, operands[2]);
3664   if (! nonimmediate_operand (operands[3], mode))
3665     operands[3] = force_reg (mode, operands[3]);
3666 
3667   if (! register_operand (operands[2], VOIDmode)
3668       && (mode == QImode
3669           || ! register_operand (operands[3], VOIDmode)))
3670     operands[2] = force_reg (mode, operands[2]);
3671 
3672   if (mode == QImode
3673       && ! register_operand (operands[3], VOIDmode))
3674     operands[3] = force_reg (mode, operands[3]);
3675 
3676   emit_insn (compare_seq);
3677   emit_insn (gen_rtx_SET (operands[0],
3678 			  gen_rtx_IF_THEN_ELSE (mode,
3679 						compare_op, operands[2],
3680 						operands[3])));
3681   return true;
3682 }
3683 
3684 /* Detect conditional moves that exactly match min/max operational
3685    semantics.  Note that this is IEEE safe, as long as we don't
3686    interchange the operands.
3687 
3688    Returns FALSE if this conditional move doesn't match a MIN/MAX,
3689    and TRUE if the operation is successful and instructions are emitted.  */
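/* Added illustration: a < b ? a : b maps to minss/minps and
   a < b ? b : a to maxss/maxps.  The operand order must be preserved
   because the SSE min/max instructions return their second source
   operand when the inputs are unordered (NaN) or compare equal, as
   -0.0 and +0.0 do.  */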
3690 
3691 static bool
3692 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3693 			   rtx cmp_op1, rtx if_true, rtx if_false)
3694 {
3695   machine_mode mode;
3696   bool is_min;
3697   rtx tmp;
3698 
3699   if (code == LT)
3700     ;
3701   else if (code == UNGE)
3702     std::swap (if_true, if_false);
3703   else
3704     return false;
3705 
3706   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3707     is_min = true;
3708   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3709     is_min = false;
3710   else
3711     return false;
3712 
3713   mode = GET_MODE (dest);
3714 
3715   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3716      but MODE may be a vector mode and thus not appropriate.  */
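  /* Added note: the SMIN/SMAX rtx codes leave the NaN and -0.0/+0.0
     cases unspecified, so when those cases matter the expansion wraps
     the operands in an UNSPEC to pin down the exact minss/maxss
     behaviour instead.  */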
3717   if (!flag_finite_math_only || flag_signed_zeros)
3718     {
3719       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3720       rtvec v;
3721 
3722       if_true = force_reg (mode, if_true);
3723       v = gen_rtvec (2, if_true, if_false);
3724       tmp = gen_rtx_UNSPEC (mode, v, u);
3725     }
3726   else
3727     {
3728       code = is_min ? SMIN : SMAX;
3729       if (MEM_P (if_true) && MEM_P (if_false))
3730 	if_true = force_reg (mode, if_true);
3731       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3732     }
3733 
3734   emit_insn (gen_rtx_SET (dest, tmp));
3735   return true;
3736 }
3737 
3738 /* Return true if MODE is valid for a vector compare to a mask register;
3739    the same holds for a conditional vector move with a mask register.  */
3740 static bool
3741 ix86_valid_mask_cmp_mode (machine_mode mode)
3742 {
3743   /* XOP has its own vector conditional movement.  */
3744   if (TARGET_XOP && !TARGET_AVX512F)
3745     return false;
3746 
3747   /* HFmode only supports vcmpsh whose dest is mask register.  */
3748   if (TARGET_AVX512FP16 && mode == HFmode)
3749     return true;
3750 
3751   /* AVX512F is needed for mask operation.  */
3752   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3753     return false;
3754 
3755   /* AVX512BW is needed for vector QI/HImode,
3756      AVX512VL is needed for 128/256-bit vector.  */
3757   machine_mode inner_mode = GET_MODE_INNER (mode);
3758   int vector_size = GET_MODE_SIZE (mode);
3759   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3760     return false;
3761 
3762   return vector_size == 64 || TARGET_AVX512VL;
3763 }
3764 
3765 /* Return true if integer mask comparison should be used.  */
3766 static bool
3767 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3768 		     rtx op_true, rtx op_false)
3769 {
3770   int vector_size = GET_MODE_SIZE (mode);
3771 
3772   if (cmp_mode == HFmode)
3773     return true;
3774   else if (vector_size < 16)
3775     return false;
3776   else if (vector_size == 64)
3777     return true;
3778   else if (GET_MODE_INNER (cmp_mode) == HFmode)
3779     return true;
3780 
3781   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
3782   gcc_assert (!op_true == !op_false);
3783 
3784   /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3785      vector dest is required.  */
3786   if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3787     return false;
3788 
3789   /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
3790   if (op_false == CONST0_RTX (mode)
3791       || op_true == CONST0_RTX (mode)
3792       || (INTEGRAL_MODE_P (mode)
3793 	  && (op_true == CONSTM1_RTX (mode)
3794 	      || op_false == CONSTM1_RTX (mode))))
3795     return false;
3796 
3797   return true;
3798 }
3799 
3800 /* Expand an SSE comparison.  Return the register with the result.  */
3801 
3802 static rtx
3803 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3804 		     rtx op_true, rtx op_false)
3805 {
3806   machine_mode mode = GET_MODE (dest);
3807   machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3808 
3809   /* In the general case the result of the comparison can differ from the operands' type.  */
3810   machine_mode cmp_mode;
3811 
3812   /* In AVX512F the result of comparison is an integer mask.  */
3813   bool maskcmp = false;
3814   rtx x;
3815 
3816   if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3817     {
3818       unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3819       maskcmp = true;
3820       cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3821     }
3822   else
3823     cmp_mode = cmp_ops_mode;
3824 
3825   cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3826 
3827   bool (*op1_predicate)(rtx, machine_mode)
3828     = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3829 
3830   if (!op1_predicate (cmp_op1, cmp_ops_mode))
3831     cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3832 
3833   if (optimize
3834       || (maskcmp && cmp_mode != mode)
3835       || (op_true && reg_overlap_mentioned_p (dest, op_true))
3836       || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3837     dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3838 
3839   if (maskcmp)
3840     {
3841       bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3842       gcc_assert (ok);
3843       return dest;
3844     }
3845 
3846   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3847 
3848   if (cmp_mode != mode)
3849     {
3850       x = force_reg (cmp_ops_mode, x);
3851       convert_move (dest, x, false);
3852     }
3853   else
3854     emit_insn (gen_rtx_SET (dest, x));
3855 
3856   return dest;
3857 }
3858 
3859 /* Emit x86 binary operand CODE in mode MODE for SSE vector
3860    instructions that can be performed using GP registers.  */
3861 
3862 static void
3863 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3864 		     rtx dst, rtx src1, rtx src2)
3865 {
3866   rtx tmp;
3867 
3868   tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3869 
3870   if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3871       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3872     {
3873       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3874       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3875     }
3876 
3877   emit_insn (tmp);
3878 }
3879 
3880 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3881    operations.  This is used for both scalar and vector conditional moves.  */
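/* Added overview: when no blend or mask-move instruction applies, the
   fallback at the end of this function computes
   DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE), which is why CMP is
   expected to be an all-ones/all-zeros mask per element.  */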
3882 
3883 void
3884 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3885 {
3886   machine_mode mode = GET_MODE (dest);
3887   machine_mode cmpmode = GET_MODE (cmp);
3888   rtx x;
3889 
3890   /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
3891   if (rtx_equal_p (op_true, op_false))
3892     {
3893       emit_move_insn (dest, op_true);
3894       return;
3895     }
3896 
3897   /* If we have an integer mask and FP value then we need
3898      to cast mask to FP mode.  */
3899   if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3900     {
3901       cmp = force_reg (cmpmode, cmp);
3902       cmp = gen_rtx_SUBREG (mode, cmp, 0);
3903     }
3904 
3905   /* In AVX512F the result of comparison is an integer mask.  */
3906   if (mode != cmpmode
3907       && GET_MODE_CLASS (cmpmode) == MODE_INT)
3908     {
3909       gcc_assert (ix86_valid_mask_cmp_mode (mode));
3910       /* Using scalar/vector move with mask register.  */
3911       cmp = force_reg (cmpmode, cmp);
3912       /* Optimize for mask zero.  */
3913       op_true = (op_true != CONST0_RTX (mode)
3914 		 ? force_reg (mode, op_true) : op_true);
3915       op_false = (op_false != CONST0_RTX (mode)
3916 		  ? force_reg (mode, op_false) : op_false);
3917       if (op_true == CONST0_RTX (mode))
3918 	{
3919 	  if (cmpmode == E_DImode && !TARGET_64BIT)
3920 	    {
3921 	      x = gen_reg_rtx (cmpmode);
3922 	      emit_insn (gen_knotdi (x, cmp));
3923 	    }
3924 	  else
3925 	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
3926 	  cmp = x;
3927 	  /* Reverse op_true and op_false.  */
3928 	  std::swap (op_true, op_false);
3929 	}
3930 
3931       if (mode == HFmode)
3932 	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
3933       else
3934 	emit_insn (gen_rtx_SET (dest,
3935 				gen_rtx_VEC_MERGE (mode,
3936 						   op_true, op_false, cmp)));
3937       return;
3938     }
3939 
3940   if (vector_all_ones_operand (op_true, mode)
3941       && op_false == CONST0_RTX (mode))
3942     {
3943       emit_move_insn (dest, cmp);
3944       return;
3945     }
3946   else if (op_false == CONST0_RTX (mode))
3947     {
3948       x = expand_simple_binop (mode, AND, cmp, op_true,
3949 			       dest, 1, OPTAB_DIRECT);
3950       if (x != dest)
3951 	emit_move_insn (dest, x);
3952       return;
3953     }
3954   else if (op_true == CONST0_RTX (mode))
3955     {
3956       op_false = force_reg (mode, op_false);
3957       x = gen_rtx_NOT (mode, cmp);
3958       ix86_emit_vec_binop (AND, mode, dest, x, op_false);
3959       return;
3960     }
3961   else if (vector_all_ones_operand (op_true, mode))
3962     {
3963       x = expand_simple_binop (mode, IOR, cmp, op_false,
3964 			       dest, 1, OPTAB_DIRECT);
3965       if (x != dest)
3966 	emit_move_insn (dest, x);
3967       return;
3968     }
3969 
3970   if (TARGET_XOP)
3971     {
3972       op_true = force_reg (mode, op_true);
3973 
3974       if (GET_MODE_SIZE (mode) < 16
3975 	  || !nonimmediate_operand (op_false, mode))
3976 	op_false = force_reg (mode, op_false);
3977 
3978       emit_insn (gen_rtx_SET (dest,
3979 			      gen_rtx_IF_THEN_ELSE (mode, cmp,
3980 						    op_true, op_false)));
3981       return;
3982     }
3983 
3984   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3985   machine_mode blend_mode = mode;
3986 
3987   if (GET_MODE_SIZE (mode) < 16
3988       || !vector_operand (op_true, mode))
3989     op_true = force_reg (mode, op_true);
3990 
3991   op_false = force_reg (mode, op_false);
3992 
3993   switch (mode)
3994     {
3995     case E_V2SFmode:
3996       if (TARGET_SSE4_1)
3997 	gen = gen_mmx_blendvps;
3998       break;
3999     case E_V4SFmode:
4000       if (TARGET_SSE4_1)
4001 	gen = gen_sse4_1_blendvps;
4002       break;
4003     case E_V2DFmode:
4004       if (TARGET_SSE4_1)
4005 	gen = gen_sse4_1_blendvpd;
4006       break;
4007     case E_SFmode:
4008       if (TARGET_SSE4_1)
4009 	gen = gen_sse4_1_blendvss;
4010       break;
4011     case E_DFmode:
4012       if (TARGET_SSE4_1)
4013 	gen = gen_sse4_1_blendvsd;
4014       break;
4015     case E_V8QImode:
4016     case E_V4HImode:
4017     case E_V2SImode:
4018       if (TARGET_SSE4_1)
4019 	{
4020 	  gen = gen_mmx_pblendvb_v8qi;
4021 	  blend_mode = V8QImode;
4022 	}
4023       break;
4024     case E_V4QImode:
4025     case E_V2HImode:
4026       if (TARGET_SSE4_1)
4027 	{
4028 	  gen = gen_mmx_pblendvb_v4qi;
4029 	  blend_mode = V4QImode;
4030 	}
4031       break;
4032     case E_V2QImode:
4033       if (TARGET_SSE4_1)
4034 	gen = gen_mmx_pblendvb_v2qi;
4035       break;
4036     case E_V16QImode:
4037     case E_V8HImode:
4038     case E_V8HFmode:
4039     case E_V4SImode:
4040     case E_V2DImode:
4041       if (TARGET_SSE4_1)
4042 	{
4043 	  gen = gen_sse4_1_pblendvb;
4044 	  blend_mode = V16QImode;
4045 	}
4046       break;
4047     case E_V8SFmode:
4048       if (TARGET_AVX)
4049 	gen = gen_avx_blendvps256;
4050       break;
4051     case E_V4DFmode:
4052       if (TARGET_AVX)
4053 	gen = gen_avx_blendvpd256;
4054       break;
4055     case E_V32QImode:
4056     case E_V16HImode:
4057     case E_V16HFmode:
4058     case E_V8SImode:
4059     case E_V4DImode:
4060       if (TARGET_AVX2)
4061 	{
4062 	  gen = gen_avx2_pblendvb;
4063 	  blend_mode = V32QImode;
4064 	}
4065       break;
4066 
4067     case E_V64QImode:
4068       gen = gen_avx512bw_blendmv64qi;
4069       break;
4070     case E_V32HImode:
4071       gen = gen_avx512bw_blendmv32hi;
4072       break;
4073     case E_V32HFmode:
4074       gen = gen_avx512bw_blendmv32hf;
4075       break;
4076     case E_V16SImode:
4077       gen = gen_avx512f_blendmv16si;
4078       break;
4079     case E_V8DImode:
4080       gen = gen_avx512f_blendmv8di;
4081       break;
4082     case E_V8DFmode:
4083       gen = gen_avx512f_blendmv8df;
4084       break;
4085     case E_V16SFmode:
4086       gen = gen_avx512f_blendmv16sf;
4087       break;
4088 
4089     default:
4090       break;
4091     }
4092 
4093   if (gen != NULL)
4094     {
4095       if (blend_mode == mode)
4096 	x = dest;
4097       else
4098 	{
4099 	  x = gen_reg_rtx (blend_mode);
4100 	  op_false = gen_lowpart (blend_mode, op_false);
4101 	  op_true = gen_lowpart (blend_mode, op_true);
4102 	  cmp = gen_lowpart (blend_mode, cmp);
4103 	}
4104 
4105       emit_insn (gen (x, op_false, op_true, cmp));
4106 
4107       if (x != dest)
4108 	emit_move_insn (dest, gen_lowpart (mode, x));
4109     }
4110   else
4111     {
4112       rtx t2, t3;
4113 
4114       t2 = expand_simple_binop (mode, AND, op_true, cmp,
4115 				NULL, 1, OPTAB_DIRECT);
4116 
4117       t3 = gen_reg_rtx (mode);
4118       x = gen_rtx_NOT (mode, cmp);
4119       ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4120 
4121       x = expand_simple_binop (mode, IOR, t3, t2,
4122 			       dest, 1, OPTAB_DIRECT);
4123       if (x != dest)
4124 	emit_move_insn (dest, x);
4125     }
4126 }
4127 
4128 /* Swap, force into registers, or otherwise massage the two operands
4129    to an sse comparison with a mask result.  Thus we differ a bit from
4130    ix86_prepare_fp_compare_args which expects to produce a flags result.
4131 
4132    The DEST operand exists to help determine whether to commute commutative
4133    operators.  The POP0/POP1 operands are updated in place.  The new
4134    comparison code is returned, or UNKNOWN if not implementable.  */
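/* Added example: before AVX, cmpps/cmpss only encode the LT, LE, EQ,
   UNORD, NEQ, NLT, NLE and ORD predicates, so e.g. GT a, b is handled
   below by swapping the operands and emitting LT b, a.  */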
4135 
4136 static enum rtx_code
4137 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4138 				  rtx *pop0, rtx *pop1)
4139 {
4140   switch (code)
4141     {
4142     case LTGT:
4143     case UNEQ:
4144       /* AVX supports all the needed comparisons.  */
4145       if (TARGET_AVX)
4146 	break;
4147       /* We have no LTGT as an operator.  We could implement it with
4148 	 NE & ORDERED, but this requires an extra temporary.  It's
4149 	 not clear that it's worth it.  */
4150       return UNKNOWN;
4151 
4152     case LT:
4153     case LE:
4154     case UNGT:
4155     case UNGE:
4156       /* These are supported directly.  */
4157       break;
4158 
4159     case EQ:
4160     case NE:
4161     case UNORDERED:
4162     case ORDERED:
4163       /* AVX has 3 operand comparisons, no need to swap anything.  */
4164       if (TARGET_AVX)
4165 	break;
4166       /* For commutative operators, try to canonicalize the destination
4167 	 operand to be first in the comparison - this helps reload to
4168 	 avoid extra moves.  */
4169       if (!dest || !rtx_equal_p (dest, *pop1))
4170 	break;
4171       /* FALLTHRU */
4172 
4173     case GE:
4174     case GT:
4175     case UNLE:
4176     case UNLT:
4177       /* These are not supported directly before AVX, and furthermore
4178 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
4179 	 comparison operands to transform into something that is
4180 	 supported.  */
4181       std::swap (*pop0, *pop1);
4182       code = swap_condition (code);
4183       break;
4184 
4185     default:
4186       gcc_unreachable ();
4187     }
4188 
4189   return code;
4190 }
4191 
4192 /* Expand a floating-point conditional move.  Return true if successful.  */
4193 
4194 bool
4195 ix86_expand_fp_movcc (rtx operands[])
4196 {
4197   machine_mode mode = GET_MODE (operands[0]);
4198   enum rtx_code code = GET_CODE (operands[1]);
4199   rtx tmp, compare_op;
4200   rtx op0 = XEXP (operands[1], 0);
4201   rtx op1 = XEXP (operands[1], 1);
4202 
4203   if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4204     {
4205       machine_mode cmode;
4206 
4207       /* Since we've no cmove for sse registers, don't force bad register
4208 	 allocation just to gain access to it.  Deny movcc when the
4209 	 comparison mode doesn't match the move mode.  */
4210       cmode = GET_MODE (op0);
4211       if (cmode == VOIDmode)
4212 	cmode = GET_MODE (op1);
4213       if (cmode != mode)
4214 	return false;
4215 
4216       code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4217       if (code == UNKNOWN)
4218 	return false;
4219 
4220       if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4221 				     operands[2], operands[3]))
4222 	return true;
4223 
4224       tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4225 				 operands[2], operands[3]);
4226       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4227       return true;
4228     }
4229 
4230   if (GET_MODE (op0) == TImode
4231       || (GET_MODE (op0) == DImode
4232 	  && !TARGET_64BIT))
4233     return false;
4234 
4235   /* The floating point conditional move instructions don't directly
4236      support conditions resulting from a signed integer comparison.  */
4237 
4238   compare_op = ix86_expand_compare (code, op0, op1);
4239   if (!fcmov_comparison_operator (compare_op, VOIDmode))
4240     {
4241       tmp = gen_reg_rtx (QImode);
4242       ix86_expand_setcc (tmp, code, op0, op1);
4243 
4244       compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4245     }
4246 
4247   emit_insn (gen_rtx_SET (operands[0],
4248 			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
4249 						operands[2], operands[3])));
4250 
4251   return true;
4252 }
4253 
4254 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
4255 
4256 static int
4257 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4258 {
4259   switch (code)
4260     {
4261     case EQ:
4262       return 0;
4263     case LT:
4264     case LTU:
4265       return 1;
4266     case LE:
4267     case LEU:
4268       return 2;
4269     case NE:
4270       return 4;
4271     case GE:
4272     case GEU:
4273       return 5;
4274     case GT:
4275     case GTU:
4276       return 6;
4277     default:
4278       gcc_unreachable ();
4279     }
4280 }
4281 
4282 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
4283 
4284 static int
4285 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4286 {
4287   switch (code)
4288     {
4289     case EQ:
4290       return 0x00;
4291     case NE:
4292       return 0x04;
4293     case GT:
4294       return 0x0e;
4295     case LE:
4296       return 0x02;
4297     case GE:
4298       return 0x0d;
4299     case LT:
4300       return 0x01;
4301     case UNLE:
4302       return 0x0a;
4303     case UNLT:
4304       return 0x09;
4305     case UNGE:
4306       return 0x05;
4307     case UNGT:
4308       return 0x06;
4309     case UNEQ:
4310       return 0x18;
4311     case LTGT:
4312       return 0x0c;
4313     case ORDERED:
4314       return 0x07;
4315     case UNORDERED:
4316       return 0x03;
4317     default:
4318       gcc_unreachable ();
4319     }
4320 }
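/* Added note: the values above are the 5-bit predicate immediates of the
   EVEX vcmpps/vcmppd instructions (e.g. 0x00 is the ordered quiet EQ
   predicate and 0x04 the unordered quiet NEQ predicate); they are fed
   into the UNSPEC_PCMP expansion below.  */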
4321 
4322 /* Return immediate value to be used in UNSPEC_PCMP
4323    for comparison CODE in MODE.  */
4324 
4325 static int
4326 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4327 {
4328   if (FLOAT_MODE_P (mode))
4329     return ix86_fp_cmp_code_to_pcmp_immediate (code);
4330   return ix86_int_cmp_code_to_pcmp_immediate (code);
4331 }
4332 
4333 /* Expand AVX-512 vector comparison.  */
4334 
4335 bool
4336 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4337 {
4338   machine_mode mask_mode = GET_MODE (dest);
4339   machine_mode cmp_mode = GET_MODE (cmp_op0);
4340   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4341   int unspec_code;
4342   rtx unspec;
4343 
4344   switch (code)
4345     {
4346     case LEU:
4347     case GTU:
4348     case GEU:
4349     case LTU:
4350       unspec_code = UNSPEC_UNSIGNED_PCMP;
4351       break;
4352 
4353     default:
4354       unspec_code = UNSPEC_PCMP;
4355     }
4356 
4357   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4358 			   unspec_code);
4359   emit_insn (gen_rtx_SET (dest, unspec));
4360 
4361   return true;
4362 }
4363 
4364 /* Expand fp vector comparison.  */
4365 
4366 bool
4367 ix86_expand_fp_vec_cmp (rtx operands[])
4368 {
4369   enum rtx_code code = GET_CODE (operands[1]);
4370   rtx cmp;
4371 
4372   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4373 					   &operands[2], &operands[3]);
4374   if (code == UNKNOWN)
4375     {
4376       rtx temp;
4377       switch (GET_CODE (operands[1]))
4378 	{
4379 	case LTGT:
4380 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4381 				      operands[3], NULL, NULL);
4382 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4383 				     operands[3], NULL, NULL);
4384 	  code = AND;
4385 	  break;
4386 	case UNEQ:
4387 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4388 				      operands[3], NULL, NULL);
4389 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4390 				     operands[3], NULL, NULL);
4391 	  code = IOR;
4392 	  break;
4393 	default:
4394 	  gcc_unreachable ();
4395 	}
4396       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4397 				 OPTAB_DIRECT);
4398     }
4399   else
4400     cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4401 			       NULL, NULL);
4402 
4403   if (operands[0] != cmp)
4404     emit_move_insn (operands[0], cmp);
4405 
4406   return true;
4407 }
4408 
4409 static rtx
4410 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4411 			 rtx op_true, rtx op_false, bool *negate)
4412 {
4413   machine_mode data_mode = GET_MODE (dest);
4414   machine_mode mode = GET_MODE (cop0);
4415   rtx x;
4416 
4417   *negate = false;
4418 
4419   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
4420   if (TARGET_XOP
4421       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4422       && GET_MODE_SIZE (mode) <= 16)
4423     ;
4424   /* AVX512F supports all of the comparisons
4425      on all 128/256/512-bit vector int types.  */
4426   else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4427     ;
4428   else
4429     {
4430       /* Canonicalize the comparison to EQ, GT, GTU.  */
4431       switch (code)
4432 	{
4433 	case EQ:
4434 	case GT:
4435 	case GTU:
4436 	  break;
4437 
4438 	case NE:
4439 	case LE:
4440 	case LEU:
4441 	  code = reverse_condition (code);
4442 	  *negate = true;
4443 	  break;
4444 
4445 	case GE:
4446 	case GEU:
4447 	  code = reverse_condition (code);
4448 	  *negate = true;
4449 	  /* FALLTHRU */
4450 
4451 	case LT:
4452 	case LTU:
4453 	  std::swap (cop0, cop1);
4454 	  code = swap_condition (code);
4455 	  break;
4456 
4457 	default:
4458 	  gcc_unreachable ();
4459 	}
4460 
4461       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
4462       if (mode == V2DImode)
4463 	{
4464 	  switch (code)
4465 	    {
4466 	    case EQ:
4467 	      /* SSE4.1 supports EQ.  */
4468 	      if (!TARGET_SSE4_1)
4469 		return NULL;
4470 	      break;
4471 
4472 	    case GT:
4473 	    case GTU:
4474 	      /* SSE4.2 supports GT/GTU.  */
4475 	      if (!TARGET_SSE4_2)
4476 		return NULL;
4477 	      break;
4478 
4479 	    default:
4480 	      gcc_unreachable ();
4481 	    }
4482 	}
4483 
4484       rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4485       rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4486       if (*negate)
4487 	std::swap (optrue, opfalse);
4488 
4489       /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4490 	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4491 	 min (x, y) == x).  While we add one instruction (the minimum),
4492 	 we remove the need for two instructions in the negation, as the
4493 	 result is done this way.
4494 	 When using masks, do it for SI/DImode element types, as it is shorter
4495 	 than the two subtractions.  */
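      /* Worked example (added): for unsigned x <= y, umin (x, y) == x
	 already yields the desired 0/-1 mask, so a single min plus an
	 equality compare replaces the x > y compare followed by a
	 separate negation of the result.  */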
4496       if ((code != EQ
4497 	   && GET_MODE_SIZE (mode) != 64
4498 	   && vector_all_ones_operand (opfalse, data_mode)
4499 	   && optrue == CONST0_RTX (data_mode))
4500 	  || (code == GTU
4501 	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4502 	      /* Don't do it if not using integer masks and we'd end up with
4503 		 the right values in the registers though.  */
4504 	      && (GET_MODE_SIZE (mode) == 64
4505 		  || !vector_all_ones_operand (optrue, data_mode)
4506 		  || opfalse != CONST0_RTX (data_mode))))
4507 	{
4508 	  rtx (*gen) (rtx, rtx, rtx) = NULL;
4509 
4510 	  switch (mode)
4511 	    {
4512 	    case E_V16SImode:
4513 	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4514 	      break;
4515 	    case E_V8DImode:
4516 	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4517 	      cop0 = force_reg (mode, cop0);
4518 	      cop1 = force_reg (mode, cop1);
4519 	      break;
4520 	    case E_V32QImode:
4521 	      if (TARGET_AVX2)
4522 		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4523 	      break;
4524 	    case E_V16HImode:
4525 	      if (TARGET_AVX2)
4526 		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4527 	      break;
4528 	    case E_V8SImode:
4529 	      if (TARGET_AVX2)
4530 		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4531 	      break;
4532 	    case E_V4DImode:
4533 	      if (TARGET_AVX512VL)
4534 		{
4535 		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4536 		  cop0 = force_reg (mode, cop0);
4537 		  cop1 = force_reg (mode, cop1);
4538 		}
4539 	      break;
4540 	    case E_V16QImode:
4541 	      if (code == GTU && TARGET_SSE2)
4542 		gen = gen_uminv16qi3;
4543 	      else if (code == GT && TARGET_SSE4_1)
4544 		gen = gen_sminv16qi3;
4545 	      break;
4546 	    case E_V8QImode:
4547 	      if (code == GTU && TARGET_SSE2)
4548 		gen = gen_uminv8qi3;
4549 	      else if (code == GT && TARGET_SSE4_1)
4550 		gen = gen_sminv8qi3;
4551 	      break;
4552 	    case E_V4QImode:
4553 	      if (code == GTU && TARGET_SSE2)
4554 		gen = gen_uminv4qi3;
4555 	      else if (code == GT && TARGET_SSE4_1)
4556 		gen = gen_sminv4qi3;
4557 	      break;
4558 	    case E_V2QImode:
4559 	      if (code == GTU && TARGET_SSE2)
4560 		gen = gen_uminv2qi3;
4561 	      else if (code == GT && TARGET_SSE4_1)
4562 		gen = gen_sminv2qi3;
4563 	      break;
4564 	    case E_V8HImode:
4565 	      if (code == GTU && TARGET_SSE4_1)
4566 		gen = gen_uminv8hi3;
4567 	      else if (code == GT && TARGET_SSE2)
4568 		gen = gen_sminv8hi3;
4569 	      break;
4570 	    case E_V4HImode:
4571 	      if (code == GTU && TARGET_SSE4_1)
4572 		gen = gen_uminv4hi3;
4573 	      else if (code == GT && TARGET_SSE2)
4574 		gen = gen_sminv4hi3;
4575 	      break;
4576 	    case E_V2HImode:
4577 	      if (code == GTU && TARGET_SSE4_1)
4578 		gen = gen_uminv2hi3;
4579 	      else if (code == GT && TARGET_SSE2)
4580 		gen = gen_sminv2hi3;
4581 	      break;
4582 	    case E_V4SImode:
4583 	      if (TARGET_SSE4_1)
4584 		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4585 	      break;
4586 	    case E_V2SImode:
4587 	      if (TARGET_SSE4_1)
4588 		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4589 	      break;
4590 	    case E_V2DImode:
4591 	      if (TARGET_AVX512VL)
4592 		{
4593 		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4594 		  cop0 = force_reg (mode, cop0);
4595 		  cop1 = force_reg (mode, cop1);
4596 		}
4597 	      break;
4598 	    default:
4599 	      break;
4600 	    }
4601 
4602 	  if (gen)
4603 	    {
4604 	      rtx tem = gen_reg_rtx (mode);
4605 	      if (!vector_operand (cop0, mode))
4606 		cop0 = force_reg (mode, cop0);
4607 	      if (!vector_operand (cop1, mode))
4608 		cop1 = force_reg (mode, cop1);
4609 	      *negate = !*negate;
4610 	      emit_insn (gen (tem, cop0, cop1));
4611 	      cop1 = tem;
4612 	      code = EQ;
4613 	    }
4614 	}
4615 
4616       /* Unsigned parallel compare is not supported by the hardware.
4617 	 Play some tricks to turn this into a signed comparison
4618 	 against 0.  */
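      /* Added illustration: for SI/DI elements the code below subtracts
	 the sign-bit constant (e.g. 0x80000000 for SImode) from both
	 operands, which turns the unsigned GTU into a signed GT; for
	 QI/HI elements it instead uses unsigned saturating subtraction,
	 since x >u y exactly when (x -us y) != 0.  */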
4619       if (code == GTU)
4620 	{
4621 	  cop0 = force_reg (mode, cop0);
4622 
4623 	  switch (mode)
4624 	    {
4625 	    case E_V16SImode:
4626 	    case E_V8DImode:
4627 	    case E_V8SImode:
4628 	    case E_V4DImode:
4629 	    case E_V4SImode:
4630 	    case E_V2SImode:
4631 	    case E_V2DImode:
4632 		{
4633 		  rtx t1, t2, mask;
4634 
4635 		  /* Subtract (-(INT MAX) - 1) from both operands to make
4636 		     them signed.  */
4637 		  mask = ix86_build_signbit_mask (mode, true, false);
4638 		  t1 = gen_reg_rtx (mode);
4639 		  emit_insn (gen_sub3_insn (t1, cop0, mask));
4640 
4641 		  t2 = gen_reg_rtx (mode);
4642 		  emit_insn (gen_sub3_insn (t2, cop1, mask));
4643 
4644 		  cop0 = t1;
4645 		  cop1 = t2;
4646 		  code = GT;
4647 		}
4648 	      break;
4649 
4650 	    case E_V64QImode:
4651 	    case E_V32HImode:
4652 	    case E_V32QImode:
4653 	    case E_V16HImode:
4654 	    case E_V16QImode:
4655 	    case E_V8QImode:
4656 	    case E_V4QImode:
4657 	    case E_V2QImode:
4658 	    case E_V8HImode:
4659 	    case E_V4HImode:
4660 	    case E_V2HImode:
4661 	      /* Perform a parallel unsigned saturating subtraction.  */
4662 	      x = gen_reg_rtx (mode);
4663 	      emit_insn (gen_rtx_SET
4664 			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4665 	      cop0 = x;
4666 	      cop1 = CONST0_RTX (mode);
4667 	      code = EQ;
4668 	      *negate = !*negate;
4669 	      break;
4670 
4671 	    default:
4672 	      gcc_unreachable ();
4673 	    }
4674 	}
4675     }
4676 
4677   if (*negate)
4678     std::swap (op_true, op_false);
4679 
4680   /* Allow the comparison to be done in one mode, but the movcc to
4681      happen in another mode.  */
4682   if (data_mode == mode)
4683     {
4684       x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4685 			       op_true, op_false);
4686     }
4687   else
4688     {
4689       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4690       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4691 			       op_true, op_false);
4692       if (GET_MODE (x) == mode)
4693 	x = gen_lowpart (data_mode, x);
4694     }
4695 
4696   return x;
4697 }
4698 
4699 /* Expand integer vector comparison.  */
4700 
4701 bool
4702 ix86_expand_int_vec_cmp (rtx operands[])
4703 {
4704   rtx_code code = GET_CODE (operands[1]);
4705   bool negate = false;
4706   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4707 				     operands[3], NULL, NULL, &negate);
4708 
4709   if (!cmp)
4710     return false;
4711 
4712   if (negate)
4713     cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4714 				   CONST0_RTX (GET_MODE (cmp)),
4715 				   NULL, NULL, &negate);
4716 
4717   gcc_assert (!negate);
4718 
4719   if (operands[0] != cmp)
4720     emit_move_insn (operands[0], cmp);
4721 
4722   return true;
4723 }
4724 
4725 /* Expand a floating-point vector conditional move; a vcond operation
4726    rather than a movcc operation.  */
4727 
4728 bool
4729 ix86_expand_fp_vcond (rtx operands[])
4730 {
4731   enum rtx_code code = GET_CODE (operands[3]);
4732   rtx cmp;
4733 
4734   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4735 					   &operands[4], &operands[5]);
4736   if (code == UNKNOWN)
4737     {
4738       rtx temp;
4739       switch (GET_CODE (operands[3]))
4740 	{
4741 	case LTGT:
4742 	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4743 				      operands[5], operands[0], operands[0]);
4744 	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4745 				     operands[5], operands[1], operands[2]);
4746 	  code = AND;
4747 	  break;
4748 	case UNEQ:
4749 	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4750 				      operands[5], operands[0], operands[0]);
4751 	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4752 				     operands[5], operands[1], operands[2]);
4753 	  code = IOR;
4754 	  break;
4755 	default:
4756 	  gcc_unreachable ();
4757 	}
4758       cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4759 				 OPTAB_DIRECT);
4760       ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4761       return true;
4762     }
4763 
4764   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4765 				 operands[5], operands[1], operands[2]))
4766     return true;
4767 
4768   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4769 			     operands[1], operands[2]);
4770   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4771   return true;
4772 }
4773 
4774 /* Expand a signed/unsigned integral vector conditional move.  */
4775 
4776 bool
4777 ix86_expand_int_vcond (rtx operands[])
4778 {
4779   machine_mode data_mode = GET_MODE (operands[0]);
4780   machine_mode mode = GET_MODE (operands[4]);
4781   enum rtx_code code = GET_CODE (operands[3]);
4782   bool negate = false;
4783   rtx x, cop0, cop1;
4784 
4785   cop0 = operands[4];
4786   cop1 = operands[5];
4787 
4788   /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4789      and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
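  /* Added example: for V4SImode this becomes a single psrad $31 for the
     -1/0 form or psrld $31 for the 1/0 form.  The shift count below is
     the element bit width minus one; DImode elements are excluded from
     the arithmetic-shift form because there is no vector psraq before
     AVX-512.  */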
4790   if ((code == LT || code == GE)
4791       && data_mode == mode
4792       && cop1 == CONST0_RTX (mode)
4793       && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4794       && GET_MODE_UNIT_SIZE (data_mode) > 1
4795       && GET_MODE_UNIT_SIZE (data_mode) <= 8
4796       && (GET_MODE_SIZE (data_mode) == 16
4797 	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4798     {
4799       rtx negop = operands[2 - (code == LT)];
4800       int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4801       if (negop == CONST1_RTX (data_mode))
4802 	{
4803 	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4804 					 operands[0], 1, OPTAB_DIRECT);
4805 	  if (res != operands[0])
4806 	    emit_move_insn (operands[0], res);
4807 	  return true;
4808 	}
4809       else if (GET_MODE_INNER (data_mode) != DImode
4810 	       && vector_all_ones_operand (negop, data_mode))
4811 	{
4812 	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4813 					 operands[0], 0, OPTAB_DIRECT);
4814 	  if (res != operands[0])
4815 	    emit_move_insn (operands[0], res);
4816 	  return true;
4817 	}
4818     }
4819 
4820   if (!nonimmediate_operand (cop1, mode))
4821     cop1 = force_reg (mode, cop1);
4822   if (!general_operand (operands[1], data_mode))
4823     operands[1] = force_reg (data_mode, operands[1]);
4824   if (!general_operand (operands[2], data_mode))
4825     operands[2] = force_reg (data_mode, operands[2]);
4826 
4827   x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4828 			       operands[1], operands[2], &negate);
4829 
4830   if (!x)
4831     return false;
4832 
4833   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4834 			 operands[2-negate]);
4835   return true;
4836 }
4837 
4838 static bool
4839 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4840 			      struct expand_vec_perm_d *d)
4841 {
4842   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4843      expanders, so args are either in d, or in op0, op1 etc.  */
4844   machine_mode mode = GET_MODE (d ? d->op0 : op0);
4845   machine_mode maskmode = mode;
4846   rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4847 
4848   switch (mode)
4849     {
4850     case E_V16QImode:
4851       if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4852 	gen = gen_avx512vl_vpermt2varv16qi3;
4853       break;
4854     case E_V32QImode:
4855       if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4856 	gen = gen_avx512vl_vpermt2varv32qi3;
4857       break;
4858     case E_V64QImode:
4859       if (TARGET_AVX512VBMI)
4860 	gen = gen_avx512bw_vpermt2varv64qi3;
4861       break;
4862     case E_V8HImode:
4863       if (TARGET_AVX512VL && TARGET_AVX512BW)
4864 	gen = gen_avx512vl_vpermt2varv8hi3;
4865       break;
4866     case E_V16HImode:
4867       if (TARGET_AVX512VL && TARGET_AVX512BW)
4868 	gen = gen_avx512vl_vpermt2varv16hi3;
4869       break;
4870     case E_V32HImode:
4871       if (TARGET_AVX512BW)
4872 	gen = gen_avx512bw_vpermt2varv32hi3;
4873       break;
4874     case E_V4SImode:
4875       if (TARGET_AVX512VL)
4876 	gen = gen_avx512vl_vpermt2varv4si3;
4877       break;
4878     case E_V8SImode:
4879       if (TARGET_AVX512VL)
4880 	gen = gen_avx512vl_vpermt2varv8si3;
4881       break;
4882     case E_V16SImode:
4883       if (TARGET_AVX512F)
4884 	gen = gen_avx512f_vpermt2varv16si3;
4885       break;
4886     case E_V4SFmode:
4887       if (TARGET_AVX512VL)
4888 	{
4889 	  gen = gen_avx512vl_vpermt2varv4sf3;
4890 	  maskmode = V4SImode;
4891 	}
4892       break;
4893     case E_V8SFmode:
4894       if (TARGET_AVX512VL)
4895 	{
4896 	  gen = gen_avx512vl_vpermt2varv8sf3;
4897 	  maskmode = V8SImode;
4898 	}
4899       break;
4900     case E_V16SFmode:
4901       if (TARGET_AVX512F)
4902 	{
4903 	  gen = gen_avx512f_vpermt2varv16sf3;
4904 	  maskmode = V16SImode;
4905 	}
4906       break;
4907     case E_V2DImode:
4908       if (TARGET_AVX512VL)
4909 	gen = gen_avx512vl_vpermt2varv2di3;
4910       break;
4911     case E_V4DImode:
4912       if (TARGET_AVX512VL)
4913 	gen = gen_avx512vl_vpermt2varv4di3;
4914       break;
4915     case E_V8DImode:
4916       if (TARGET_AVX512F)
4917 	gen = gen_avx512f_vpermt2varv8di3;
4918       break;
4919     case E_V2DFmode:
4920       if (TARGET_AVX512VL)
4921 	{
4922 	  gen = gen_avx512vl_vpermt2varv2df3;
4923 	  maskmode = V2DImode;
4924 	}
4925       break;
4926     case E_V4DFmode:
4927       if (TARGET_AVX512VL)
4928 	{
4929 	  gen = gen_avx512vl_vpermt2varv4df3;
4930 	  maskmode = V4DImode;
4931 	}
4932       break;
4933     case E_V8DFmode:
4934       if (TARGET_AVX512F)
4935 	{
4936 	  gen = gen_avx512f_vpermt2varv8df3;
4937 	  maskmode = V8DImode;
4938 	}
4939       break;
4940     default:
4941       break;
4942     }
4943 
4944   if (gen == NULL)
4945     return false;
4946 
4947   /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4948      expanders, so args are either in d, or in op0, op1 etc.  */
4949   if (d)
4950     {
4951       rtx vec[64];
4952       target = d->target;
4953       op0 = d->op0;
4954       op1 = d->op1;
4955       for (int i = 0; i < d->nelt; ++i)
4956 	vec[i] = GEN_INT (d->perm[i]);
4957       mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4958     }
4959 
4960   emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4961   return true;
4962 }
4963 
4964 /* Expand a variable vector permutation.  */
4965 
4966 void
4967 ix86_expand_vec_perm (rtx operands[])
4968 {
4969   rtx target = operands[0];
4970   rtx op0 = operands[1];
4971   rtx op1 = operands[2];
4972   rtx mask = operands[3];
4973   rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4974   machine_mode mode = GET_MODE (op0);
4975   machine_mode maskmode = GET_MODE (mask);
4976   int w, e, i;
4977   bool one_operand_shuffle = rtx_equal_p (op0, op1);
4978 
4979   /* Number of elements in the vector.  */
4980   w = GET_MODE_NUNITS (mode);
4981   e = GET_MODE_UNIT_SIZE (mode);
4982   gcc_assert (w <= 64);
4983 
4984   /* For HF mode vector, convert it to HI using subreg.  */
4985   if (GET_MODE_INNER (mode) == HFmode)
4986     {
4987       machine_mode orig_mode = mode;
4988       mode = mode_for_vector (HImode, w).require ();
4989       target = lowpart_subreg (mode, target, orig_mode);
4990       op0 = lowpart_subreg (mode, op0, orig_mode);
4991       op1 = lowpart_subreg (mode, op1, orig_mode);
4992     }
4993 
4994   if (TARGET_AVX512F && one_operand_shuffle)
4995     {
4996       rtx (*gen) (rtx, rtx, rtx) = NULL;
4997       switch (mode)
4998 	{
4999 	case E_V16SImode:
5000 	  gen = gen_avx512f_permvarv16si;
5001 	  break;
5002 	case E_V16SFmode:
5003 	  gen = gen_avx512f_permvarv16sf;
5004 	  break;
5005 	case E_V8DImode:
5006 	  gen = gen_avx512f_permvarv8di;
5007 	  break;
5008 	case E_V8DFmode:
5009 	  gen = gen_avx512f_permvarv8df;
5010 	  break;
5011 	default:
5012 	  break;
5013 	}
5014       if (gen != NULL)
5015 	{
5016 	  emit_insn (gen (target, op0, mask));
5017 	  return;
5018 	}
5019     }
5020 
5021   if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5022     return;
5023 
5024   if (TARGET_AVX2)
5025     {
5026       if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5027 	{
5028 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5029 	     a constant shuffle operand.  With a tiny bit of effort we can
5030 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
5031 	     unfortunate but there's no avoiding it.
5032 	     Similarly for V16HImode we don't have instructions for variable
5033 	     shuffling, while for V32QImode we can, after preparing suitable
5034 	     masks, use vpshufb; vpshufb; vpermq; vpor.  */
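	  /* Added sketch of the index rewrite done below (using the
	     V4DImode case): a mask { A B C D } is widened to
	     { A A B B C C D D }, doubled, and then 1 is added to the odd
	     copies, giving the V8SImode mask { 2A 2A+1 2B 2B+1 ... } that
	     picks the same 64-bit elements as pairs of 32-bit halves.  */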
5035 
5036 	  if (mode == V16HImode)
5037 	    {
5038 	      maskmode = mode = V32QImode;
5039 	      w = 32;
5040 	      e = 1;
5041 	    }
5042 	  else
5043 	    {
5044 	      maskmode = mode = V8SImode;
5045 	      w = 8;
5046 	      e = 4;
5047 	    }
5048 	  t1 = gen_reg_rtx (maskmode);
5049 
5050 	  /* Replicate the low bits of the V4DImode mask into V8SImode:
5051 	       mask = { A B C D }
5052 	       t1 = { A A B B C C D D }.  */
5053 	  for (i = 0; i < w / 2; ++i)
5054 	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5055 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5056 	  vt = force_reg (maskmode, vt);
5057 	  mask = gen_lowpart (maskmode, mask);
5058 	  if (maskmode == V8SImode)
5059 	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5060 	  else
5061 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5062 
5063 	  /* Multiply the shuffle indices by two.  */
5064 	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5065 				    OPTAB_DIRECT);
5066 
5067 	  /* Add one to the odd shuffle indices:
5068 		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
5069 	  for (i = 0; i < w / 2; ++i)
5070 	    {
5071 	      vec[i * 2] = const0_rtx;
5072 	      vec[i * 2 + 1] = const1_rtx;
5073 	    }
5074 	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5075 	  vt = validize_mem (force_const_mem (maskmode, vt));
5076 	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5077 				    OPTAB_DIRECT);
5078 
5079 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
5080 	  operands[3] = mask = t1;
5081 	  target = gen_reg_rtx (mode);
5082 	  op0 = gen_lowpart (mode, op0);
5083 	  op1 = gen_lowpart (mode, op1);
5084 	}
5085 
5086       switch (mode)
5087 	{
5088 	case E_V8SImode:
5089 	  /* The VPERMD and VPERMPS instructions already properly ignore
5090 	     the high bits of the shuffle elements.  No need for us to
5091 	     perform an AND ourselves.  */
5092 	  if (one_operand_shuffle)
5093 	    {
5094 	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5095 	      if (target != operands[0])
5096 		emit_move_insn (operands[0],
5097 				gen_lowpart (GET_MODE (operands[0]), target));
5098 	    }
5099 	  else
5100 	    {
5101 	      t1 = gen_reg_rtx (V8SImode);
5102 	      t2 = gen_reg_rtx (V8SImode);
5103 	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5104 	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5105 	      goto merge_two;
5106 	    }
5107 	  return;
5108 
5109 	case E_V8SFmode:
5110 	  mask = gen_lowpart (V8SImode, mask);
5111 	  if (one_operand_shuffle)
5112 	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5113 	  else
5114 	    {
5115 	      t1 = gen_reg_rtx (V8SFmode);
5116 	      t2 = gen_reg_rtx (V8SFmode);
5117 	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5118 	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5119 	      goto merge_two;
5120 	    }
5121 	  return;
5122 
5123         case E_V4SImode:
5124 	  /* By combining the two 128-bit input vectors into one 256-bit
5125 	     input vector, we can use VPERMD and VPERMPS for the full
5126 	     two-operand shuffle.  */
5127 	  t1 = gen_reg_rtx (V8SImode);
5128 	  t2 = gen_reg_rtx (V8SImode);
5129 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5130 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5131 	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5132 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5133 	  return;
5134 
5135         case E_V4SFmode:
5136 	  t1 = gen_reg_rtx (V8SFmode);
5137 	  t2 = gen_reg_rtx (V8SImode);
5138 	  mask = gen_lowpart (V4SImode, mask);
5139 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5140 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5141 	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5142 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5143 	  return;
5144 
5145 	case E_V32QImode:
5146 	  t1 = gen_reg_rtx (V32QImode);
5147 	  t2 = gen_reg_rtx (V32QImode);
5148 	  t3 = gen_reg_rtx (V32QImode);
5149 	  vt2 = GEN_INT (-128);
5150 	  vt = gen_const_vec_duplicate (V32QImode, vt2);
5151 	  vt = force_reg (V32QImode, vt);
5152 	  for (i = 0; i < 32; i++)
5153 	    vec[i] = i < 16 ? vt2 : const0_rtx;
5154 	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5155 	  vt2 = force_reg (V32QImode, vt2);
5156 	  /* From mask create two adjusted masks, which contain the same
5157 	     bits as mask in the low 7 bits of each vector element.
5158 	     The first mask will have the most significant bit clear
5159 	     if it requests element from the same 128-bit lane
5160 	     and MSB set if it requests element from the other 128-bit lane.
5161 	     The second mask will have the opposite values of the MSB,
5162 	     and additionally will have its 128-bit lanes swapped.
5163 	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5164 	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
5165 	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5166 	     stands for other 12 bytes.  */
5167 	  /* The bit whether element is from the same lane or the other
5168 	     lane is bit 4, so shift it up by 3 to the MSB position.  */
5169 	  t5 = gen_reg_rtx (V4DImode);
5170 	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5171 				    GEN_INT (3)));
5172 	  /* Clear MSB bits from the mask just in case it had them set.  */
5173 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5174 	  /* After this t1 will have MSB set for elements from other lane.  */
5175 	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5176 	  /* Clear bits other than MSB.  */
5177 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
5178 	  /* Or in the lower bits from mask into t3.  */
5179 	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
5180 	  /* And invert MSB bits in t1, so MSB is set for elements from the same
5181 	     lane.  */
5182 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
5183 	  /* Swap 128-bit lanes in t3.  */
5184 	  t6 = gen_reg_rtx (V4DImode);
5185 	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5186 					  const2_rtx, GEN_INT (3),
5187 					  const0_rtx, const1_rtx));
5188 	  /* And or in the lower bits from mask into t1.  */
5189 	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
5190 	  if (one_operand_shuffle)
5191 	    {
5192 	      /* Each of these shuffles will put 0s in places where
5193 		 element from the other 128-bit lane is needed, otherwise
5194 		 will shuffle in the requested value.  */
5195 	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5196 						gen_lowpart (V32QImode, t6)));
5197 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5198 	      /* For t3 the 128-bit lanes are swapped again.  */
5199 	      t7 = gen_reg_rtx (V4DImode);
5200 	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5201 					      const2_rtx, GEN_INT (3),
5202 					      const0_rtx, const1_rtx));
5203 	      /* And oring both together leads to the result.  */
5204 	      emit_insn (gen_iorv32qi3 (target, t1,
5205 					gen_lowpart (V32QImode, t7)));
5206 	      if (target != operands[0])
5207 		emit_move_insn (operands[0],
5208 				gen_lowpart (GET_MODE (operands[0]), target));
5209 	      return;
5210 	    }
5211 
5212 	  t4 = gen_reg_rtx (V32QImode);
5213 	  /* Similar to the one_operand_shuffle code above, just repeated
5214 	     twice, once for each operand.  The merge_two: code below
5215 	     merges the two results together.  */
5216 	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5217 					    gen_lowpart (V32QImode, t6)));
5218 	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5219 					    gen_lowpart (V32QImode, t6)));
5220 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5221 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5222 	  t7 = gen_reg_rtx (V4DImode);
5223 	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5224 					  const2_rtx, GEN_INT (3),
5225 					  const0_rtx, const1_rtx));
5226 	  t8 = gen_reg_rtx (V4DImode);
5227 	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5228 					  const2_rtx, GEN_INT (3),
5229 					  const0_rtx, const1_rtx));
5230 	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5231 	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5232 	  t1 = t4;
5233 	  t2 = t3;
5234 	  goto merge_two;
5235 
5236 	default:
5237 	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
5238 	  break;
5239 	}
5240     }
5241 
5242   if (TARGET_XOP)
5243     {
5244       /* The XOP VPPERM insn supports three inputs.  By ignoring the
5245 	 one_operand_shuffle special case, we avoid creating another
5246 	 set of constant vectors in memory.  */
5247       one_operand_shuffle = false;
5248 
5249       /* mask = mask & {2*w-1, ...} */
5250       vt = GEN_INT (2*w - 1);
5251     }
5252   else
5253     {
5254       /* mask = mask & {w-1, ...} */
5255       vt = GEN_INT (w - 1);
5256     }
5257 
5258   vt = gen_const_vec_duplicate (maskmode, vt);
5259   mask = expand_simple_binop (maskmode, AND, mask, vt,
5260 			      NULL_RTX, 0, OPTAB_DIRECT);
5261 
5262   /* For non-QImode operations, convert the word permutation control
5263      into a byte permutation control.  */
5264   if (mode != V16QImode)
5265     {
5266       mask = expand_simple_binop (maskmode, ASHIFT, mask,
5267 				  GEN_INT (exact_log2 (e)),
5268 				  NULL_RTX, 0, OPTAB_DIRECT);
5269 
5270       /* Convert mask to vector of chars.  */
5271       mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5272 
5273       /* Replicate each of the input bytes into byte positions:
5274 	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5275 	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5276 	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
5277       for (i = 0; i < 16; ++i)
5278 	vec[i] = GEN_INT (i/e * e);
5279       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5280       vt = validize_mem (force_const_mem (V16QImode, vt));
5281       if (TARGET_XOP)
5282 	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5283       else
5284 	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5285 
5286       /* Convert it into the byte positions by doing
5287 	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
5288       for (i = 0; i < 16; ++i)
5289 	vec[i] = GEN_INT (i % e);
5290       vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5291       vt = validize_mem (force_const_mem (V16QImode, vt));
5292       emit_insn (gen_addv16qi3 (mask, mask, vt));
5293     }
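
  /* Illustrative sketch (editorial addition, not in the original source):
     for e == 4 the three steps above amount to the scalar model below,
     where word_ctrl[] and byte_ctrl[] are hypothetical arrays holding the
     masked word indices and the resulting V16QImode control bytes:

	for (int i = 0; i < 16; i++)
	  byte_ctrl[i] = word_ctrl[i / 4] * 4 + i % 4;

     i.e. each word index K becomes the byte indices 4K .. 4K+3.  */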
5294 
5295   /* The actual shuffle operations all operate on V16QImode.  */
5296   op0 = gen_lowpart (V16QImode, op0);
5297   op1 = gen_lowpart (V16QImode, op1);
5298 
5299   if (TARGET_XOP)
5300     {
5301       if (GET_MODE (target) != V16QImode)
5302 	target = gen_reg_rtx (V16QImode);
5303       emit_insn (gen_xop_pperm (target, op0, op1, mask));
5304       if (target != operands[0])
5305 	emit_move_insn (operands[0],
5306 			gen_lowpart (GET_MODE (operands[0]), target));
5307     }
5308   else if (one_operand_shuffle)
5309     {
5310       if (GET_MODE (target) != V16QImode)
5311 	target = gen_reg_rtx (V16QImode);
5312       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5313       if (target != operands[0])
5314 	emit_move_insn (operands[0],
5315 			gen_lowpart (GET_MODE (operands[0]), target));
5316     }
5317   else
5318     {
5319       rtx xops[6];
5320       bool ok;
5321 
5322       /* Shuffle the two input vectors independently.  */
5323       t1 = gen_reg_rtx (V16QImode);
5324       t2 = gen_reg_rtx (V16QImode);
5325       emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5326       emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5327 
5328  merge_two:
5329       /* Then merge them together.  The key is whether any given control
5330          element contained a bit set that indicates the second word.  */
5331       mask = operands[3];
5332       vt = GEN_INT (w);
5333       if (maskmode == V2DImode && !TARGET_SSE4_1)
5334 	{
5335 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
5336 	     more shuffle to convert the V2DI input mask into a V4SI
5337 	     input mask, at which point the masking that expand_int_vcond
5338 	     performs will work as desired.  */
5339 	  rtx t3 = gen_reg_rtx (V4SImode);
5340 	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5341 				        const0_rtx, const0_rtx,
5342 				        const2_rtx, const2_rtx));
5343 	  mask = t3;
5344 	  maskmode = V4SImode;
5345 	  e = w = 4;
5346 	}
5347 
5348       vt = gen_const_vec_duplicate (maskmode, vt);
5349       vt = force_reg (maskmode, vt);
5350       mask = expand_simple_binop (maskmode, AND, mask, vt,
5351 				  NULL_RTX, 0, OPTAB_DIRECT);
5352 
5353       if (GET_MODE (target) != mode)
5354 	target = gen_reg_rtx (mode);
5355       xops[0] = target;
5356       xops[1] = gen_lowpart (mode, t2);
5357       xops[2] = gen_lowpart (mode, t1);
5358       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5359       xops[4] = mask;
5360       xops[5] = vt;
5361       ok = ix86_expand_int_vcond (xops);
5362       gcc_assert (ok);
5363       if (target != operands[0])
5364 	emit_move_insn (operands[0],
5365 			gen_lowpart (GET_MODE (operands[0]), target));
5366     }
5367 }
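
/* Illustrative sketch (editorial addition, not in the original source):
   element-wise, the merge_two: logic above performs the hypothetical
   scalar selection

     merged[i] = (ctrl[i] & w) ? shuffled_op1[i] : shuffled_op0[i];

   where ctrl[] is the original permutation control, w is the number of
   elements in one input, and shuffled_op0/shuffled_op1 are the results
   of shuffling each input by the masked control.  */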
5368 
5369 /* Unpack SRC into DEST, the next wider integer vector type.  UNSIGNED_P is
5370    true if we should do zero extension, else sign extension.  HIGH_P is
5371    true if we want the N/2 high elements, else the low elements.  */
5372 
5373 void
5374 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5375 {
5376   machine_mode imode = GET_MODE (src);
5377   rtx tmp;
5378 
5379   if (TARGET_SSE4_1)
5380     {
5381       rtx (*unpack)(rtx, rtx);
5382       rtx (*extract)(rtx, rtx) = NULL;
5383       machine_mode halfmode = BLKmode;
5384 
5385       switch (imode)
5386 	{
5387 	case E_V64QImode:
5388 	  if (unsigned_p)
5389 	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5390 	  else
5391 	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5392 	  halfmode = V32QImode;
5393 	  extract
5394 	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5395 	  break;
5396 	case E_V32QImode:
5397 	  if (unsigned_p)
5398 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
5399 	  else
5400 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
5401 	  halfmode = V16QImode;
5402 	  extract
5403 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5404 	  break;
5405 	case E_V32HImode:
5406 	  if (unsigned_p)
5407 	    unpack = gen_avx512f_zero_extendv16hiv16si2;
5408 	  else
5409 	    unpack = gen_avx512f_sign_extendv16hiv16si2;
5410 	  halfmode = V16HImode;
5411 	  extract
5412 	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5413 	  break;
5414 	case E_V16HImode:
5415 	  if (unsigned_p)
5416 	    unpack = gen_avx2_zero_extendv8hiv8si2;
5417 	  else
5418 	    unpack = gen_avx2_sign_extendv8hiv8si2;
5419 	  halfmode = V8HImode;
5420 	  extract
5421 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5422 	  break;
5423 	case E_V16SImode:
5424 	  if (unsigned_p)
5425 	    unpack = gen_avx512f_zero_extendv8siv8di2;
5426 	  else
5427 	    unpack = gen_avx512f_sign_extendv8siv8di2;
5428 	  halfmode = V8SImode;
5429 	  extract
5430 	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5431 	  break;
5432 	case E_V8SImode:
5433 	  if (unsigned_p)
5434 	    unpack = gen_avx2_zero_extendv4siv4di2;
5435 	  else
5436 	    unpack = gen_avx2_sign_extendv4siv4di2;
5437 	  halfmode = V4SImode;
5438 	  extract
5439 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5440 	  break;
5441 	case E_V16QImode:
5442 	  if (unsigned_p)
5443 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5444 	  else
5445 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5446 	  break;
5447 	case E_V8HImode:
5448 	  if (unsigned_p)
5449 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
5450 	  else
5451 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
5452 	  break;
5453 	case E_V4SImode:
5454 	  if (unsigned_p)
5455 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
5456 	  else
5457 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
5458 	  break;
5459 	case E_V8QImode:
5460 	  if (unsigned_p)
5461 	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5462 	  else
5463 	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5464 	  break;
5465 	case E_V4HImode:
5466 	  if (unsigned_p)
5467 	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
5468 	  else
5469 	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
5470 	  break;
5471 	case E_V4QImode:
5472 	  if (unsigned_p)
5473 	    unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5474 	  else
5475 	    unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5476 	  break;
5477 	default:
5478 	  gcc_unreachable ();
5479 	}
5480 
5481       if (GET_MODE_SIZE (imode) >= 32)
5482 	{
5483 	  tmp = gen_reg_rtx (halfmode);
5484 	  emit_insn (extract (tmp, src));
5485 	}
5486       else if (high_p)
5487 	{
5488 	  switch (GET_MODE_SIZE (imode))
5489 	    {
5490 	    case 16:
5491 	      /* Shift higher 8 bytes to lower 8 bytes.  */
5492 	      tmp = gen_reg_rtx (V1TImode);
5493 	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5494 					     GEN_INT (64)));
5495 	      break;
5496 	    case 8:
5497 	      /* Shift higher 4 bytes to lower 4 bytes.  */
5498 	      tmp = gen_reg_rtx (V1DImode);
5499 	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5500 					    GEN_INT (32)));
5501 	      break;
5502 	    case 4:
5503 	      /* Shift higher 2 bytes to lower 2 bytes.  */
5504 	      tmp = gen_reg_rtx (V1SImode);
5505 	      emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5506 					    GEN_INT (16)));
5507 	      break;
5508 	    default:
5509 	      gcc_unreachable ();
5510 	    }
5511 
5512 	  tmp = gen_lowpart (imode, tmp);
5513 	}
5514       else
5515 	tmp = src;
5516 
5517       emit_insn (unpack (dest, tmp));
5518     }
5519   else
5520     {
5521       rtx (*unpack)(rtx, rtx, rtx);
5522 
5523       switch (imode)
5524 	{
5525 	case E_V16QImode:
5526 	  if (high_p)
5527 	    unpack = gen_vec_interleave_highv16qi;
5528 	  else
5529 	    unpack = gen_vec_interleave_lowv16qi;
5530 	  break;
5531 	case E_V8HImode:
5532 	  if (high_p)
5533 	    unpack = gen_vec_interleave_highv8hi;
5534 	  else
5535 	    unpack = gen_vec_interleave_lowv8hi;
5536 	  break;
5537 	case E_V4SImode:
5538 	  if (high_p)
5539 	    unpack = gen_vec_interleave_highv4si;
5540 	  else
5541 	    unpack = gen_vec_interleave_lowv4si;
5542 	  break;
5543 	case E_V8QImode:
5544 	  if (high_p)
5545 	    unpack = gen_mmx_punpckhbw;
5546 	  else
5547 	    unpack = gen_mmx_punpcklbw;
5548 	  break;
5549 	case E_V4HImode:
5550 	  if (high_p)
5551 	    unpack = gen_mmx_punpckhwd;
5552 	  else
5553 	    unpack = gen_mmx_punpcklwd;
5554 	  break;
5555 	case E_V4QImode:
5556 	  if (high_p)
5557 	    unpack = gen_mmx_punpckhbw_low;
5558 	  else
5559 	    unpack = gen_mmx_punpcklbw_low;
5560 	  break;
5561 	default:
5562 	  gcc_unreachable ();
5563 	}
5564 
5565       if (unsigned_p)
5566 	tmp = force_reg (imode, CONST0_RTX (imode));
5567       else
5568 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5569 				   src, pc_rtx, pc_rtx);
5570 
5571       rtx tmp2 = gen_reg_rtx (imode);
5572       emit_insn (unpack (tmp2, src, tmp));
5573       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5574     }
5575 }
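
/* Illustrative sketch (editorial addition, not in the original source):
   the pre-SSE4.1 path above extends each element by interleaving it with
   either zero or its sign mask (the GT comparison against zero).  For an
   8-bit element X widened to 16 bits this is, in scalar form with the
   low byte first,

     uint16_t wide = (uint8_t) X | ((unsigned_p ? 0 : (X < 0 ? 0xFF : 0)) << 8);

   which equals (uint8_t) X for zero extension and (int16_t)(int8_t) X
   reinterpreted as uint16_t for sign extension.  */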
5576 
5577 /* Return true if MEM is a constant-pool reference that contains a
5578    const_vector permutation index; assign the index to PERM.  */
5579 bool
5580 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5581 {
5582   machine_mode mode = GET_MODE (mem);
5583   int nelt = GET_MODE_NUNITS (mode);
5584 
5585   if (!INTEGRAL_MODE_P (mode))
5586     return false;
5587 
5588   /* Needs to be a constant pool reference.  */
5589   if (!(MEM_P (mem))
5590       || !SYMBOL_REF_P (XEXP (mem, 0))
5591       || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5592     return false;
5593 
5594   rtx constant = get_pool_constant (XEXP (mem, 0));
5595 
5596   if (GET_CODE (constant) != CONST_VECTOR)
5597     return false;
5598 
5599   /* There could be some rtx like
5600      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5601      but with "*.LC1" referring to a V2DI constant vector.  */
5602   if (GET_MODE (constant) != mode)
5603     {
5604       constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5605 
5606       if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5607 	return false;
5608     }
5609 
5610   for (int i = 0; i != nelt; i++)
5611     perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5612 
5613   return true;
5614 }
5615 
5616 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
5617    but works for floating point parameters and non-offsettable memories.
5618    For pushes, it returns just stack offsets; the values will be saved
5619    in the right order.  Up to four parts are generated.  */
5620 
5621 static int
5622 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5623 {
5624   int size;
5625 
5626   if (!TARGET_64BIT)
5627     size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5628   else
5629     size = (GET_MODE_SIZE (mode) + 4) / 8;
5630 
5631   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5632   gcc_assert (size >= 2 && size <= 4);
5633 
5634   /* Optimize constant pool references to immediates.  This is used by fp
5635      moves that force all constants to memory to allow combining.  */
5636   if (MEM_P (operand) && MEM_READONLY_P (operand))
5637     operand = avoid_constant_pool_reference (operand);
5638 
5639   if (MEM_P (operand) && !offsettable_memref_p (operand))
5640     {
5641       /* The only non-offsetable memories we handle are pushes.  */
5642       int ok = push_operand (operand, VOIDmode);
5643 
5644       gcc_assert (ok);
5645 
5646       operand = copy_rtx (operand);
5647       PUT_MODE (operand, word_mode);
5648       parts[0] = parts[1] = parts[2] = parts[3] = operand;
5649       return size;
5650     }
5651 
5652   if (GET_CODE (operand) == CONST_VECTOR)
5653     {
5654       scalar_int_mode imode = int_mode_for_mode (mode).require ();
5655       /* Caution: if we looked through a constant pool memory above,
5656 	 the operand may actually have a different mode now.  That's
5657 	 ok, since we want to pun this all the way back to an integer.  */
5658       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5659       gcc_assert (operand != NULL);
5660       mode = imode;
5661     }
5662 
5663   if (!TARGET_64BIT)
5664     {
5665       if (mode == DImode)
5666 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5667       else
5668 	{
5669 	  int i;
5670 
5671 	  if (REG_P (operand))
5672 	    {
5673 	      gcc_assert (reload_completed);
5674 	      for (i = 0; i < size; i++)
5675 		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5676 	    }
5677 	  else if (offsettable_memref_p (operand))
5678 	    {
5679 	      operand = adjust_address (operand, SImode, 0);
5680 	      parts[0] = operand;
5681 	      for (i = 1; i < size; i++)
5682 		parts[i] = adjust_address (operand, SImode, 4 * i);
5683 	    }
5684 	  else if (CONST_DOUBLE_P (operand))
5685 	    {
5686 	      const REAL_VALUE_TYPE *r;
5687 	      long l[4];
5688 
5689 	      r = CONST_DOUBLE_REAL_VALUE (operand);
5690 	      switch (mode)
5691 		{
5692 		case E_TFmode:
5693 		  real_to_target (l, r, mode);
5694 		  parts[3] = gen_int_mode (l[3], SImode);
5695 		  parts[2] = gen_int_mode (l[2], SImode);
5696 		  break;
5697 		case E_XFmode:
5698 		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5699 		     long double may not be 80-bit.  */
5700 		  real_to_target (l, r, mode);
5701 		  parts[2] = gen_int_mode (l[2], SImode);
5702 		  break;
5703 		case E_DFmode:
5704 		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5705 		  break;
5706 		default:
5707 		  gcc_unreachable ();
5708 		}
5709 	      parts[1] = gen_int_mode (l[1], SImode);
5710 	      parts[0] = gen_int_mode (l[0], SImode);
5711 	    }
5712 	  else
5713 	    gcc_unreachable ();
5714 	}
5715     }
5716   else
5717     {
5718       if (mode == TImode)
5719 	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5720       if (mode == XFmode || mode == TFmode)
5721 	{
5722 	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5723 	  if (REG_P (operand))
5724 	    {
5725 	      gcc_assert (reload_completed);
5726 	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5727 	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5728 	    }
5729 	  else if (offsettable_memref_p (operand))
5730 	    {
5731 	      operand = adjust_address (operand, DImode, 0);
5732 	      parts[0] = operand;
5733 	      parts[1] = adjust_address (operand, upper_mode, 8);
5734 	    }
5735 	  else if (CONST_DOUBLE_P (operand))
5736 	    {
5737 	      long l[4];
5738 
5739 	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5740 
5741 	      /* real_to_target puts 32-bit pieces in each long.  */
5742 	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5743 				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5744 					  << 32), DImode);
5745 
5746 	      if (upper_mode == SImode)
5747 	        parts[1] = gen_int_mode (l[2], SImode);
5748 	      else
5749 	        parts[1]
5750 		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5751 				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5752 				     << 32), DImode);
5753 	    }
5754 	  else
5755 	    gcc_unreachable ();
5756 	}
5757     }
5758 
5759   return size;
5760 }
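
/* Illustrative sketch (editorial addition, not in the original source):
   on a 32-bit target a DFmode immediate is returned as two SImode words
   holding the target image of the double, conceptually (assuming a
   little-endian host that matches the target format):

     union { double d; unsigned int w[2]; } u = { 2.0 };
     ... parts[0] gets u.w[0] and parts[1] gets u.w[1] ...

   XFmode and TFmode constants likewise yield three or four 32-bit words
   via real_to_target.  */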
5761 
5762 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5763    The value is split into half-mode parts; slots 2 and up of the
5764    operands array are used internally to hold the destination and
5765    source parts in the correct order, and the moves are emitted here.  */
5766 
5767 void
5768 ix86_split_long_move (rtx operands[])
5769 {
5770   rtx part[2][4];
5771   int nparts, i, j;
5772   int push = 0;
5773   int collisions = 0;
5774   machine_mode mode = GET_MODE (operands[0]);
5775   bool collisionparts[4];
5776 
5777   /* The DFmode expanders may ask us to move a double.
5778      For a 64-bit target this is a single move.  By hiding that fact
5779      here we simplify the i386.md splitters.  */
5780   if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5781     {
5782       /* Optimize constant pool references to immediates.  This is used by
5783 	 fp moves that force all constants to memory to allow combining.  */
5784 
5785       if (MEM_P (operands[1])
5786 	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5787 	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5788 	operands[1] = get_pool_constant (XEXP (operands[1], 0));
5789       if (push_operand (operands[0], VOIDmode))
5790 	{
5791 	  operands[0] = copy_rtx (operands[0]);
5792 	  PUT_MODE (operands[0], word_mode);
5793 	}
5794       else
5795         operands[0] = gen_lowpart (DImode, operands[0]);
5796       operands[1] = gen_lowpart (DImode, operands[1]);
5797       emit_move_insn (operands[0], operands[1]);
5798       return;
5799     }
5800 
5801   /* The only non-offsettable memory we handle is push.  */
5802   if (push_operand (operands[0], VOIDmode))
5803     push = 1;
5804   else
5805     gcc_assert (!MEM_P (operands[0])
5806 		|| offsettable_memref_p (operands[0]));
5807 
5808   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5809   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5810 
5811   /* When emitting a push, take care of source operands on the stack.  */
5812   if (push && MEM_P (operands[1])
5813       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5814     {
5815       rtx src_base = XEXP (part[1][nparts - 1], 0);
5816 
5817       /* Compensate for the stack decrement by 4.  */
5818       if (!TARGET_64BIT && nparts == 3
5819 	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5820 	src_base = plus_constant (Pmode, src_base, 4);
5821 
5822       /* src_base refers to the stack pointer and is
5823 	 automatically decreased by emitted push.  */
5824       for (i = 0; i < nparts; i++)
5825 	part[1][i] = change_address (part[1][i],
5826 				     GET_MODE (part[1][i]), src_base);
5827     }
5828 
5829   /* We need to do the copy in the right order in case an address register
5830      of the source overlaps the destination.  */
5831   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5832     {
5833       rtx tmp;
5834 
5835       for (i = 0; i < nparts; i++)
5836 	{
5837 	  collisionparts[i]
5838 	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5839 	  if (collisionparts[i])
5840 	    collisions++;
5841 	}
5842 
5843       /* Collision in the middle part can be handled by reordering.  */
5844       if (collisions == 1 && nparts == 3 && collisionparts [1])
5845 	{
5846 	  std::swap (part[0][1], part[0][2]);
5847 	  std::swap (part[1][1], part[1][2]);
5848 	}
5849       else if (collisions == 1
5850 	       && nparts == 4
5851 	       && (collisionparts [1] || collisionparts [2]))
5852 	{
5853 	  if (collisionparts [1])
5854 	    {
5855 	      std::swap (part[0][1], part[0][2]);
5856 	      std::swap (part[1][1], part[1][2]);
5857 	    }
5858 	  else
5859 	    {
5860 	      std::swap (part[0][2], part[0][3]);
5861 	      std::swap (part[1][2], part[1][3]);
5862 	    }
5863 	}
5864 
5865       /* If there are more collisions, we can't handle it by reordering.
5866 	 Do an lea to the last part and use only one colliding move.  */
5867       else if (collisions > 1)
5868 	{
5869 	  rtx base, addr;
5870 
5871 	  collisions = 1;
5872 
5873 	  base = part[0][nparts - 1];
5874 
5875 	  /* Handle the case when the last part isn't valid for lea.
5876 	     Happens in 64-bit mode storing the 12-byte XFmode.  */
5877 	  if (GET_MODE (base) != Pmode)
5878 	    base = gen_rtx_REG (Pmode, REGNO (base));
5879 
5880 	  addr = XEXP (part[1][0], 0);
5881 	  if (TARGET_TLS_DIRECT_SEG_REFS)
5882 	    {
5883 	      struct ix86_address parts;
5884 	      int ok = ix86_decompose_address (addr, &parts);
5885 	      gcc_assert (ok);
5886 	      /* It is not valid to use %gs: or %fs: in lea.  */
5887 	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5888 	    }
5889 	  emit_insn (gen_rtx_SET (base, addr));
5890 	  part[1][0] = replace_equiv_address (part[1][0], base);
5891 	  for (i = 1; i < nparts; i++)
5892 	    {
5893 	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5894 	      part[1][i] = replace_equiv_address (part[1][i], tmp);
5895 	    }
5896 	}
5897     }
5898 
5899   if (push)
5900     {
5901       if (!TARGET_64BIT)
5902 	{
5903 	  if (nparts == 3)
5904 	    {
5905 	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5906                 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5907 	      emit_move_insn (part[0][2], part[1][2]);
5908 	    }
5909 	  else if (nparts == 4)
5910 	    {
5911 	      emit_move_insn (part[0][3], part[1][3]);
5912 	      emit_move_insn (part[0][2], part[1][2]);
5913 	    }
5914 	}
5915       else
5916 	{
5917 	  /* In 64-bit mode we don't have a 32-bit push available.  In case this
5918 	     is a register, it is OK - we will just use the larger counterpart.
5919 	     We also retype memory - this comes from an attempt to avoid a REX
5920 	     prefix when moving the second half of a TFmode value.  */
5921 	  if (GET_MODE (part[1][1]) == SImode)
5922 	    {
5923 	      switch (GET_CODE (part[1][1]))
5924 		{
5925 		case MEM:
5926 		  part[1][1] = adjust_address (part[1][1], DImode, 0);
5927 		  break;
5928 
5929 		case REG:
5930 		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5931 		  break;
5932 
5933 		default:
5934 		  gcc_unreachable ();
5935 		}
5936 
5937 	      if (GET_MODE (part[1][0]) == SImode)
5938 		part[1][0] = part[1][1];
5939 	    }
5940 	}
5941       emit_move_insn (part[0][1], part[1][1]);
5942       emit_move_insn (part[0][0], part[1][0]);
5943       return;
5944     }
5945 
5946   /* Choose correct order to not overwrite the source before it is copied.  */
5947   if ((REG_P (part[0][0])
5948        && REG_P (part[1][1])
5949        && (REGNO (part[0][0]) == REGNO (part[1][1])
5950 	   || (nparts == 3
5951 	       && REGNO (part[0][0]) == REGNO (part[1][2]))
5952 	   || (nparts == 4
5953 	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
5954       || (collisions > 0
5955 	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5956     {
5957       for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5958 	{
5959 	  operands[2 + i] = part[0][j];
5960 	  operands[6 + i] = part[1][j];
5961 	}
5962     }
5963   else
5964     {
5965       for (i = 0; i < nparts; i++)
5966 	{
5967 	  operands[2 + i] = part[0][i];
5968 	  operands[6 + i] = part[1][i];
5969 	}
5970     }
5971 
5972   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
5973   if (optimize_insn_for_size_p ())
5974     {
5975       for (j = 0; j < nparts - 1; j++)
5976 	if (CONST_INT_P (operands[6 + j])
5977 	    && operands[6 + j] != const0_rtx
5978 	    && REG_P (operands[2 + j]))
5979 	  for (i = j; i < nparts - 1; i++)
5980 	    if (CONST_INT_P (operands[7 + i])
5981 		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5982 	      operands[7 + i] = operands[2 + j];
5983     }
5984 
5985   for (i = 0; i < nparts; i++)
5986     emit_move_insn (operands[2 + i], operands[6 + i]);
5987 
5988   return;
5989 }
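
/* Illustrative sketch (editorial addition, not in the original source):
   the "unCSE" loop above reuses a register that already holds a needed
   constant.  For a hypothetical 32-bit split of the 64-bit constant
   0x1234567812345678, instead of loading the same immediate twice,

     low  part:  eax <- 0x12345678   (immediate load)
     high part:  edx <- 0x12345678   (immediate load)

   the second source is replaced by the first destination register,

     high part:  edx <- eax

   which is smaller when optimizing for size.  */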
5990 
5991 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5992    left shift by a constant, either using a single shift or
5993    a sequence of add instructions.  */
5994 
5995 static void
5996 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5997 {
5998   if (count == 1
5999       || (count * ix86_cost->add <= ix86_cost->shift_const
6000 	  && !optimize_insn_for_size_p ()))
6001     {
6002       while (count-- > 0)
6003 	emit_insn (gen_add2_insn (operand, operand));
6004     }
6005   else
6006     {
6007       rtx (*insn)(rtx, rtx, rtx);
6008 
6009       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6010       emit_insn (insn (operand, operand, GEN_INT (count)));
6011     }
6012 }
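
/* Illustrative sketch (editorial addition, not in the original source):
   when COUNT * cost (add) does not exceed the cost of a shift by a
   constant, the loop above turns e.g. "x <<= 2" into

     x += x;
     x += x;

   otherwise a single shift insn is emitted.  */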
6013 
6014 void
6015 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6016 {
6017   rtx (*gen_ashl3)(rtx, rtx, rtx);
6018   rtx (*gen_shld)(rtx, rtx, rtx);
6019   int half_width = GET_MODE_BITSIZE (mode) >> 1;
6020   machine_mode half_mode;
6021 
6022   rtx low[2], high[2];
6023   int count;
6024 
6025   if (CONST_INT_P (operands[2]))
6026     {
6027       split_double_mode (mode, operands, 2, low, high);
6028       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6029 
6030       if (count >= half_width)
6031 	{
6032 	  emit_move_insn (high[0], low[1]);
6033 	  emit_move_insn (low[0], const0_rtx);
6034 
6035 	  if (count > half_width)
6036 	    ix86_expand_ashl_const (high[0], count - half_width, mode);
6037 	}
6038       else
6039 	{
6040 	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6041 
6042 	  if (!rtx_equal_p (operands[0], operands[1]))
6043 	    emit_move_insn (operands[0], operands[1]);
6044 
6045 	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6046 	  ix86_expand_ashl_const (low[0], count, mode);
6047 	}
6048       return;
6049     }
6050 
6051   split_double_mode (mode, operands, 1, low, high);
6052   half_mode = mode == DImode ? SImode : DImode;
6053 
6054   gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6055 
6056   if (operands[1] == const1_rtx)
6057     {
6058       /* Assuming we've chosen QImode-capable registers, 1 << N
6059 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6060       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6061 	{
6062 	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6063 
6064 	  ix86_expand_clear (low[0]);
6065 	  ix86_expand_clear (high[0]);
6066 	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6067 
6068 	  d = gen_lowpart (QImode, low[0]);
6069 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6070 	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
6071 	  emit_insn (gen_rtx_SET (d, s));
6072 
6073 	  d = gen_lowpart (QImode, high[0]);
6074 	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6075 	  s = gen_rtx_NE (QImode, flags, const0_rtx);
6076 	  emit_insn (gen_rtx_SET (d, s));
6077 	}
6078 
6079       /* Otherwise, we can get the same results by manually performing
6080 	 a bit extract operation on bit 5/6, and then performing the two
6081 	 shifts.  The two methods of getting 0/1 into low/high are exactly
6082 	 the same size.  Avoiding the shift in the bit extract case helps
6083 	 pentium4 a bit; no one else seems to care much either way.  */
6084       else
6085 	{
6086 	  rtx (*gen_lshr3)(rtx, rtx, rtx);
6087 	  rtx (*gen_and3)(rtx, rtx, rtx);
6088 	  rtx (*gen_xor3)(rtx, rtx, rtx);
6089 	  HOST_WIDE_INT bits;
6090 	  rtx x;
6091 
6092 	  if (mode == DImode)
6093 	    {
6094 	      gen_lshr3 = gen_lshrsi3;
6095 	      gen_and3 = gen_andsi3;
6096 	      gen_xor3 = gen_xorsi3;
6097 	      bits = 5;
6098 	    }
6099 	  else
6100 	    {
6101 	      gen_lshr3 = gen_lshrdi3;
6102 	      gen_and3 = gen_anddi3;
6103 	      gen_xor3 = gen_xordi3;
6104 	      bits = 6;
6105 	    }
6106 
6107 	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6108 	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6109 	  else
6110 	    x = gen_lowpart (half_mode, operands[2]);
6111 	  emit_insn (gen_rtx_SET (high[0], x));
6112 
6113 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6114 	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6115 	  emit_move_insn (low[0], high[0]);
6116 	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6117 	}
6118 
6119       emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6120       emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6121       return;
6122     }
6123 
6124   if (operands[1] == constm1_rtx)
6125     {
6126       /* For -1 << N, we can avoid the shld instruction, because we
6127 	 know that we're shifting 0...31/63 ones into a -1.  */
6128       emit_move_insn (low[0], constm1_rtx);
6129       if (optimize_insn_for_size_p ())
6130 	emit_move_insn (high[0], low[0]);
6131       else
6132 	emit_move_insn (high[0], constm1_rtx);
6133     }
6134   else
6135     {
6136       gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6137 
6138       if (!rtx_equal_p (operands[0], operands[1]))
6139 	emit_move_insn (operands[0], operands[1]);
6140 
6141       split_double_mode (mode, operands, 1, low, high);
6142       emit_insn (gen_shld (high[0], low[0], operands[2]));
6143     }
6144 
6145   emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6146 
6147   if (TARGET_CMOVE && scratch)
6148     {
6149       ix86_expand_clear (scratch);
6150       emit_insn (gen_x86_shift_adj_1
6151 		 (half_mode, high[0], low[0], operands[2], scratch));
6152     }
6153   else
6154     emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6155 }
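
/* Illustrative sketch (editorial addition, not in the original source):
   for a variable count C the expansion above computes, with W equal to
   half_width and 0 < (C & (W - 1)),

     hi = (hi << (C & (W - 1))) | (lo >> (W - (C & (W - 1))));   [shld]
     lo =  lo << (C & (W - 1));                                  [shift]
     if (C & W) { hi = lo; lo = 0; }                             [adjust]

   the adjustment being done with cmov when a scratch register is
   available, or with a conditional jump otherwise.  */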
6156 
6157 void
6158 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6159 {
6160   rtx (*gen_ashr3)(rtx, rtx, rtx)
6161     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6162   rtx (*gen_shrd)(rtx, rtx, rtx);
6163   int half_width = GET_MODE_BITSIZE (mode) >> 1;
6164 
6165   rtx low[2], high[2];
6166   int count;
6167 
6168   if (CONST_INT_P (operands[2]))
6169     {
6170       split_double_mode (mode, operands, 2, low, high);
6171       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6172 
6173       if (count == GET_MODE_BITSIZE (mode) - 1)
6174 	{
6175 	  emit_move_insn (high[0], high[1]);
6176 	  emit_insn (gen_ashr3 (high[0], high[0],
6177 				GEN_INT (half_width - 1)));
6178 	  emit_move_insn (low[0], high[0]);
6179 
6180 	}
6181       else if (count >= half_width)
6182 	{
6183 	  emit_move_insn (low[0], high[1]);
6184 	  emit_move_insn (high[0], low[0]);
6185 	  emit_insn (gen_ashr3 (high[0], high[0],
6186 				GEN_INT (half_width - 1)));
6187 
6188 	  if (count > half_width)
6189 	    emit_insn (gen_ashr3 (low[0], low[0],
6190 				  GEN_INT (count - half_width)));
6191 	}
6192       else
6193 	{
6194 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6195 
6196 	  if (!rtx_equal_p (operands[0], operands[1]))
6197 	    emit_move_insn (operands[0], operands[1]);
6198 
6199 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6200 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6201 	}
6202     }
6203   else
6204     {
6205       machine_mode half_mode;
6206 
6207       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6208 
6209       if (!rtx_equal_p (operands[0], operands[1]))
6210 	emit_move_insn (operands[0], operands[1]);
6211 
6212       split_double_mode (mode, operands, 1, low, high);
6213       half_mode = mode == DImode ? SImode : DImode;
6214 
6215       emit_insn (gen_shrd (low[0], high[0], operands[2]));
6216       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6217 
6218       if (TARGET_CMOVE && scratch)
6219 	{
6220 	  emit_move_insn (scratch, high[0]);
6221 	  emit_insn (gen_ashr3 (scratch, scratch,
6222 				GEN_INT (half_width - 1)));
6223 	  emit_insn (gen_x86_shift_adj_1
6224 		     (half_mode, low[0], high[0], operands[2], scratch));
6225 	}
6226       else
6227 	emit_insn (gen_x86_shift_adj_3
6228 		   (half_mode, low[0], high[0], operands[2]));
6229     }
6230 }
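
/* Illustrative sketch (editorial addition, not in the original source):
   the variable-count case above is, with W equal to half_width and
   S = C & (W - 1), S > 0,

     lo = (lo >> S) | (hi << (W - S));              [shrd]
     hi = (signed) hi >> S;                         [sar]
     if (C & W) { lo = hi; hi = hi >> (W - 1); }    [adjust]

   where the sign fill hi >> (W - 1) is precomputed into the scratch
   register when cmov is available.  */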
6231 
6232 void
6233 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6234 {
6235   rtx (*gen_lshr3)(rtx, rtx, rtx)
6236     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6237   rtx (*gen_shrd)(rtx, rtx, rtx);
6238   int half_width = GET_MODE_BITSIZE (mode) >> 1;
6239 
6240   rtx low[2], high[2];
6241   int count;
6242 
6243   if (CONST_INT_P (operands[2]))
6244     {
6245       split_double_mode (mode, operands, 2, low, high);
6246       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6247 
6248       if (count >= half_width)
6249 	{
6250 	  emit_move_insn (low[0], high[1]);
6251 	  ix86_expand_clear (high[0]);
6252 
6253 	  if (count > half_width)
6254 	    emit_insn (gen_lshr3 (low[0], low[0],
6255 				  GEN_INT (count - half_width)));
6256 	}
6257       else
6258 	{
6259 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6260 
6261 	  if (!rtx_equal_p (operands[0], operands[1]))
6262 	    emit_move_insn (operands[0], operands[1]);
6263 
6264 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6265 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6266 	}
6267     }
6268   else
6269     {
6270       machine_mode half_mode;
6271 
6272       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6273 
6274       if (!rtx_equal_p (operands[0], operands[1]))
6275 	emit_move_insn (operands[0], operands[1]);
6276 
6277       split_double_mode (mode, operands, 1, low, high);
6278       half_mode = mode == DImode ? SImode : DImode;
6279 
6280       emit_insn (gen_shrd (low[0], high[0], operands[2]));
6281       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6282 
6283       if (TARGET_CMOVE && scratch)
6284 	{
6285 	  ix86_expand_clear (scratch);
6286 	  emit_insn (gen_x86_shift_adj_1
6287 		     (half_mode, low[0], high[0], operands[2], scratch));
6288 	}
6289       else
6290 	emit_insn (gen_x86_shift_adj_2
6291 		   (half_mode, low[0], high[0], operands[2]));
6292     }
6293 }
6294 
6295 /* Expand move of V1TI mode register X to a new TI mode register.  */
6296 static rtx
6297 ix86_expand_v1ti_to_ti (rtx x)
6298 {
6299   rtx result = gen_reg_rtx (TImode);
6300   if (TARGET_SSE2)
6301     {
6302       rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6303       rtx lo = gen_lowpart (DImode, result);
6304       emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6305       rtx hi = gen_highpart (DImode, result);
6306       emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6307     }
6308   else
6309     emit_move_insn (result, gen_lowpart (TImode, x));
6310   return result;
6311 }
6312 
6313 /* Expand move of TI mode register X to a new V1TI mode register.  */
6314 static rtx
6315 ix86_expand_ti_to_v1ti (rtx x)
6316 {
6317   if (TARGET_SSE2)
6318     {
6319       rtx lo = gen_lowpart (DImode, x);
6320       rtx hi = gen_highpart (DImode, x);
6321       rtx tmp = gen_reg_rtx (V2DImode);
6322       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6323       return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6324     }
6325 
6326   return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6327 }
6328 
6329 /* Expand V1TI mode shift (of rtx_code CODE) by constant.  */
6330 void
6331 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6332 {
6333   rtx op1 = force_reg (V1TImode, operands[1]);
6334 
6335   if (!CONST_INT_P (operands[2]))
6336     {
6337       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6338       rtx tmp2 = gen_reg_rtx (TImode);
6339       rtx (*shift) (rtx, rtx, rtx)
6340 	    = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6341       emit_insn (shift (tmp2, tmp1, operands[2]));
6342       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6343       emit_move_insn (operands[0], tmp3);
6344       return;
6345     }
6346 
6347   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6348 
6349   if (bits == 0)
6350     {
6351       emit_move_insn (operands[0], op1);
6352       return;
6353     }
6354 
6355   if ((bits & 7) == 0)
6356     {
6357       rtx tmp = gen_reg_rtx (V1TImode);
6358       if (code == ASHIFT)
6359 	emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6360       else
6361 	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6362       emit_move_insn (operands[0], tmp);
6363       return;
6364     }
6365 
6366   rtx tmp1 = gen_reg_rtx (V1TImode);
6367   if (code == ASHIFT)
6368     emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6369   else
6370     emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6371 
6372   /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
6373   rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6374 
6375   /* tmp3 will be the V2DImode result.  */
6376   rtx tmp3 = gen_reg_rtx (V2DImode);
6377 
6378   if (bits > 64)
6379     {
6380       if (code == ASHIFT)
6381 	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6382       else
6383 	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6384     }
6385   else
6386     {
6387       /* tmp4 is operands[1], in V2DImode.  */
6388       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6389 
6390       rtx tmp5 = gen_reg_rtx (V2DImode);
6391       if (code == ASHIFT)
6392 	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6393       else
6394 	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6395 
6396       rtx tmp6 = gen_reg_rtx (V2DImode);
6397       if (code == ASHIFT)
6398 	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6399       else
6400 	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6401 
6402       emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6403     }
6404 
6405   /* Convert the result back to V1TImode and store in operands[0].  */
6406   rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6407   emit_move_insn (operands[0], tmp7);
6408 }
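
/* Illustrative sketch (editorial addition, not in the original source):
   for a constant count BITS in 1..63 that is not a multiple of 8, the
   expansion above builds the 128-bit shift from the 64-bit halves LO
   and HI; for ASHIFT,

     new_lo =  lo << bits;
     new_hi = (hi << bits) | (lo >> (64 - bits));

   the (lo >> (64 - bits)) term coming from the whole-register byte
   shift by 64 followed by a per-element shift.  The LSHIFTRT case is
   the mirror image, and counts above 64 need only a single per-element
   shift of the byte-shifted value.  */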
6409 
6410 /* Expand V1TI mode rotate (of rtx_code CODE) by constant.  */
6411 void
6412 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6413 {
6414   rtx op1 = force_reg (V1TImode, operands[1]);
6415 
6416   if (!CONST_INT_P (operands[2]))
6417     {
6418       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6419       rtx tmp2 = gen_reg_rtx (TImode);
6420       rtx (*rotate) (rtx, rtx, rtx)
6421 	    = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6422       emit_insn (rotate (tmp2, tmp1, operands[2]));
6423       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6424       emit_move_insn (operands[0], tmp3);
6425       return;
6426     }
6427 
6428   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6429 
6430   if (bits == 0)
6431     {
6432       emit_move_insn (operands[0], op1);
6433       return;
6434     }
6435 
6436   if (code == ROTATERT)
6437     bits = 128 - bits;
6438 
6439   if ((bits & 31) == 0)
6440     {
6441       rtx tmp2 = gen_reg_rtx (V4SImode);
6442       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6443       if (bits == 32)
6444 	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6445       else if (bits == 64)
6446 	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6447       else
6448 	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6449       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6450       return;
6451     }
6452 
6453   if ((bits & 7) == 0)
6454     {
6455       rtx tmp1 = gen_reg_rtx (V1TImode);
6456       rtx tmp2 = gen_reg_rtx (V1TImode);
6457       rtx tmp3 = gen_reg_rtx (V1TImode);
6458 
6459       emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6460       emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6461       emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6462       emit_move_insn (operands[0], tmp3);
6463       return;
6464     }
6465 
6466   rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6467 
6468   rtx lobits;
6469   rtx hibits;
6470 
6471   switch (bits >> 5)
6472     {
6473     case 0:
6474       lobits = op1_v4si;
6475       hibits = gen_reg_rtx (V4SImode);
6476       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6477       break;
6478 
6479     case 1:
6480       lobits = gen_reg_rtx (V4SImode);
6481       hibits = gen_reg_rtx (V4SImode);
6482       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6483       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6484       break;
6485 
6486     case 2:
6487       lobits = gen_reg_rtx (V4SImode);
6488       hibits = gen_reg_rtx (V4SImode);
6489       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6490       emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6491       break;
6492 
6493     default:
6494       lobits = gen_reg_rtx (V4SImode);
6495       emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6496       hibits = op1_v4si;
6497       break;
6498     }
6499 
6500   rtx tmp1 = gen_reg_rtx (V4SImode);
6501   rtx tmp2 = gen_reg_rtx (V4SImode);
6502   rtx tmp3 = gen_reg_rtx (V4SImode);
6503 
6504   emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6505   emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6506   emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6507 
6508   emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6509 }
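
/* Illustrative sketch (editorial addition, not in the original source):
   in the general case above a left rotate by BITS is built from 32-bit
   lanes.  With d[0..3] the dwords of the input, K = BITS >> 5 and
   S = BITS & 31 (S != 0), each result lane is the hypothetical

     res[i] = (d[(i - K) & 3] << S) | (d[(i - K - 1) & 3] >> (32 - S));

   the two pshufd-rotated copies (lobits and hibits) supplying the
   d[i - K] and d[i - K - 1] terms.  */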
6510 
6511 /* Expand V1TI mode ashiftrt by constant.  */
6512 void
6513 ix86_expand_v1ti_ashiftrt (rtx operands[])
6514 {
6515   rtx op1 = force_reg (V1TImode, operands[1]);
6516 
6517   if (!CONST_INT_P (operands[2]))
6518     {
6519       rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6520       rtx tmp2 = gen_reg_rtx (TImode);
6521       emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6522       rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6523       emit_move_insn (operands[0], tmp3);
6524       return;
6525     }
6526 
6527   HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6528 
6529   if (bits == 0)
6530     {
6531       emit_move_insn (operands[0], op1);
6532       return;
6533     }
6534 
6535   if (bits == 127)
6536     {
6537       /* Two operations.  */
6538       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6539       rtx tmp2 = gen_reg_rtx (V4SImode);
6540       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6541 
6542       rtx tmp3 = gen_reg_rtx (V4SImode);
6543       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6544 
6545       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6546       return;
6547     }
6548 
6549   if (bits == 64)
6550     {
6551       /* Three operations.  */
6552       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6553       rtx tmp2 = gen_reg_rtx (V4SImode);
6554       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6555 
6556       rtx tmp3 = gen_reg_rtx (V4SImode);
6557       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6558 
6559       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6560       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6561       rtx tmp6 = gen_reg_rtx (V2DImode);
6562       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6563 
6564       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6565       return;
6566     }
6567 
6568   if (bits == 96)
6569     {
6570       /* Three operations.  */
6571       rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6572       rtx tmp2 = gen_reg_rtx (V4SImode);
6573       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6574 
6575       rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6576       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6577       rtx tmp5 = gen_reg_rtx (V2DImode);
6578       emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6579 
6580       rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
6581       rtx tmp7 = gen_reg_rtx (V4SImode);
6582       emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6583 
6584       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6585       return;
6586     }
6587 
6588   if (bits >= 111)
6589     {
6590       /* Three operations.  */
6591       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6592       rtx tmp2 = gen_reg_rtx (V4SImode);
6593       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6594 
6595       rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6596       rtx tmp4 = gen_reg_rtx (V8HImode);
6597       emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6598 
6599       rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6600       rtx tmp6 = gen_reg_rtx (V4SImode);
6601       emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6602 
6603       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6604       return;
6605     }
6606 
6607   if (TARGET_AVX2 || TARGET_SSE4_1)
6608     {
6609       /* Three operations.  */
6610       if (bits == 32)
6611 	{
6612 	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6613 	  rtx tmp2 = gen_reg_rtx (V4SImode);
6614 	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6615 
6616 	  rtx tmp3 = gen_reg_rtx (V1TImode);
6617 	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6618 
6619 	  if (TARGET_AVX2)
6620 	    {
6621 	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6622 	      rtx tmp5 = gen_reg_rtx (V4SImode);
6623 	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6624 					       GEN_INT (7)));
6625 
6626 	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6627 	    }
6628 	  else
6629 	    {
6630 	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6631 	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6632 	      rtx tmp6 = gen_reg_rtx (V8HImode);
6633 	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6634 					     GEN_INT (0x3f)));
6635 
6636 	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6637 	    }
6638 	  return;
6639 	}
6640 
6641       /* Three operations.  */
6642       if (bits == 8 || bits == 16 || bits == 24)
6643 	{
6644 	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6645 	  rtx tmp2 = gen_reg_rtx (V4SImode);
6646 	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6647 
6648 	  rtx tmp3 = gen_reg_rtx (V1TImode);
6649 	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6650 
6651 	  if (TARGET_AVX2)
6652 	    {
6653 	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6654 	      rtx tmp5 = gen_reg_rtx (V4SImode);
6655 	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6656 					       GEN_INT (7)));
6657 
6658 	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6659 	    }
6660 	  else
6661 	    {
6662 	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6663 	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6664 	      rtx tmp6 = gen_reg_rtx (V8HImode);
6665 	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6666 					     GEN_INT (0x3f)));
6667 
6668 	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6669 	    }
6670 	  return;
6671 	}
6672     }
6673 
6674   if (bits > 96)
6675     {
6676       /* Four operations.  */
6677       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6678       rtx tmp2 = gen_reg_rtx (V4SImode);
6679       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6680 
6681       rtx tmp3 = gen_reg_rtx (V4SImode);
6682       emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6683 
6684       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6685       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6686       rtx tmp6 = gen_reg_rtx (V2DImode);
6687       emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6688 
6689       rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6690       rtx tmp8 = gen_reg_rtx (V4SImode);
6691       emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6692 
6693       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
6694       return;
6695     }
6696 
6697   if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6698     {
6699       /* Four operations.  */
6700       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6701       rtx tmp2 = gen_reg_rtx (V4SImode);
6702       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6703 
6704       rtx tmp3 = gen_reg_rtx (V4SImode);
6705       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6706 
6707       rtx tmp4 = gen_reg_rtx (V1TImode);
6708       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6709 
6710       rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6711       rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
6712       rtx tmp7 = gen_reg_rtx (V8HImode);
6713       emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6714 				     GEN_INT (bits == 48 ? 0x1f : 0x07)));
6715 
6716       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6717       return;
6718     }
6719 
6720   if ((bits & 7) == 0)
6721     {
6722       /* Five operations.  */
6723       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6724       rtx tmp2 = gen_reg_rtx (V4SImode);
6725       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6726 
6727       rtx tmp3 = gen_reg_rtx (V4SImode);
6728       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6729 
6730       rtx tmp4 = gen_reg_rtx (V1TImode);
6731       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6732 
6733       rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6734       rtx tmp6 = gen_reg_rtx (V1TImode);
6735       emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6736 
6737       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6738       rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
6739       rtx tmp9 = gen_reg_rtx (V2DImode);
6740       emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6741 
6742       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
6743       return;
6744     }
6745 
6746   if (TARGET_AVX2 && bits < 32)
6747     {
6748       /* Six operations.  */
6749       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6750       rtx tmp2 = gen_reg_rtx (V4SImode);
6751       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6752 
6753       rtx tmp3 = gen_reg_rtx (V1TImode);
6754       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6755 
6756       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6757       rtx tmp5 = gen_reg_rtx (V2DImode);
6758       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6759 
6760       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6761       rtx tmp7 = gen_reg_rtx (V2DImode);
6762       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6763 
6764       rtx tmp8 = gen_reg_rtx (V2DImode);
6765       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6766 
6767       rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
6768       rtx tmp10 = gen_reg_rtx (V4SImode);
6769       emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6770 
6771       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
6772       return;
6773     }
6774 
6775   if (TARGET_SSE4_1 && bits < 15)
6776     {
6777       /* Six operations.  */
6778       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6779       rtx tmp2 = gen_reg_rtx (V4SImode);
6780       emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6781 
6782       rtx tmp3 = gen_reg_rtx (V1TImode);
6783       emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6784 
6785       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6786       rtx tmp5 = gen_reg_rtx (V2DImode);
6787       emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6788 
6789       rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6790       rtx tmp7 = gen_reg_rtx (V2DImode);
6791       emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6792 
6793       rtx tmp8 = gen_reg_rtx (V2DImode);
6794       emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6795 
6796       rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6797       rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
6798       rtx tmp11 = gen_reg_rtx (V8HImode);
6799       emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6800 
6801       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
6802       return;
6803     }
6804 
6805   if (bits == 1)
6806     {
6807       /* Eight operations.  */
6808       rtx tmp1 = gen_reg_rtx (V1TImode);
6809       emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6810 
6811       rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6812       rtx tmp3 = gen_reg_rtx (V2DImode);
6813       emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6814 
6815       rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6816       rtx tmp5 = gen_reg_rtx (V2DImode);
6817       emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6818 
6819       rtx tmp6 = gen_reg_rtx (V2DImode);
6820       emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
6821 
6822       rtx tmp7 = gen_reg_rtx (V2DImode);
6823       emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
6824 
6825       rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
6826       rtx tmp9 = gen_reg_rtx (V4SImode);
6827       emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
6828 
6829       rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
6830       rtx tmp11 = gen_reg_rtx (V2DImode);
6831       emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
6832 
6833       rtx tmp12 = gen_reg_rtx (V2DImode);
6834       emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
6835 
6836       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
6837       return;
6838     }
6839 
6840   if (bits > 64)
6841     {
6842       /* Eight operations.  */
6843       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6844       rtx tmp2 = gen_reg_rtx (V4SImode);
6845       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6846 
6847       rtx tmp3 = gen_reg_rtx (V4SImode);
6848       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6849 
6850       rtx tmp4 = gen_reg_rtx (V1TImode);
6851       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6852 
6853       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6854       rtx tmp6 = gen_reg_rtx (V2DImode);
6855       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
6856 
6857       rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6858       rtx tmp8 = gen_reg_rtx (V1TImode);
6859       emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
6860 
6861       rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6862       rtx tmp10 = gen_reg_rtx (V2DImode);
6863       emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
6864 
6865       rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
6866       rtx tmp12 = gen_reg_rtx (V2DImode);
6867       emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
6868 
6869       rtx tmp13 = gen_reg_rtx (V2DImode);
6870       emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
6871 
6872       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
6873     }
6874   else
6875     {
6876       /* Nine operations.  */
6877       rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6878       rtx tmp2 = gen_reg_rtx (V4SImode);
6879       emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6880 
6881       rtx tmp3 = gen_reg_rtx (V4SImode);
6882       emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6883 
6884       rtx tmp4 = gen_reg_rtx (V1TImode);
6885       emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6886 
6887       rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6888       rtx tmp6 = gen_reg_rtx (V2DImode);
6889       emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
6890 
6891       rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6892       rtx tmp8 = gen_reg_rtx (V2DImode);
6893       emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
6894 
6895       rtx tmp9 = gen_reg_rtx (V2DImode);
6896       emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
6897 
6898       rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6899       rtx tmp11 = gen_reg_rtx (V1TImode);
6900       emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
6901 
6902       rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
6903       rtx tmp13 = gen_reg_rtx (V2DImode);
6904       emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
6905 
6906       rtx tmp14 = gen_reg_rtx (V2DImode);
6907       emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
6908 
6909       emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
6910     }
6911 }
6912 
6913 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
6914    DImode for constant loop counts.  */
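/* For illustration: a count expression that already carries a mode keeps it;
   otherwise a constant such as 0x1000 gets an SImode counter even with
   TARGET_64BIT, a constant needing more than 32 bits (e.g.
   (HOST_WIDE_INT) 1 << 32) gets DImode, and anything else falls back to
   Pmode.  */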
6915 
6916 static machine_mode
6917 counter_mode (rtx count_exp)
6918 {
6919   if (GET_MODE (count_exp) != VOIDmode)
6920     return GET_MODE (count_exp);
6921   if (!CONST_INT_P (count_exp))
6922     return Pmode;
6923   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
6924     return DImode;
6925   return SImode;
6926 }
6927 
6928 /* When ISSETMEM is FALSE, output a simple loop that moves memory pointed to
6929    by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
6930    size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
6931    equivalent loop that sets memory to VALUE (assumed to be in MODE).
6932 
6933    The size is rounded down to a whole number of chunks moved at once.
6934    SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
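/* A rough C-level sketch of the emitted loop (illustrative only, not the
   actual RTL), where CHUNK = GET_MODE_SIZE (MODE) * UNROLL:

     size = count & ~(CHUNK - 1);
     for (iter = 0; iter < size; iter += CHUNK)
       copy (or store VALUE into) CHUNK bytes at DESTPTR + iter,
	 reading from SRCPTR + iter in the copy case;
     destptr += size;
     srcptr += size;    -- only for the copy case

   Within one unrolled iteration the UNROLL loads are emitted before the
   UNROLL stores, so the temporaries let chips that reorder memory accesses
   overlap them.  */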
6935 
6936 
6937 static void
6938 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
6939 			       rtx destptr, rtx srcptr, rtx value,
6940 			       rtx count, machine_mode mode, int unroll,
6941 			       int expected_size, bool issetmem)
6942 {
6943   rtx_code_label *out_label, *top_label;
6944   rtx iter, tmp;
6945   machine_mode iter_mode = counter_mode (count);
6946   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
6947   rtx piece_size = GEN_INT (piece_size_n);
6948   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
6949   rtx size;
6950   int i;
6951 
6952   top_label = gen_label_rtx ();
6953   out_label = gen_label_rtx ();
6954   iter = gen_reg_rtx (iter_mode);
6955 
6956   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
6957 			      NULL, 1, OPTAB_DIRECT);
6958   /* Those two should combine.  */
6959   if (piece_size == const1_rtx)
6960     {
6961       emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
6962 			       true, out_label);
6963       predict_jump (REG_BR_PROB_BASE * 10 / 100);
6964     }
6965   emit_move_insn (iter, const0_rtx);
6966 
6967   emit_label (top_label);
6968 
6969   tmp = convert_modes (Pmode, iter_mode, iter, true);
6970 
6971   /* This assert could be relaxed - in that case we'd need to compute
6972      the smallest power of two containing PIECE_SIZE_N and pass it to
6973      offset_address.  */
6974   gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
6975   destmem = offset_address (destmem, tmp, piece_size_n);
6976   destmem = adjust_address (destmem, mode, 0);
6977 
6978   if (!issetmem)
6979     {
6980       srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
6981       srcmem = adjust_address (srcmem, mode, 0);
6982 
6983       /* When unrolling for chips that reorder memory reads and writes,
6984 	 we can save registers by using a single temporary.
6985 	 Also, using 4 temporaries is overkill in 32-bit mode.  */
6986       if (!TARGET_64BIT && 0)
6987 	{
6988 	  for (i = 0; i < unroll; i++)
6989 	    {
6990 	      if (i)
6991 		{
6992 		  destmem = adjust_address (copy_rtx (destmem), mode,
6993 					    GET_MODE_SIZE (mode));
6994 		  srcmem = adjust_address (copy_rtx (srcmem), mode,
6995 					   GET_MODE_SIZE (mode));
6996 		}
6997 	      emit_move_insn (destmem, srcmem);
6998 	    }
6999 	}
7000       else
7001 	{
7002 	  rtx tmpreg[4];
7003 	  gcc_assert (unroll <= 4);
7004 	  for (i = 0; i < unroll; i++)
7005 	    {
7006 	      tmpreg[i] = gen_reg_rtx (mode);
7007 	      if (i)
7008 		srcmem = adjust_address (copy_rtx (srcmem), mode,
7009 					 GET_MODE_SIZE (mode));
7010 	      emit_move_insn (tmpreg[i], srcmem);
7011 	    }
7012 	  for (i = 0; i < unroll; i++)
7013 	    {
7014 	      if (i)
7015 		destmem = adjust_address (copy_rtx (destmem), mode,
7016 					  GET_MODE_SIZE (mode));
7017 	      emit_move_insn (destmem, tmpreg[i]);
7018 	    }
7019 	}
7020     }
7021   else
7022     for (i = 0; i < unroll; i++)
7023       {
7024 	if (i)
7025 	  destmem = adjust_address (copy_rtx (destmem), mode,
7026 				    GET_MODE_SIZE (mode));
7027 	emit_move_insn (destmem, value);
7028       }
7029 
7030   tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7031 			     true, OPTAB_LIB_WIDEN);
7032   if (tmp != iter)
7033     emit_move_insn (iter, tmp);
7034 
7035   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7036 			   true, top_label);
7037   if (expected_size != -1)
7038     {
7039       expected_size /= GET_MODE_SIZE (mode) * unroll;
7040       if (expected_size == 0)
7041 	predict_jump (0);
7042       else if (expected_size > REG_BR_PROB_BASE)
7043 	predict_jump (REG_BR_PROB_BASE - 1);
7044       else
7045         predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7046 		      / expected_size);
7047     }
7048   else
7049     predict_jump (REG_BR_PROB_BASE * 80 / 100);
7050   iter = ix86_zero_extend_to_Pmode (iter);
7051   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7052 			     true, OPTAB_LIB_WIDEN);
7053   if (tmp != destptr)
7054     emit_move_insn (destptr, tmp);
7055   if (!issetmem)
7056     {
7057       tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7058 				 true, OPTAB_LIB_WIDEN);
7059       if (tmp != srcptr)
7060 	emit_move_insn (srcptr, tmp);
7061     }
7062   emit_label (out_label);
7063 }
7064 
7065 /* Divide COUNTREG by SCALE.  */
7066 static rtx
7067 scale_counter (rtx countreg, int scale)
7068 {
7069   rtx sc;
7070 
7071   if (scale == 1)
7072     return countreg;
7073   if (CONST_INT_P (countreg))
7074     return GEN_INT (INTVAL (countreg) / scale);
7075   gcc_assert (REG_P (countreg));
7076 
7077   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7078 			    GEN_INT (exact_log2 (scale)),
7079 			    NULL, 1, OPTAB_DIRECT);
7080   return sc;
7081 }
7082 
7083 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7084    When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7085    When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7086    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7087    ORIG_VALUE is the original value passed to memset to fill the memory with.
7088    Other arguments have the same meaning as for the previous function.  */
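/* Conceptually (an illustrative sketch, not the emitted pattern), for SImode
   the rep forms behave like

     rep movsd:  for (n = count / 4; n != 0; n--, d += 4, s += 4)
		   *(int *) d = *(int *) s;
     rep stosd:  for (n = count / 4; n != 0; n--, d += 4)
		   *(int *) d = value;

   DESTEXP/SRCEXP describe the pointer values at the end of the operation for
   the insn patterns.  */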
7089 
7090 static void
7091 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7092 			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7093 			   rtx count,
7094 			   machine_mode mode, bool issetmem)
7095 {
7096   rtx destexp;
7097   rtx srcexp;
7098   rtx countreg;
7099   HOST_WIDE_INT rounded_count;
7100 
7101   /* If possible, it is shorter to use rep movs.
7102      TODO: Maybe it is better to move this logic to decide_alg.  */
7103   if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7104       && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7105       && (!issetmem || orig_value == const0_rtx))
7106     mode = SImode;
7107 
7108   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7109     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7110 
7111   countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7112 						       GET_MODE_SIZE (mode)));
7113   if (mode != QImode)
7114     {
7115       destexp = gen_rtx_ASHIFT (Pmode, countreg,
7116 				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7117       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7118     }
7119   else
7120     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7121   if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7122     {
7123       rounded_count
7124 	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7125       destmem = shallow_copy_rtx (destmem);
7126       set_mem_size (destmem, rounded_count);
7127     }
7128   else if (MEM_SIZE_KNOWN_P (destmem))
7129     clear_mem_size (destmem);
7130 
7131   if (issetmem)
7132     {
7133       value = force_reg (mode, gen_lowpart (mode, value));
7134       emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7135     }
7136   else
7137     {
7138       if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7139 	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7140       if (mode != QImode)
7141 	{
7142 	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7143 				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7144 	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7145 	}
7146       else
7147 	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7148       if (CONST_INT_P (count))
7149 	{
7150 	  rounded_count
7151 	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7152 	  srcmem = shallow_copy_rtx (srcmem);
7153 	  set_mem_size (srcmem, rounded_count);
7154 	}
7155       else
7156 	{
7157 	  if (MEM_SIZE_KNOWN_P (srcmem))
7158 	    clear_mem_size (srcmem);
7159 	}
7160       emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7161 			      destexp, srcexp));
7162     }
7163 }
7164 
7165 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7166    DESTMEM.
7167    SRCMEM is passed by pointer to be updated on return.
7168    The return value is the updated DESTMEM.  */
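/* Strategy in C terms (illustrative only): pick the widest supported piece
   size, then copy piece by piece through a temporary register, bumping both
   pointers as we go:

     piece = largest supported power of two <= size_to_move;
     for (done = 0; done < size_to_move; done += piece)
       {
	 tmp = load piece bytes from SRCPTR;
	 store tmp to DESTPTR;
	 destptr += piece;  srcptr += piece;
       }
*/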
7169 static rtx
7170 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7171 	     HOST_WIDE_INT size_to_move)
7172 {
7173   rtx dst = destmem, src = *srcmem, tempreg;
7174   enum insn_code code;
7175   machine_mode move_mode;
7176   int piece_size, i;
7177 
7178   /* Find the widest mode in which we could perform moves.
7179      Start with the biggest power of 2 not greater than SIZE_TO_MOVE and
7180      halve it until a move of that size is supported.  */
7181   piece_size = 1 << floor_log2 (size_to_move);
7182   while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7183 	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7184     {
7185       gcc_assert (piece_size > 1);
7186       piece_size >>= 1;
7187     }
7188 
7189   /* Find the corresponding vector mode with the same size as MOVE_MODE.
7190      MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
7191   if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7192     {
7193       int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7194       if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7195 	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7196 	{
7197 	  move_mode = word_mode;
7198 	  piece_size = GET_MODE_SIZE (move_mode);
7199 	  code = optab_handler (mov_optab, move_mode);
7200 	}
7201     }
7202   gcc_assert (code != CODE_FOR_nothing);
7203 
7204   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7205   src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7206 
7207   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
7208   gcc_assert (size_to_move % piece_size == 0);
7209 
7210   for (i = 0; i < size_to_move; i += piece_size)
7211     {
7212       /* We move from memory to memory, so we'll need to do it via
7213 	 a temporary register.  */
7214       tempreg = gen_reg_rtx (move_mode);
7215       emit_insn (GEN_FCN (code) (tempreg, src));
7216       emit_insn (GEN_FCN (code) (dst, tempreg));
7217 
7218       emit_move_insn (destptr,
7219 		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
7220       emit_move_insn (srcptr,
7221 		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7222 
7223       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7224 					  piece_size);
7225       src = adjust_automodify_address_nv (src, move_mode, srcptr,
7226 					  piece_size);
7227     }
7228 
7229   /* Update DST and SRC rtx.  */
7230   *srcmem = src;
7231   return dst;
7232 }
7233 
7234 /* Helper function for the string operations below.  Test whether VARIABLE
7235    is aligned to VALUE bytes; if so, jump to the returned label.  */
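/* In C terms the emitted test is simply (illustrative only):

     if ((variable & value) == 0)
       goto label;

   so the caller's code between this point and LABEL runs only when the
   VALUE bit of VARIABLE is set.  */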
7236 
7237 static rtx_code_label *
7238 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7239 {
7240   rtx_code_label *label = gen_label_rtx ();
7241   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7242   if (GET_MODE (variable) == DImode)
7243     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7244   else
7245     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7246   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7247 			   1, label);
7248   if (epilogue)
7249     predict_jump (REG_BR_PROB_BASE * 50 / 100);
7250   else
7251     predict_jump (REG_BR_PROB_BASE * 90 / 100);
7252   return label;
7253 }
7254 
7255 
7256 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
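/* For a constant COUNT the epilogue is the binary decomposition of
   COUNT % MAX_SIZE; e.g. with MAX_SIZE == 16 (a sketch, not generated code):

     rem = count % 16;
     if (rem & 8) copy 8 bytes;
     if (rem & 4) copy 4 bytes;
     if (rem & 2) copy 2 bytes;
     if (rem & 1) copy 1 byte;

   For a variable COUNT the same bits are tested at run time via
   ix86_expand_aligntest, or a byte loop is used when MAX_SIZE > 8.  */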
7257 
7258 static void
7259 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7260 			rtx destptr, rtx srcptr, rtx count, int max_size)
7261 {
7262   rtx src, dest;
7263   if (CONST_INT_P (count))
7264     {
7265       HOST_WIDE_INT countval = INTVAL (count);
7266       HOST_WIDE_INT epilogue_size = countval % max_size;
7267       int i;
7268 
7269       /* For now MAX_SIZE should be a power of 2.  This assert could be
7270 	 relaxed, but it'll require a bit more complicated epilogue
7271 	 expanding.  */
7272       gcc_assert ((max_size & (max_size - 1)) == 0);
7273       for (i = max_size; i >= 1; i >>= 1)
7274 	{
7275 	  if (epilogue_size & i)
7276 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7277 	}
7278       return;
7279     }
7280   if (max_size > 8)
7281     {
7282       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7283 				    count, 1, OPTAB_DIRECT);
7284       expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7285 				     count, QImode, 1, 4, false);
7286       return;
7287     }
7288 
7289   /* When there are stringops, we can cheaply increase dest and src pointers.
7290      Otherwise we save code size by maintaining offset (zero is readily
7291      available from preceding rep operation) and using x86 addressing modes.
7292    */
7293   if (TARGET_SINGLE_STRINGOP)
7294     {
7295       if (max_size > 4)
7296 	{
7297 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7298 	  src = change_address (srcmem, SImode, srcptr);
7299 	  dest = change_address (destmem, SImode, destptr);
7300 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
7301 	  emit_label (label);
7302 	  LABEL_NUSES (label) = 1;
7303 	}
7304       if (max_size > 2)
7305 	{
7306 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7307 	  src = change_address (srcmem, HImode, srcptr);
7308 	  dest = change_address (destmem, HImode, destptr);
7309 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
7310 	  emit_label (label);
7311 	  LABEL_NUSES (label) = 1;
7312 	}
7313       if (max_size > 1)
7314 	{
7315 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7316 	  src = change_address (srcmem, QImode, srcptr);
7317 	  dest = change_address (destmem, QImode, destptr);
7318 	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
7319 	  emit_label (label);
7320 	  LABEL_NUSES (label) = 1;
7321 	}
7322     }
7323   else
7324     {
7325       rtx offset = force_reg (Pmode, const0_rtx);
7326       rtx tmp;
7327 
7328       if (max_size > 4)
7329 	{
7330 	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7331 	  src = change_address (srcmem, SImode, srcptr);
7332 	  dest = change_address (destmem, SImode, destptr);
7333 	  emit_move_insn (dest, src);
7334 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7335 				     true, OPTAB_LIB_WIDEN);
7336 	  if (tmp != offset)
7337 	    emit_move_insn (offset, tmp);
7338 	  emit_label (label);
7339 	  LABEL_NUSES (label) = 1;
7340 	}
7341       if (max_size > 2)
7342 	{
7343 	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7344 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7345 	  src = change_address (srcmem, HImode, tmp);
7346 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7347 	  dest = change_address (destmem, HImode, tmp);
7348 	  emit_move_insn (dest, src);
7349 	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7350 				     true, OPTAB_LIB_WIDEN);
7351 	  if (tmp != offset)
7352 	    emit_move_insn (offset, tmp);
7353 	  emit_label (label);
7354 	  LABEL_NUSES (label) = 1;
7355 	}
7356       if (max_size > 1)
7357 	{
7358 	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7359 	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7360 	  src = change_address (srcmem, QImode, tmp);
7361 	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7362 	  dest = change_address (destmem, QImode, tmp);
7363 	  emit_move_insn (dest, src);
7364 	  emit_label (label);
7365 	  LABEL_NUSES (label) = 1;
7366 	}
7367     }
7368 }
7369 
7370 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7371    with value PROMOTED_VAL.  The mode of PROMOTED_VAL determines the width of
7372    the individual stores (narrowed if SIZE_TO_MOVE is smaller).
7373    The return value is the updated DESTMEM.  */
7374 static rtx
7375 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7376 	     HOST_WIDE_INT size_to_move)
7377 {
7378   rtx dst = destmem;
7379   enum insn_code code;
7380   machine_mode move_mode;
7381   int piece_size, i;
7382 
7383   /* Find the widest mode in which we could perform moves: start from the
7384      mode of PROMOTED_VAL and narrow it when SIZE_TO_MOVE is smaller than
7385      that mode's size.  */
7386   move_mode = GET_MODE (promoted_val);
7387   if (move_mode == VOIDmode)
7388     move_mode = QImode;
7389   if (size_to_move < GET_MODE_SIZE (move_mode))
7390     {
7391       unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7392       move_mode = int_mode_for_size (move_bits, 0).require ();
7393       promoted_val = gen_lowpart (move_mode, promoted_val);
7394     }
7395   piece_size = GET_MODE_SIZE (move_mode);
7396   code = optab_handler (mov_optab, move_mode);
7397   gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7398 
7399   dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7400 
7401   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
7402   gcc_assert (size_to_move % piece_size == 0);
7403 
7404   for (i = 0; i < size_to_move; i += piece_size)
7405     {
7406       if (piece_size <= GET_MODE_SIZE (word_mode))
7407 	{
7408 	  emit_insn (gen_strset (destptr, dst, promoted_val));
7409 	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7410 					      piece_size);
7411 	  continue;
7412 	}
7413 
7414       emit_insn (GEN_FCN (code) (dst, promoted_val));
7415 
7416       emit_move_insn (destptr,
7417 		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
7418 
7419       dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7420 					  piece_size);
7421     }
7422 
7423   /* Update DST rtx.  */
7424   return dst;
7425 }
7426 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7427 static void
7428 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7429 				 rtx count, int max_size)
7430 {
7431   count = expand_simple_binop (counter_mode (count), AND, count,
7432 			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7433   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7434 				 gen_lowpart (QImode, value), count, QImode,
7435 				 1, max_size / 2, true);
7436 }
7437 
7438 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7439 static void
7440 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7441 			rtx count, int max_size)
7442 {
7443   rtx dest;
7444 
7445   if (CONST_INT_P (count))
7446     {
7447       HOST_WIDE_INT countval = INTVAL (count);
7448       HOST_WIDE_INT epilogue_size = countval % max_size;
7449       int i;
7450 
7451       /* For now MAX_SIZE should be a power of 2.  This assert could be
7452 	 relaxed, but it'll require a bit more complicated epilogue
7453 	 expanding.  */
7454       gcc_assert ((max_size & (max_size - 1)) == 0);
7455       for (i = max_size; i >= 1; i >>= 1)
7456 	{
7457 	  if (epilogue_size & i)
7458 	    {
7459 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7460 		destmem = emit_memset (destmem, destptr, vec_value, i);
7461 	      else
7462 		destmem = emit_memset (destmem, destptr, value, i);
7463 	    }
7464 	}
7465       return;
7466     }
7467   if (max_size > 32)
7468     {
7469       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7470       return;
7471     }
7472   if (max_size > 16)
7473     {
7474       rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7475       if (TARGET_64BIT)
7476 	{
7477 	  dest = change_address (destmem, DImode, destptr);
7478 	  emit_insn (gen_strset (destptr, dest, value));
7479 	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7480 	  emit_insn (gen_strset (destptr, dest, value));
7481 	}
7482       else
7483 	{
7484 	  dest = change_address (destmem, SImode, destptr);
7485 	  emit_insn (gen_strset (destptr, dest, value));
7486 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7487 	  emit_insn (gen_strset (destptr, dest, value));
7488 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7489 	  emit_insn (gen_strset (destptr, dest, value));
7490 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7491 	  emit_insn (gen_strset (destptr, dest, value));
7492 	}
7493       emit_label (label);
7494       LABEL_NUSES (label) = 1;
7495     }
7496   if (max_size > 8)
7497     {
7498       rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7499       if (TARGET_64BIT)
7500 	{
7501 	  dest = change_address (destmem, DImode, destptr);
7502 	  emit_insn (gen_strset (destptr, dest, value));
7503 	}
7504       else
7505 	{
7506 	  dest = change_address (destmem, SImode, destptr);
7507 	  emit_insn (gen_strset (destptr, dest, value));
7508 	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7509 	  emit_insn (gen_strset (destptr, dest, value));
7510 	}
7511       emit_label (label);
7512       LABEL_NUSES (label) = 1;
7513     }
7514   if (max_size > 4)
7515     {
7516       rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7517       dest = change_address (destmem, SImode, destptr);
7518       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7519       emit_label (label);
7520       LABEL_NUSES (label) = 1;
7521     }
7522   if (max_size > 2)
7523     {
7524       rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7525       dest = change_address (destmem, HImode, destptr);
7526       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7527       emit_label (label);
7528       LABEL_NUSES (label) = 1;
7529     }
7530   if (max_size > 1)
7531     {
7532       rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7533       dest = change_address (destmem, QImode, destptr);
7534       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7535       emit_label (label);
7536       LABEL_NUSES (label) = 1;
7537     }
7538 }
7539 
7540 /* Adjust COUNTER by the VALUE.  */
7541 static void
7542 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7543 {
7544   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7545 }
7546 
7547 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7548    DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
7549    Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7550    ignored.
7551    Return value is updated DESTMEM.  */
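/* A sketch of the emitted alignment prologue in C terms (illustrative only),
   e.g. for DESIRED_ALIGNMENT == 8 starting from ALIGN == 1:

     if (destptr & 1) { copy or set 1 byte;  count -= 1; }
     if (destptr & 2) { copy or set 2 bytes; count -= 2; }
     if (destptr & 4) { copy or set 4 bytes; count -= 4; }

   Each test is emitted with ix86_expand_aligntest, and after every step the
   known alignment of DESTMEM doubles.  */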
7552 
7553 static rtx
7554 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7555 				  rtx destptr, rtx srcptr, rtx value,
7556 				  rtx vec_value, rtx count, int align,
7557 				  int desired_alignment, bool issetmem)
7558 {
7559   int i;
7560   for (i = 1; i < desired_alignment; i <<= 1)
7561     {
7562       if (align <= i)
7563 	{
7564 	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7565 	  if (issetmem)
7566 	    {
7567 	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7568 		destmem = emit_memset (destmem, destptr, vec_value, i);
7569 	      else
7570 		destmem = emit_memset (destmem, destptr, value, i);
7571 	    }
7572 	  else
7573 	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7574 	  ix86_adjust_counter (count, i);
7575 	  emit_label (label);
7576 	  LABEL_NUSES (label) = 1;
7577 	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7578 	}
7579     }
7580   return destmem;
7581 }
7582 
7583 /* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
7584    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7585    and jump to DONE_LABEL.  */
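/* The trick in C terms (illustrative only): when the COUNT & SIZE bit is set,
   the remaining length is known to be in [SIZE, 2*SIZE), so two possibly
   overlapping SIZE-byte moves cover the whole block:

     if (count & size)
       {
	 copy or set SIZE bytes at destptr;		     -- head
	 copy or set SIZE bytes at destptr + count - size;   -- tail, may overlap
	 goto done_label;
       }
*/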
7586 static void
7587 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7588 			       rtx destptr, rtx srcptr,
7589 			       rtx value, rtx vec_value,
7590 			       rtx count, int size,
7591 			       rtx done_label, bool issetmem)
7592 {
7593   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7594   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7595   rtx modesize;
7596   int n;
7597 
7598   /* If we do not have vector value to copy, we must reduce size.  */
7599   if (issetmem)
7600     {
7601       if (!vec_value)
7602 	{
7603 	  if (GET_MODE (value) == VOIDmode && size > 8)
7604 	    mode = Pmode;
7605 	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7606 	    mode = GET_MODE (value);
7607 	}
7608       else
7609 	mode = GET_MODE (vec_value), value = vec_value;
7610     }
7611   else
7612     {
7613       /* Choose appropriate vector mode.  */
7614       if (size >= 32)
7615 	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7616       else if (size >= 16)
7617 	mode = TARGET_SSE ? V16QImode : DImode;
7618       srcmem = change_address (srcmem, mode, srcptr);
7619     }
7620   destmem = change_address (destmem, mode, destptr);
7621   modesize = GEN_INT (GET_MODE_SIZE (mode));
7622   gcc_assert (GET_MODE_SIZE (mode) <= size);
7623   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7624     {
7625       if (issetmem)
7626 	emit_move_insn (destmem, gen_lowpart (mode, value));
7627       else
7628 	{
7629           emit_move_insn (destmem, srcmem);
7630           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7631 	}
7632       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7633     }
7634 
7635   destmem = offset_address (destmem, count, 1);
7636   destmem = offset_address (destmem, GEN_INT (-2 * size),
7637 			    GET_MODE_SIZE (mode));
7638   if (!issetmem)
7639     {
7640       srcmem = offset_address (srcmem, count, 1);
7641       srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7642 			       GET_MODE_SIZE (mode));
7643     }
7644   for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7645     {
7646       if (issetmem)
7647 	emit_move_insn (destmem, gen_lowpart (mode, value));
7648       else
7649 	{
7650 	  emit_move_insn (destmem, srcmem);
7651 	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7652 	}
7653       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7654     }
7655   emit_jump_insn (gen_jump (done_label));
7656   emit_barrier ();
7657 
7658   emit_label (label);
7659   LABEL_NUSES (label) = 1;
7660 }
7661 
7662 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power
7663    of 2) and get ready for the main memcpy loop by copying the initial
7664    DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/
7665    COUNT so that we can proceed with a loop copying SIZE bytes at once, in MODE.
7666    DONE_LABEL is a label after the whole copying sequence. The label is created
7667    on demand if *DONE_LABEL is NULL.
7668    MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
7669    bounds after the initial copies.
7670 
7671    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7672    DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
7673    we will dispatch to a library call for large blocks.
7674 
7675    In pseudocode we do:
7676 
7677    if (COUNT < SIZE)
7678      {
7679        Assume that SIZE is 4. Bigger sizes are handled analogously
7680        if (COUNT & 4)
7681 	 {
7682 	    copy 4 bytes from SRCPTR to DESTPTR
7683 	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7684 	    goto done_label
7685 	 }
7686        if (!COUNT)
7687 	 goto done_label;
7688        copy 1 byte from SRCPTR to DESTPTR
7689        if (COUNT & 2)
7690 	 {
7691 	    copy 2 bytes from SRCPTR to DESTPTR
7692 	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7693 	 }
7694      }
7695    else
7696      {
7697        copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7698        copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7699 
7700        OLD_DESTPTR = DESTPTR;
7701        Align DESTPTR up to DESIRED_ALIGN
7702        SRCPTR += DESTPTR - OLD_DESTPTR
7703        COUNT -= DESTPTR - OLD_DESTPTR
7704        if (DYNAMIC_CHECK)
7705 	 Round COUNT down to multiple of SIZE
7706        << optional caller supplied zero size guard is here >>
7707        << optional caller supplied dynamic check is here >>
7708        << caller supplied main copy loop is here >>
7709      }
7710    done_label:
7711   */
7712 static void
7713 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7714 							    rtx *destptr, rtx *srcptr,
7715 							    machine_mode mode,
7716 							    rtx value, rtx vec_value,
7717 							    rtx *count,
7718 							    rtx_code_label **done_label,
7719 							    int size,
7720 							    int desired_align,
7721 							    int align,
7722 							    unsigned HOST_WIDE_INT *min_size,
7723 							    bool dynamic_check,
7724 							    bool issetmem)
7725 {
7726   rtx_code_label *loop_label = NULL, *label;
7727   int n;
7728   rtx modesize;
7729   int prolog_size = 0;
7730   rtx mode_value;
7731 
7732   /* Choose the proper value to copy.  */
7733   if (issetmem && VECTOR_MODE_P (mode))
7734     mode_value = vec_value;
7735   else
7736     mode_value = value;
7737   gcc_assert (GET_MODE_SIZE (mode) <= size);
7738 
7739   /* See if block is big or small, handle small blocks.  */
7740   if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7741     {
7742       int size2 = size;
7743       loop_label = gen_label_rtx ();
7744 
7745       if (!*done_label)
7746 	*done_label = gen_label_rtx ();
7747 
7748       emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7749 			       1, loop_label);
7750       size2 >>= 1;
7751 
7752       /* Handle sizes > 3.  */
7753       for (;size2 > 2; size2 >>= 1)
7754 	expand_small_cpymem_or_setmem (destmem, srcmem,
7755 				       *destptr, *srcptr,
7756 				       value, vec_value,
7757 				       *count,
7758 				       size2, *done_label, issetmem);
7759       /* Nothing to copy?  Jump to DONE_LABEL if so */
7760       emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7761 			       1, *done_label);
7762 
7763       /* Do a byte copy.  */
7764       destmem = change_address (destmem, QImode, *destptr);
7765       if (issetmem)
7766 	emit_move_insn (destmem, gen_lowpart (QImode, value));
7767       else
7768 	{
7769           srcmem = change_address (srcmem, QImode, *srcptr);
7770           emit_move_insn (destmem, srcmem);
7771 	}
7772 
7773       /* Handle sizes 2 and 3.  */
7774       label = ix86_expand_aligntest (*count, 2, false);
7775       destmem = change_address (destmem, HImode, *destptr);
7776       destmem = offset_address (destmem, *count, 1);
7777       destmem = offset_address (destmem, GEN_INT (-2), 2);
7778       if (issetmem)
7779         emit_move_insn (destmem, gen_lowpart (HImode, value));
7780       else
7781 	{
7782 	  srcmem = change_address (srcmem, HImode, *srcptr);
7783 	  srcmem = offset_address (srcmem, *count, 1);
7784 	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
7785 	  emit_move_insn (destmem, srcmem);
7786 	}
7787 
7788       emit_label (label);
7789       LABEL_NUSES (label) = 1;
7790       emit_jump_insn (gen_jump (*done_label));
7791       emit_barrier ();
7792     }
7793   else
7794     gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7795 		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7796 
7797   /* Start memcpy for COUNT >= SIZE.  */
7798   if (loop_label)
7799     {
7800        emit_label (loop_label);
7801        LABEL_NUSES (loop_label) = 1;
7802     }
7803 
7804   /* Copy first desired_align bytes.  */
7805   if (!issetmem)
7806     srcmem = change_address (srcmem, mode, *srcptr);
7807   destmem = change_address (destmem, mode, *destptr);
7808   modesize = GEN_INT (GET_MODE_SIZE (mode));
7809   for (n = 0; prolog_size < desired_align - align; n++)
7810     {
7811       if (issetmem)
7812         emit_move_insn (destmem, mode_value);
7813       else
7814 	{
7815           emit_move_insn (destmem, srcmem);
7816           srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7817 	}
7818       destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7819       prolog_size += GET_MODE_SIZE (mode);
7820     }
7821 
7822 
7823   /* Copy last SIZE bytes.  */
7824   destmem = offset_address (destmem, *count, 1);
7825   destmem = offset_address (destmem,
7826 			    GEN_INT (-size - prolog_size),
7827 			    1);
7828   if (issetmem)
7829     emit_move_insn (destmem, mode_value);
7830   else
7831     {
7832       srcmem = offset_address (srcmem, *count, 1);
7833       srcmem = offset_address (srcmem,
7834 			       GEN_INT (-size - prolog_size),
7835 			       1);
7836       emit_move_insn (destmem, srcmem);
7837     }
7838   for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
7839     {
7840       destmem = offset_address (destmem, modesize, 1);
7841       if (issetmem)
7842 	emit_move_insn (destmem, mode_value);
7843       else
7844 	{
7845           srcmem = offset_address (srcmem, modesize, 1);
7846           emit_move_insn (destmem, srcmem);
7847 	}
7848     }
7849 
7850   /* Align destination.  */
7851   if (desired_align > 1 && desired_align > align)
7852     {
7853       rtx saveddest = *destptr;
7854 
7855       gcc_assert (desired_align <= size);
7856       /* Align destptr up, place it into a new register.  */
7857       *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
7858 				      GEN_INT (prolog_size),
7859 				      NULL_RTX, 1, OPTAB_DIRECT);
7860       if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
7861 	REG_POINTER (*destptr) = 1;
7862       *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
7863 				      GEN_INT (-desired_align),
7864 				      *destptr, 1, OPTAB_DIRECT);
7865       /* See how many bytes we skipped.  */
7866       saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
7867 				       *destptr,
7868 				       saveddest, 1, OPTAB_DIRECT);
7869       /* Adjust srcptr and count.  */
7870       if (!issetmem)
7871 	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
7872 				       saveddest, *srcptr, 1, OPTAB_DIRECT);
7873       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7874 				    saveddest, *count, 1, OPTAB_DIRECT);
7875       /* We copied at most size + prolog_size.  */
7876       if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
7877 	*min_size
7878 	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
7879       else
7880 	*min_size = 0;
7881 
7882       /* Our loops always round down the block size, but for dispatch to
7883          a library call we need the precise value.  */
7884       if (dynamic_check)
7885 	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
7886 				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
7887     }
7888   else
7889     {
7890       gcc_assert (prolog_size == 0);
7891       /* Decrease count, so we won't end up copying last word twice.  */
7892       if (!CONST_INT_P (*count))
7893 	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7894 				      constm1_rtx, *count, 1, OPTAB_DIRECT);
7895       else
7896 	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
7897 				      (unsigned HOST_WIDE_INT)size));
7898       if (*min_size)
7899 	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
7900     }
7901 }
7902 
7903 
7904 /* This function is like the previous one, except here we know how many bytes
7905    need to be copied.  That allows us to update alignment not only of DST, which
7906    is returned, but also of SRC, which is passed as a pointer for that
7907    reason.  */
7908 static rtx
7909 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
7910 					   rtx srcreg, rtx value, rtx vec_value,
7911 					   int desired_align, int align_bytes,
7912 					   bool issetmem)
7913 {
7914   rtx src = NULL;
7915   rtx orig_dst = dst;
7916   rtx orig_src = NULL;
7917   int piece_size = 1;
7918   int copied_bytes = 0;
7919 
7920   if (!issetmem)
7921     {
7922       gcc_assert (srcp != NULL);
7923       src = *srcp;
7924       orig_src = src;
7925     }
7926 
7927   for (piece_size = 1;
7928        piece_size <= desired_align && copied_bytes < align_bytes;
7929        piece_size <<= 1)
7930     {
7931       if (align_bytes & piece_size)
7932 	{
7933 	  if (issetmem)
7934 	    {
7935 	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
7936 		dst = emit_memset (dst, destreg, vec_value, piece_size);
7937 	      else
7938 		dst = emit_memset (dst, destreg, value, piece_size);
7939 	    }
7940 	  else
7941 	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
7942 	  copied_bytes += piece_size;
7943 	}
7944     }
7945   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
7946     set_mem_align (dst, desired_align * BITS_PER_UNIT);
7947   if (MEM_SIZE_KNOWN_P (orig_dst))
7948     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
7949 
7950   if (!issetmem)
7951     {
7952       int src_align_bytes = get_mem_align_offset (src, desired_align
7953 						       * BITS_PER_UNIT);
7954       if (src_align_bytes >= 0)
7955 	src_align_bytes = desired_align - src_align_bytes;
7956       if (src_align_bytes >= 0)
7957 	{
7958 	  unsigned int src_align;
7959 	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
7960 	    {
7961 	      if ((src_align_bytes & (src_align - 1))
7962 		   == (align_bytes & (src_align - 1)))
7963 		break;
7964 	    }
7965 	  if (src_align > (unsigned int) desired_align)
7966 	    src_align = desired_align;
7967 	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
7968 	    set_mem_align (src, src_align * BITS_PER_UNIT);
7969 	}
7970       if (MEM_SIZE_KNOWN_P (orig_src))
7971 	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
7972       *srcp = src;
7973     }
7974 
7975   return dst;
7976 }
7977 
7978 /* Return true if ALG can be used in current context.
7979    Assume we expand memset if MEMSET is true.  */
7980 static bool
7981 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
7982 {
7983   if (alg == no_stringop)
7984     return false;
7985   if (alg == vector_loop)
7986     return TARGET_SSE || TARGET_AVX;
7987   /* Algorithms using the rep prefix want at least edi and ecx;
7988      additionally, memset wants eax and memcpy wants esi.  Don't
7989      consider such algorithms if the user has appropriated those
7990      registers for their own purposes, or if we have a non-default
7991      address space, since some string insns cannot override the segment.  */
7992   if (alg == rep_prefix_1_byte
7993       || alg == rep_prefix_4_byte
7994       || alg == rep_prefix_8_byte)
7995     {
7996       if (have_as)
7997 	return false;
7998       if (fixed_regs[CX_REG]
7999 	  || fixed_regs[DI_REG]
8000 	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8001 	return false;
8002     }
8003   return true;
8004 }
8005 
8006 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
8007 static enum stringop_alg
8008 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8009 	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8010 	    bool memset, bool zero_memset, bool have_as,
8011 	    int *dynamic_check, bool *noalign, bool recur)
8012 {
8013   const struct stringop_algs *algs;
8014   bool optimize_for_speed;
8015   int max = 0;
8016   const struct processor_costs *cost;
8017   int i;
8018   bool any_alg_usable_p = false;
8019 
8020   *noalign = false;
8021   *dynamic_check = -1;
8022 
8023   /* Even if the string operation call is cold, we still might spend a lot
8024      of time processing large blocks.  */
8025   if (optimize_function_for_size_p (cfun)
8026       || (optimize_insn_for_size_p ()
8027  	  && (max_size < 256
8028               || (expected_size != -1 && expected_size < 256))))
8029     optimize_for_speed = false;
8030   else
8031     optimize_for_speed = true;
8032 
8033   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8034   if (memset)
8035     algs = &cost->memset[TARGET_64BIT != 0];
8036   else
8037     algs = &cost->memcpy[TARGET_64BIT != 0];
8038 
8039   /* See maximal size for user defined algorithm.  */
8040   for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8041     {
8042       enum stringop_alg candidate = algs->size[i].alg;
8043       bool usable = alg_usable_p (candidate, memset, have_as);
8044       any_alg_usable_p |= usable;
8045 
8046       if (candidate != libcall && candidate && usable)
8047 	max = algs->size[i].max;
8048     }
8049 
8050   /* If the expected size is not known but the max size is small enough
8051      that the inline version is a win, set the expected size into
8052      the range.  */
8053   if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8054       && expected_size == -1)
8055     expected_size = min_size / 2 + max_size / 2;
8056 
8057   /* If user specified the algorithm, honor it if possible.  */
8058   if (ix86_stringop_alg != no_stringop
8059       && alg_usable_p (ix86_stringop_alg, memset, have_as))
8060     return ix86_stringop_alg;
8061   /* rep; movq or rep; movl is the smallest variant.  */
8062   else if (!optimize_for_speed)
8063     {
8064       *noalign = true;
8065       if (!count || (count & 3) || (memset && !zero_memset))
8066 	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8067 	       ? rep_prefix_1_byte : loop_1_byte;
8068       else
8069 	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8070 	       ? rep_prefix_4_byte : loop;
8071     }
8072   /* Very tiny blocks are best handled via the loop; REP is expensive to
8073      set up.  */
8074   else if (expected_size != -1 && expected_size < 4)
8075     return loop_1_byte;
8076   else if (expected_size != -1)
8077     {
8078       enum stringop_alg alg = libcall;
8079       bool alg_noalign = false;
8080       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8081 	{
8082 	  /* We get here if the algorithms that were not libcall-based
8083 	     were rep-prefix based and we are unable to use rep prefixes
8084 	     based on global register usage.  Break out of the loop and
8085 	     use the heuristic below.  */
8086 	  if (algs->size[i].max == 0)
8087 	    break;
8088 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8089 	    {
8090 	      enum stringop_alg candidate = algs->size[i].alg;
8091 
8092 	      if (candidate != libcall
8093 		  && alg_usable_p (candidate, memset, have_as))
8094 		{
8095 		  alg = candidate;
8096 		  alg_noalign = algs->size[i].noalign;
8097 		}
8098 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8099 		 last non-libcall inline algorithm.  */
8100 	      if (TARGET_INLINE_ALL_STRINGOPS)
8101 		{
8102 		  /* When the current size is best to be copied by a libcall,
8103 		     but we are still forced to inline, run the heuristic below
8104 		     that will pick code for medium sized blocks.  */
8105 		  if (alg != libcall)
8106 		    {
8107 		      *noalign = alg_noalign;
8108 		      return alg;
8109 		    }
8110 		  else if (!any_alg_usable_p)
8111 		    break;
8112 		}
8113 	      else if (alg_usable_p (candidate, memset, have_as)
8114 		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8115 			    && candidate == rep_prefix_1_byte
8116 			    /* NB: If min_size != max_size, size is
8117 			       unknown.  */
8118 			    && min_size != max_size))
8119 		{
8120 		  *noalign = algs->size[i].noalign;
8121 		  return candidate;
8122 		}
8123 	    }
8124 	}
8125     }
8126   /* When asked to inline the call anyway, try to pick a meaningful choice.
8127      We look for the maximal size of block that is faster to copy by hand and
8128      take blocks of at most that size, guessing that the average size will
8129      be roughly half of the maximum.
8130 
8131      If this turns out to be bad, we might simply specify the preferred
8132      choice in ix86_costs.  */
8133   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8134       && (algs->unknown_size == libcall
8135 	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
8136     {
8137       enum stringop_alg alg;
8138       HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8139 
8140       /* If there aren't any usable algorithms or if recursing already,
8141 	 then recursing on smaller sizes or same size isn't going to
8142 	 find anything.  Just return the simple byte-at-a-time copy loop.  */
8143       if (!any_alg_usable_p || recur)
8144 	{
8145 	  /* Pick something reasonable.  */
8146 	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8147 	    *dynamic_check = 128;
8148 	  return loop_1_byte;
8149 	}
8150       alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8151 			zero_memset, have_as, dynamic_check, noalign, true);
8152       gcc_assert (*dynamic_check == -1);
8153       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8154 	*dynamic_check = max;
8155       else
8156 	gcc_assert (alg != libcall);
8157       return alg;
8158     }
8159   return (alg_usable_p (algs->unknown_size, memset, have_as)
8160 	  ? algs->unknown_size : libcall);
8161 }
8162 
8163 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
8164    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
8165 static int
8166 decide_alignment (int align,
8167 		  enum stringop_alg alg,
8168 		  int expected_size,
8169 		  machine_mode move_mode)
8170 {
8171   int desired_align = 0;
8172 
8173   gcc_assert (alg != no_stringop);
8174 
8175   if (alg == libcall)
8176     return 0;
8177   if (move_mode == VOIDmode)
8178     return 0;
8179 
8180   desired_align = GET_MODE_SIZE (move_mode);
8181   /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
8182      copying a whole cache line at once.  */
8183   if (TARGET_CPU_P (PENTIUMPRO)
8184       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8185     desired_align = 8;
8186 
8187   if (optimize_size)
8188     desired_align = 1;
8189   if (desired_align < align)
8190     desired_align = align;
8191   if (expected_size != -1 && expected_size < 4)
8192     desired_align = align;
8193 
8194   return desired_align;
8195 }
8196 
8197 
8198 /* Helper function for memset.  For a QImode value 0xXY produce
8199    0xXYXYXYXY of the width specified by MODE.  This is essentially
8200    a multiplication by 0x01010101, but we can do slightly better than
8201    synth_mult by unwinding the sequence by hand on CPUs with
8202    slow multiply.  */
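/* Worked example (illustrative only): for VAL == 0xAB and SImode the result
   is 0xABABABAB, either as reg * 0x01010101 or, unwound by hand when
   multiply is slow,

     reg |= reg << 8;	  -- 0x0000ABAB
     reg |= reg << 16;	  -- 0xABABABAB
     reg |= reg << 32;	  -- DImode only, giving 0xABABABABABABABAB
*/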
8203 static rtx
8204 promote_duplicated_reg (machine_mode mode, rtx val)
8205 {
8206   machine_mode valmode = GET_MODE (val);
8207   rtx tmp;
8208   int nops = mode == DImode ? 3 : 2;
8209 
8210   gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8211   if (val == const0_rtx)
8212     return copy_to_mode_reg (mode, CONST0_RTX (mode));
8213   if (CONST_INT_P (val))
8214     {
8215       HOST_WIDE_INT v = INTVAL (val) & 255;
8216 
8217       v |= v << 8;
8218       v |= v << 16;
8219       if (mode == DImode)
8220         v |= (v << 16) << 16;
8221       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8222     }
8223 
8224   if (valmode == VOIDmode)
8225     valmode = QImode;
8226   if (valmode != QImode)
8227     val = gen_lowpart (QImode, val);
8228   if (mode == QImode)
8229     return val;
8230   if (!TARGET_PARTIAL_REG_STALL)
8231     nops--;
8232   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8233       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8234       <= (ix86_cost->shift_const + ix86_cost->add) * nops
8235           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8236     {
8237       rtx reg = convert_modes (mode, QImode, val, true);
8238       tmp = promote_duplicated_reg (mode, const1_rtx);
8239       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8240 				  OPTAB_DIRECT);
8241     }
8242   else
8243     {
8244       rtx reg = convert_modes (mode, QImode, val, true);
8245 
8246       if (!TARGET_PARTIAL_REG_STALL)
8247 	emit_insn (gen_insv_1 (mode, reg, reg));
8248       else
8249 	{
8250 	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8251 				     NULL, 1, OPTAB_DIRECT);
8252 	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8253 				     OPTAB_DIRECT);
8254 	}
8255       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8256 			         NULL, 1, OPTAB_DIRECT);
8257       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8258       if (mode == SImode)
8259 	return reg;
8260       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8261 				 NULL, 1, OPTAB_DIRECT);
8262       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8263       return reg;
8264     }
8265 }
8266 
8267 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
8268    will be needed by the main loop copying SIZE_NEEDED chunks and by the
8269    prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
8270 static rtx
8271 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8272 				int align)
8273 {
8274   rtx promoted_val;
8275 
8276   if (TARGET_64BIT
8277       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8278     promoted_val = promote_duplicated_reg (DImode, val);
8279   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8280     promoted_val = promote_duplicated_reg (SImode, val);
8281   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8282     promoted_val = promote_duplicated_reg (HImode, val);
8283   else
8284     promoted_val = val;
8285 
8286   return promoted_val;
8287 }
8288 
8289 /* Copy the address to a Pmode register.  This is used for x32 to
8290    truncate DImode TLS address to a SImode register. */
8291 
8292 static rtx
8293 ix86_copy_addr_to_reg (rtx addr)
8294 {
8295   rtx reg;
8296   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8297     {
8298       reg = copy_addr_to_reg (addr);
8299       REG_POINTER (reg) = 1;
8300       return reg;
8301     }
8302   else
8303     {
8304       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8305       reg = copy_to_mode_reg (DImode, addr);
8306       REG_POINTER (reg) = 1;
8307       return gen_rtx_SUBREG (SImode, reg, 0);
8308     }
8309 }
8310 
8311 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
8312    operations when profitable.  The code depends upon architecture, block size
8313    and alignment, but always has one of the following overall structures:
8314 
8315    Aligned move sequence:
8316 
8317      1) Prologue guard: Conditional that jumps up to epilogues for small
8318 	blocks that can be handled by epilogue alone.  This is faster
8319 	but also needed for correctness, since the prologue assumes the block
8320 	is larger than the desired alignment.
8321 
8322 	Optional dynamic check for size and libcall for large
8323 	blocks is emitted here too, with -minline-stringops-dynamically.
8324 
8325      2) Prologue: copy first few bytes in order to get destination
8326 	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
8327 	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8328 	copied.  We emit either a jump tree on power of two sized
8329 	blocks, or a byte loop.
8330 
8331      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8332 	with specified algorithm.
8333 
8334      4) Epilogue: code copying tail of the block that is too small to be
8335 	handled by main body (or up to size guarded by prologue guard).
8336 
8337   Misaligned move sequence:
8338 
8339      1) Misaligned move prologue/epilogue containing:
8340         a) Prologue handling small memory blocks and jumping to done_label
8341 	   (skipped if blocks are known to be large enough)
8342 	b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
8343            is needed, done by one possibly misaligned move
8344 	   (skipped if alignment is not needed)
8345         c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
8346 
8347      2) Zero size guard dispatching to done_label, if needed
8348 
8349      3) Dispatch to library call, if needed,
8350 
8351      4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8352 	with the specified algorithm.  */
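
/* As a rough sketch (not the literal emitted RTL), the aligned sequence for
   an N-byte operation with SIZE_NEEDED-byte chunks corresponds to:

     1) if (N < epilogue_size_needed) goto epilogue;
     2) copy/store bytes until DST reaches DESIRED_ALIGN alignment;
     3) while (at least SIZE_NEEDED bytes remain)
	  copy/store SIZE_NEEDED bytes;
     4) epilogue: copy/store the remaining tail bytes;  */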
8353 bool
8354 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8355 			   rtx align_exp, rtx expected_align_exp,
8356 			   rtx expected_size_exp, rtx min_size_exp,
8357 			   rtx max_size_exp, rtx probable_max_size_exp,
8358 			   bool issetmem)
8359 {
8360   rtx destreg;
8361   rtx srcreg = NULL;
8362   rtx_code_label *label = NULL;
8363   rtx tmp;
8364   rtx_code_label *jump_around_label = NULL;
8365   HOST_WIDE_INT align = 1;
8366   unsigned HOST_WIDE_INT count = 0;
8367   HOST_WIDE_INT expected_size = -1;
8368   int size_needed = 0, epilogue_size_needed;
8369   int desired_align = 0, align_bytes = 0;
8370   enum stringop_alg alg;
8371   rtx promoted_val = NULL;
8372   rtx vec_promoted_val = NULL;
8373   bool force_loopy_epilogue = false;
8374   int dynamic_check;
8375   bool need_zero_guard = false;
8376   bool noalign;
8377   machine_mode move_mode = VOIDmode;
8378   machine_mode wider_mode;
8379   int unroll_factor = 1;
8380   /* TODO: Once value ranges are available, fill in proper data.  */
8381   unsigned HOST_WIDE_INT min_size = 0;
8382   unsigned HOST_WIDE_INT max_size = -1;
8383   unsigned HOST_WIDE_INT probable_max_size = -1;
8384   bool misaligned_prologue_used = false;
8385   bool have_as;
8386 
8387   if (CONST_INT_P (align_exp))
8388     align = INTVAL (align_exp);
8389   /* i386 can do misaligned access at a reasonably increased cost.  */
8390   if (CONST_INT_P (expected_align_exp)
8391       && INTVAL (expected_align_exp) > align)
8392     align = INTVAL (expected_align_exp);
8393   /* ALIGN is the minimum of destination and source alignment, but we care here
8394      just about destination alignment.  */
8395   else if (!issetmem
8396 	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8397     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8398 
8399   if (CONST_INT_P (count_exp))
8400     {
8401       min_size = max_size = probable_max_size = count = expected_size
8402 	= INTVAL (count_exp);
8403       /* When COUNT is 0, there is nothing to do.  */
8404       if (!count)
8405 	return true;
8406     }
8407   else
8408     {
8409       if (min_size_exp)
8410 	min_size = INTVAL (min_size_exp);
8411       if (max_size_exp)
8412 	max_size = INTVAL (max_size_exp);
8413       if (probable_max_size_exp)
8414 	probable_max_size = INTVAL (probable_max_size_exp);
8415       if (CONST_INT_P (expected_size_exp))
8416 	expected_size = INTVAL (expected_size_exp);
8417      }
8418 
8419   /* Make sure we don't need to care about overflow later on.  */
8420   if (count > (HOST_WIDE_INT_1U << 30))
8421     return false;
8422 
8423   have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8424   if (!issetmem)
8425     have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8426 
8427   /* Step 0: Decide on preferred algorithm, desired alignment and
8428      size of chunks to be copied by main loop.  */
8429   alg = decide_alg (count, expected_size, min_size, probable_max_size,
8430 		    issetmem,
8431 		    issetmem && val_exp == const0_rtx, have_as,
8432 		    &dynamic_check, &noalign, false);
8433 
8434   if (dump_file)
8435     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8436 	     stringop_alg_names[alg]);
8437 
8438   if (alg == libcall)
8439     return false;
8440   gcc_assert (alg != no_stringop);
8441 
8442   /* For now the vector version of memset is generated only for memory zeroing,
8443      as creating the promoted vector value is very cheap in this case.  */
8444   if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8445     alg = unrolled_loop;
8446 
8447   if (!count)
8448     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8449   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8450   if (!issetmem)
8451     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8452 
8453   unroll_factor = 1;
8454   move_mode = word_mode;
8455   switch (alg)
8456     {
8457     case libcall:
8458     case no_stringop:
8459     case last_alg:
8460       gcc_unreachable ();
8461     case loop_1_byte:
8462       need_zero_guard = true;
8463       move_mode = QImode;
8464       break;
8465     case loop:
8466       need_zero_guard = true;
8467       break;
8468     case unrolled_loop:
8469       need_zero_guard = true;
8470       unroll_factor = (TARGET_64BIT ? 4 : 2);
8471       break;
8472     case vector_loop:
8473       need_zero_guard = true;
8474       unroll_factor = 4;
8475       /* Find the widest supported mode.  */
8476       move_mode = word_mode;
8477       while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8478 	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8479 	move_mode = wider_mode;
8480 
8481       if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8482 	move_mode = TImode;
8483       if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8484 	move_mode = OImode;
8485 
8486       /* Find the corresponding vector mode with the same size as MOVE_MODE.
8487 	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
8488       if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8489 	{
8490 	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8491 	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8492 	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8493 	    move_mode = word_mode;
8494 	}
8495       gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8496       break;
8497     case rep_prefix_8_byte:
8498       move_mode = DImode;
8499       break;
8500     case rep_prefix_4_byte:
8501       move_mode = SImode;
8502       break;
8503     case rep_prefix_1_byte:
8504       move_mode = QImode;
8505       break;
8506     }
8507   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8508   epilogue_size_needed = size_needed;
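  /* E.g. the unrolled_loop strategy on a 64-bit target moves word_mode
     (DImode) chunks with an unroll factor of 4, so SIZE_NEEDED is 32 bytes
     per main-loop iteration.  */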
8509 
8510   /* If we are going to call any library calls conditionally, make sure any
8511      pending stack adjustment happen before the first conditional branch,
8512      otherwise they will be emitted before the library call only and won't
8513      happen from the other branches.  */
8514   if (dynamic_check != -1)
8515     do_pending_stack_adjust ();
8516 
8517   desired_align = decide_alignment (align, alg, expected_size, move_mode);
8518   if (!TARGET_ALIGN_STRINGOPS || noalign)
8519     align = desired_align;
8520 
8521   /* Step 1: Prologue guard.  */
8522 
8523   /* Alignment code needs count to be in register.  */
8524   if (CONST_INT_P (count_exp) && desired_align > align)
8525     {
8526       if (INTVAL (count_exp) > desired_align
8527 	  && INTVAL (count_exp) > size_needed)
8528 	{
8529 	  align_bytes
8530 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8531 	  if (align_bytes <= 0)
8532 	    align_bytes = 0;
8533 	  else
8534 	    align_bytes = desired_align - align_bytes;
8535 	}
8536       if (align_bytes == 0)
8537 	count_exp = force_reg (counter_mode (count_exp), count_exp);
8538     }
8539   gcc_assert (desired_align >= 1 && align >= 1);
8540 
8541   /* Misaligned move sequences handle both prologue and epilogue at once.
8542      Default code generation results in smaller code for large alignments
8543      and also avoids redundant work when sizes are known precisely.  */
8544   misaligned_prologue_used
8545     = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8546        && MAX (desired_align, epilogue_size_needed) <= 32
8547        && desired_align <= epilogue_size_needed
8548        && ((desired_align > align && !align_bytes)
8549 	   || (!count && epilogue_size_needed > 1)));
8550 
8551   /* Do the cheap promotion to allow better CSE across the
8552      main loop and epilogue (i.e. one load of the big constant in
8553      front of all the code).
8554      For now the misaligned move sequences do not have a fast path
8555      without broadcasting.  */
8556   if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8557     {
8558       if (alg == vector_loop)
8559 	{
8560 	  gcc_assert (val_exp == const0_rtx);
8561 	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8562 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
8563 							 GET_MODE_SIZE (word_mode),
8564 							 desired_align, align);
8565 	}
8566       else
8567 	{
8568 	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8569 							 desired_align, align);
8570 	}
8571     }
8572   /* Misaligned move sequences handle both prologues and epilogues at once.
8573      Default code generation results in smaller code for large alignments and
8574      also avoids redundant work when sizes are known precisely.  */
8575   if (misaligned_prologue_used)
8576     {
8577       /* The misaligned move prologue handles small blocks by itself.  */
8578       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8579 	   (dst, src, &destreg, &srcreg,
8580 	    move_mode, promoted_val, vec_promoted_val,
8581 	    &count_exp,
8582 	    &jump_around_label,
8583             desired_align < align
8584 	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8585 	    desired_align, align, &min_size, dynamic_check, issetmem);
8586       if (!issetmem)
8587         src = change_address (src, BLKmode, srcreg);
8588       dst = change_address (dst, BLKmode, destreg);
8589       set_mem_align (dst, desired_align * BITS_PER_UNIT);
8590       epilogue_size_needed = 0;
8591       if (need_zero_guard
8592 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
8593 	{
8594 	  /* It is possible that we copied enough so the main loop will not
8595 	     execute.  */
8596 	  gcc_assert (size_needed > 1);
8597 	  if (jump_around_label == NULL_RTX)
8598 	    jump_around_label = gen_label_rtx ();
8599 	  emit_cmp_and_jump_insns (count_exp,
8600 				   GEN_INT (size_needed),
8601 				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8602 	  if (expected_size == -1
8603 	      || expected_size < (desired_align - align) / 2 + size_needed)
8604 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
8605 	  else
8606 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
8607 	}
8608     }
8609   /* Ensure that alignment prologue won't copy past end of block.  */
8610   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8611     {
8612       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8613       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8614 	 Make sure it is a power of 2.  */
8615       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
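      /* E.g. SIZE_NEEDED == 32 with DESIRED_ALIGN == ALIGN gives 31 above,
	 which is rounded up to 32 here.  */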
8616 
8617       /* To improve performance of small blocks, we jump around the VAL
8618 	 promoting code.  This means that if the promoted VAL is not constant,
8619 	 we might not use it in the epilogue and have to use the byte
8620 	 loop variant.  */
8621       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8622 	force_loopy_epilogue = true;
8623       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8624 	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8625 	{
8626 	  /* If the main algorithm works on QImode, no epilogue is needed.
8627 	     For small sizes just don't align anything.  */
8628 	  if (size_needed == 1)
8629 	    desired_align = align;
8630 	  else
8631 	    goto epilogue;
8632 	}
8633       else if (!count
8634 	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8635 	{
8636 	  label = gen_label_rtx ();
8637 	  emit_cmp_and_jump_insns (count_exp,
8638 				   GEN_INT (epilogue_size_needed),
8639 				   LTU, 0, counter_mode (count_exp), 1, label);
8640 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
8641 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
8642 	  else
8643 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
8644 	}
8645     }
8646 
8647   /* Emit code to decide on runtime whether library call or inline should be
8648      used.  */
8649   if (dynamic_check != -1)
8650     {
8651       if (!issetmem && CONST_INT_P (count_exp))
8652 	{
8653 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8654 	    {
8655 	      emit_block_copy_via_libcall (dst, src, count_exp);
8656 	      count_exp = const0_rtx;
8657 	      goto epilogue;
8658 	    }
8659 	}
8660       else
8661 	{
8662 	  rtx_code_label *hot_label = gen_label_rtx ();
8663 	  if (jump_around_label == NULL_RTX)
8664 	    jump_around_label = gen_label_rtx ();
8665 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8666 				   LEU, 0, counter_mode (count_exp),
8667 				   1, hot_label);
8668 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
8669 	  if (issetmem)
8670 	    set_storage_via_libcall (dst, count_exp, val_exp);
8671 	  else
8672 	    emit_block_copy_via_libcall (dst, src, count_exp);
8673 	  emit_jump (jump_around_label);
8674 	  emit_label (hot_label);
8675 	}
8676     }
8677 
8678   /* Step 2: Alignment prologue.  */
8679   /* Do the expensive promotion once we branched off the small blocks.  */
8680   if (issetmem && !promoted_val)
8681     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8682 						   desired_align, align);
8683 
8684   if (desired_align > align && !misaligned_prologue_used)
8685     {
8686       if (align_bytes == 0)
8687 	{
8688 	  /* Except for the first move in the prologue, we no longer know
8689 	     the constant offset in the aliasing info.  It doesn't seem worth
8690 	     the pain to maintain it for the first move, so throw away
8691 	     the info early.  */
8692 	  dst = change_address (dst, BLKmode, destreg);
8693 	  if (!issetmem)
8694 	    src = change_address (src, BLKmode, srcreg);
8695 	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8696 					    promoted_val, vec_promoted_val,
8697 					    count_exp, align, desired_align,
8698 					    issetmem);
8699 	  /* At most desired_align - align bytes are copied.  */
8700 	  if (min_size < (unsigned)(desired_align - align))
8701 	    min_size = 0;
8702 	  else
8703 	    min_size -= desired_align - align;
8704 	}
8705       else
8706 	{
8707 	  /* If we know how many bytes need to be stored before dst is
8708 	     sufficiently aligned, maintain aliasing info accurately.  */
8709 	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8710 							   srcreg,
8711 							   promoted_val,
8712 							   vec_promoted_val,
8713 							   desired_align,
8714 							   align_bytes,
8715 							   issetmem);
8716 
8717 	  count_exp = plus_constant (counter_mode (count_exp),
8718 				     count_exp, -align_bytes);
8719 	  count -= align_bytes;
8720 	  min_size -= align_bytes;
8721 	  max_size -= align_bytes;
8722 	}
8723       if (need_zero_guard
8724 	  && min_size < (unsigned HOST_WIDE_INT) size_needed
8725 	  && (count < (unsigned HOST_WIDE_INT) size_needed
8726 	      || (align_bytes == 0
8727 		  && count < ((unsigned HOST_WIDE_INT) size_needed
8728 			      + desired_align - align))))
8729 	{
8730 	  /* It is possible that we copied enough so the main loop will not
8731 	     execute.  */
8732 	  gcc_assert (size_needed > 1);
8733 	  if (label == NULL_RTX)
8734 	    label = gen_label_rtx ();
8735 	  emit_cmp_and_jump_insns (count_exp,
8736 				   GEN_INT (size_needed),
8737 				   LTU, 0, counter_mode (count_exp), 1, label);
8738 	  if (expected_size == -1
8739 	      || expected_size < (desired_align - align) / 2 + size_needed)
8740 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
8741 	  else
8742 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
8743 	}
8744     }
8745   if (label && size_needed == 1)
8746     {
8747       emit_label (label);
8748       LABEL_NUSES (label) = 1;
8749       label = NULL;
8750       epilogue_size_needed = 1;
8751       if (issetmem)
8752 	promoted_val = val_exp;
8753     }
8754   else if (label == NULL_RTX && !misaligned_prologue_used)
8755     epilogue_size_needed = size_needed;
8756 
8757   /* Step 3: Main loop.  */
8758 
8759   switch (alg)
8760     {
8761     case libcall:
8762     case no_stringop:
8763     case last_alg:
8764       gcc_unreachable ();
8765     case loop_1_byte:
8766     case loop:
8767     case unrolled_loop:
8768       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8769 				     count_exp, move_mode, unroll_factor,
8770 				     expected_size, issetmem);
8771       break;
8772     case vector_loop:
8773       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8774 				     vec_promoted_val, count_exp, move_mode,
8775 				     unroll_factor, expected_size, issetmem);
8776       break;
8777     case rep_prefix_8_byte:
8778     case rep_prefix_4_byte:
8779     case rep_prefix_1_byte:
8780       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
8781 				       val_exp, count_exp, move_mode, issetmem);
8782       break;
8783     }
8784   /* Properly adjust the offset of the src and dest memory for aliasing.  */
8785   if (CONST_INT_P (count_exp))
8786     {
8787       if (!issetmem)
8788 	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8789 					    (count / size_needed) * size_needed);
8790       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8791 					  (count / size_needed) * size_needed);
8792     }
8793   else
8794     {
8795       if (!issetmem)
8796 	src = change_address (src, BLKmode, srcreg);
8797       dst = change_address (dst, BLKmode, destreg);
8798     }
8799 
8800   /* Step 4: Epilogue to copy the remaining bytes.  */
8801  epilogue:
8802   if (label)
8803     {
8804       /* When the main loop is done, COUNT_EXP might hold the original count,
8805 	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8806 	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8807 	 bytes. Compensate if needed.  */
8808 
8809       if (size_needed < epilogue_size_needed)
8810 	{
8811 	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8812 				     GEN_INT (size_needed - 1), count_exp, 1,
8813 				     OPTAB_DIRECT);
8814 	  if (tmp != count_exp)
8815 	    emit_move_insn (count_exp, tmp);
8816 	}
8817       emit_label (label);
8818       LABEL_NUSES (label) = 1;
8819     }
8820 
8821   if (count_exp != const0_rtx && epilogue_size_needed > 1)
8822     {
8823       if (force_loopy_epilogue)
8824 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
8825 					 epilogue_size_needed);
8826       else
8827 	{
8828 	  if (issetmem)
8829 	    expand_setmem_epilogue (dst, destreg, promoted_val,
8830 				    vec_promoted_val, count_exp,
8831 				    epilogue_size_needed);
8832 	  else
8833 	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
8834 				    epilogue_size_needed);
8835 	}
8836     }
8837   if (jump_around_label)
8838     emit_label (jump_around_label);
8839   return true;
8840 }
8841 
8842 /* Expand cmpstrn or memcmp.  */
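/* When the inline expansion is used, the emitted sequence is essentially a
   "repz cmpsb" over a copy of the length, followed by a cmpintqi pattern
   that turns the resulting flags into a negative/zero/positive value that
   is sign-extended into RESULT.  */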
8843 
8844 bool
8845 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
8846 			       rtx length, rtx align, bool is_cmpstrn)
8847 {
8848   /* Expand strncmp and memcmp only with -minline-all-stringops since
8849      "repz cmpsb" can be much slower than strncmp and memcmp functions
8850      implemented with vector instructions, see
8851 
8852      https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
8853    */
8854   if (!TARGET_INLINE_ALL_STRINGOPS)
8855     return false;
8856 
8857   /* Can't use this if the user has appropriated ecx, esi or edi.  */
8858   if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
8859     return false;
8860 
8861   if (is_cmpstrn)
8862     {
8863       /* For strncmp, length is the maximum length, which can be larger
8864 	 than actual string lengths.  We can expand the cmpstrn pattern
8865 	 to "repz cmpsb" only if one of the strings is a constant so
8866 	 that expand_builtin_strncmp() can write the length argument to
8867 	 be the minimum of the const string length and the actual length
8868 	 argument.  Otherwise, "repz cmpsb" may run past the terminating 0 byte.  */
8869       tree t1 = MEM_EXPR (src1);
8870       tree t2 = MEM_EXPR (src2);
8871       if (!((t1 && TREE_CODE (t1) == MEM_REF
8872 	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
8873 	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
8874 		 == STRING_CST))
8875 	    || (t2 && TREE_CODE (t2) == MEM_REF
8876 		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
8877 		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
8878 		    == STRING_CST))))
8879 	return false;
8880     }
8881 
8882   rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
8883   rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
8884   if (addr1 != XEXP (src1, 0))
8885     src1 = replace_equiv_address_nv (src1, addr1);
8886   if (addr2 != XEXP (src2, 0))
8887     src2 = replace_equiv_address_nv (src2, addr2);
8888 
8889   /* NB: Make a copy of the data length to avoid changing the original
8890      data length by cmpstrnqi patterns.  */
8891   length = ix86_zero_extend_to_Pmode (length);
8892   rtx lengthreg = gen_reg_rtx (Pmode);
8893   emit_move_insn (lengthreg, length);
8894 
8895   /* If we are testing strict equality, we can use known alignment to
8896      good advantage.  This may be possible with combine, particularly
8897      once cc0 is dead.  */
8898   if (CONST_INT_P (length))
8899     {
8900       if (length == const0_rtx)
8901 	{
8902 	  emit_move_insn (result, const0_rtx);
8903 	  return true;
8904 	}
8905       emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
8906 				     src1, src2));
8907     }
8908   else
8909     {
8910       emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
8911       emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
8912 				  src1, src2));
8913     }
8914 
8915   rtx out = gen_lowpart (QImode, result);
8916   emit_insn (gen_cmpintqi (out));
8917   emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
8918 
8919   return true;
8920 }
8921 
8922 /* Expand the appropriate insns for doing strlen if not just doing
8923    repnz; scasb
8924 
8925    out = result, initialized with the start address
8926    align_rtx = alignment of the address.
8927    scratch = scratch register, initialized with the start address when
8928 	not aligned, otherwise undefined
8929 
8930    This is just the body.  It needs the initializations mentioned above and
8931    some address computation at the end.  These things are done in i386.md.  */
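
/* Roughly, the code emitted here checks up to three leading bytes one at a
   time until OUT is 4-byte aligned, then scans four bytes per iteration
   using the zero-byte test below, and finally adjusts OUT to point at the
   terminating zero byte.  */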
8932 
8933 static void
8934 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
8935 {
8936   int align;
8937   rtx tmp;
8938   rtx_code_label *align_2_label = NULL;
8939   rtx_code_label *align_3_label = NULL;
8940   rtx_code_label *align_4_label = gen_label_rtx ();
8941   rtx_code_label *end_0_label = gen_label_rtx ();
8942   rtx mem;
8943   rtx tmpreg = gen_reg_rtx (SImode);
8944   rtx scratch = gen_reg_rtx (SImode);
8945   rtx cmp;
8946 
8947   align = 0;
8948   if (CONST_INT_P (align_rtx))
8949     align = INTVAL (align_rtx);
8950 
8951   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
8952 
8953   /* Is there a known alignment and is it less than 4?  */
8954   if (align < 4)
8955     {
8956       rtx scratch1 = gen_reg_rtx (Pmode);
8957       emit_move_insn (scratch1, out);
8958       /* Is there a known alignment and is it not 2? */
8959       if (align != 2)
8960 	{
8961 	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
8962 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
8963 
8964 	  /* Leave just the 3 lower bits.  */
8965 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
8966 				    NULL_RTX, 0, OPTAB_WIDEN);
8967 
8968 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8969 				   Pmode, 1, align_4_label);
8970 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
8971 				   Pmode, 1, align_2_label);
8972 	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
8973 				   Pmode, 1, align_3_label);
8974 	}
8975       else
8976         {
8977 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
8978 	     check whether it is aligned to 4 bytes.  */
8979 
8980 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
8981 				    NULL_RTX, 0, OPTAB_WIDEN);
8982 
8983 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8984 				   Pmode, 1, align_4_label);
8985         }
8986 
8987       mem = change_address (src, QImode, out);
8988 
8989       /* Now compare the bytes.  */
8990 
8991       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
8992       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
8993 			       QImode, 1, end_0_label);
8994 
8995       /* Increment the address.  */
8996       emit_insn (gen_add2_insn (out, const1_rtx));
8997 
8998       /* Not needed with an alignment of 2 */
8999       if (align != 2)
9000 	{
9001 	  emit_label (align_2_label);
9002 
9003 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9004 				   end_0_label);
9005 
9006 	  emit_insn (gen_add2_insn (out, const1_rtx));
9007 
9008 	  emit_label (align_3_label);
9009 	}
9010 
9011       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9012 			       end_0_label);
9013 
9014       emit_insn (gen_add2_insn (out, const1_rtx));
9015     }
9016 
9017   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
9018      align this loop: it only makes the program larger and does not help
9019      speed it up.  */
9020   emit_label (align_4_label);
9021 
9022   mem = change_address (src, SImode, out);
9023   emit_move_insn (scratch, mem);
9024   emit_insn (gen_add2_insn (out, GEN_INT (4)));
9025 
9026   /* This formula yields a nonzero result iff one of the bytes is zero.
9027      This saves three branches inside the loop and many cycles.  */
9028 
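  /* Concretely the value computed below is

	 (x - 0x01010101) & ~x & 0x80808080

     e.g. x == 0x41004141 yields 0x00800000, flagging the zero byte, while
     x == 0x41414141 yields 0.  */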
9029   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9030   emit_insn (gen_one_cmplsi2 (scratch, scratch));
9031   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9032   emit_insn (gen_andsi3 (tmpreg, tmpreg,
9033 			 gen_int_mode (0x80808080, SImode)));
9034   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9035 			   align_4_label);
9036 
9037   if (TARGET_CMOVE)
9038     {
9039        rtx reg = gen_reg_rtx (SImode);
9040        rtx reg2 = gen_reg_rtx (Pmode);
9041        emit_move_insn (reg, tmpreg);
9042        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9043 
9044        /* If zero is not in the first two bytes, move two bytes forward.  */
9045        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9046        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9047        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9048        emit_insn (gen_rtx_SET (tmpreg,
9049 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
9050 						     reg,
9051 						     tmpreg)));
9052        /* Emit lea manually to avoid clobbering of flags.  */
9053        emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9054 
9055        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9056        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9057        emit_insn (gen_rtx_SET (out,
9058 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9059 						     reg2,
9060 						     out)));
9061     }
9062   else
9063     {
9064        rtx_code_label *end_2_label = gen_label_rtx ();
9065        /* Is zero in the first two bytes? */
9066 
9067        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9068        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9069        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9070        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9071                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9072                             pc_rtx);
9073        tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9074        JUMP_LABEL (tmp) = end_2_label;
9075 
9076        /* Not in the first two.  Move two bytes forward.  */
9077        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9078        emit_insn (gen_add2_insn (out, const2_rtx));
9079 
9080        emit_label (end_2_label);
9081 
9082     }
9083 
9084   /* Avoid a branch in fixing up the final byte.  */
9085   tmpreg = gen_lowpart (QImode, tmpreg);
9086   emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9087   tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9088   cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9089   emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9090 
9091   emit_label (end_0_label);
9092 }
9093 
9094 /* Expand strlen.  */
9095 
9096 bool
9097 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9098 {
9099   if (TARGET_UNROLL_STRLEN
9100       && TARGET_INLINE_ALL_STRINGOPS
9101       && eoschar == const0_rtx
9102       && optimize > 1)
9103     {
9104       /* The generic case of the strlen expander is long.  Avoid expanding
9105 	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
9106       rtx addr = force_reg (Pmode, XEXP (src, 0));
9107       /* Well, it seems that some optimizer does not combine a call like
9108 	 foo(strlen(bar), strlen(bar));
9109 	 when the move and the subtraction are done here.  It does calculate
9110 	 the length just once when these instructions are done inside
9111 	 output_strlen_unroll().  But I think that since &bar[strlen(bar)] is
9112 	 often used, and I use one fewer register for the lifetime of
9113 	 output_strlen_unroll(), this is better.  */
9114 
9115       emit_move_insn (out, addr);
9116 
9117       ix86_expand_strlensi_unroll_1 (out, src, align);
9118 
9119       /* strlensi_unroll_1 returns the address of the zero at the end of
9120 	 the string, like memchr(), so compute the length by subtracting
9121 	 the start address.  */
9122       emit_insn (gen_sub2_insn (out, addr));
9123       return true;
9124     }
9125   else
9126     return false;
9127 }
9128 
9129 /* For a given symbol (function), construct code to compute the address of its
9130    PLT entry in the large x86-64 PIC model.  */
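/* I.e. the PLT entry address is computed as GOT base + SYMBOL@PLTOFF: the
   UNSPEC_PLTOFF constant is loaded into a temporary register and the PIC
   register (pic_offset_table_rtx) is added to it.  */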
9131 
9132 static rtx
9133 construct_plt_address (rtx symbol)
9134 {
9135   rtx tmp, unspec;
9136 
9137   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9138   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9139   gcc_assert (Pmode == DImode);
9140 
9141   tmp = gen_reg_rtx (Pmode);
9142   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9143 
9144   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9145   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9146   return tmp;
9147 }
9148 
9149 /* Additional registers clobbered by SYSV calls (call-saved under the MS ABI but call-clobbered under the SysV ABI).  */
9150 
9151 static int const x86_64_ms_sysv_extra_clobbered_registers
9152 		 [NUM_X86_64_MS_CLOBBERED_REGS] =
9153 {
9154   SI_REG, DI_REG,
9155   XMM6_REG, XMM7_REG,
9156   XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9157   XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9158 };
9159 
9160 rtx_insn *
9161 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9162 		  rtx callarg2,
9163 		  rtx pop, bool sibcall)
9164 {
9165   rtx vec[3];
9166   rtx use = NULL, call;
9167   unsigned int vec_len = 0;
9168   tree fndecl;
9169 
9170   if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9171     {
9172       fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9173       if (fndecl
9174 	  && (lookup_attribute ("interrupt",
9175 				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9176 	error ("interrupt service routine cannot be called directly");
9177     }
9178   else
9179     fndecl = NULL_TREE;
9180 
9181   if (pop == const0_rtx)
9182     pop = NULL;
9183   gcc_assert (!TARGET_64BIT || !pop);
9184 
9185   rtx addr = XEXP (fnaddr, 0);
9186   if (TARGET_MACHO && !TARGET_64BIT)
9187     {
9188 #if TARGET_MACHO
9189       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9190 	fnaddr = machopic_indirect_call_target (fnaddr);
9191 #endif
9192     }
9193   else
9194     {
9195       /* Static functions and indirect calls don't need the pic register.  Also,
9196 	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9197 	 it an indirect call.  */
9198       if (flag_pic
9199 	  && GET_CODE (addr) == SYMBOL_REF
9200 	  && ix86_call_use_plt_p (addr))
9201 	{
9202 	  if (flag_plt
9203 	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
9204 		  || !lookup_attribute ("noplt",
9205 					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9206 	    {
9207 	      if (!TARGET_64BIT
9208 		  || (ix86_cmodel == CM_LARGE_PIC
9209 		      && DEFAULT_ABI != MS_ABI))
9210 		{
9211 		  use_reg (&use, gen_rtx_REG (Pmode,
9212 					      REAL_PIC_OFFSET_TABLE_REGNUM));
9213 		  if (ix86_use_pseudo_pic_reg ())
9214 		    emit_move_insn (gen_rtx_REG (Pmode,
9215 						 REAL_PIC_OFFSET_TABLE_REGNUM),
9216 				    pic_offset_table_rtx);
9217 		}
9218 	    }
9219 	  else if (!TARGET_PECOFF && !TARGET_MACHO)
9220 	    {
9221 	      if (TARGET_64BIT
9222 		  && ix86_cmodel == CM_LARGE_PIC
9223 		  && DEFAULT_ABI != MS_ABI)
9224 		{
9225 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9226 					   UNSPEC_GOT);
9227 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9228 		  fnaddr = force_reg (Pmode, fnaddr);
9229 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9230 		}
9231 	      else if (TARGET_64BIT)
9232 		{
9233 		  fnaddr = gen_rtx_UNSPEC (Pmode,
9234 					   gen_rtvec (1, addr),
9235 					   UNSPEC_GOTPCREL);
9236 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9237 		}
9238 	      else
9239 		{
9240 		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9241 					   UNSPEC_GOT);
9242 		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9243 		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9244 					 fnaddr);
9245 		}
9246 	      fnaddr = gen_const_mem (Pmode, fnaddr);
9247 	      /* Pmode may not be the same as word_mode for x32, which
9248 		 doesn't support indirect branch via 32-bit memory slot.
9249 		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9250 		 indirect branch via x32 GOT slot is OK.  */
9251 	      if (GET_MODE (fnaddr) != word_mode)
9252 		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9253 	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
9254 	    }
9255 	}
9256     }
9257 
9258   /* Skip setting up RAX register for -mskip-rax-setup when there are no
9259      parameters passed in vector registers.  */
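  /* For variadic calls the 64-bit SysV ABI expects AL to carry an upper
     bound on the number of vector registers used in the call; CALLARG2
     holds that value here.  */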
9260   if (TARGET_64BIT
9261       && (INTVAL (callarg2) > 0
9262 	  || (INTVAL (callarg2) == 0
9263 	      && (TARGET_SSE || !flag_skip_rax_setup))))
9264     {
9265       rtx al = gen_rtx_REG (QImode, AX_REG);
9266       emit_move_insn (al, callarg2);
9267       use_reg (&use, al);
9268     }
9269 
9270   if (ix86_cmodel == CM_LARGE_PIC
9271       && !TARGET_PECOFF
9272       && MEM_P (fnaddr)
9273       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9274       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9275     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9276   /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9277      branch via x32 GOT slot is OK.  */
9278   else if (!(TARGET_X32
9279 	     && MEM_P (fnaddr)
9280 	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9281 	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9282 	   && (sibcall
9283 	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9284 	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9285     {
9286       fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9287       fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9288     }
9289 
9290   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9291 
9292   if (retval)
9293     call = gen_rtx_SET (retval, call);
9294   vec[vec_len++] = call;
9295 
9296   if (pop)
9297     {
9298       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9299       pop = gen_rtx_SET (stack_pointer_rtx, pop);
9300       vec[vec_len++] = pop;
9301     }
9302 
9303   if (cfun->machine->no_caller_saved_registers
9304       && (!fndecl
9305 	  || (!TREE_THIS_VOLATILE (fndecl)
9306 	      && !lookup_attribute ("no_caller_saved_registers",
9307 				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9308     {
9309       static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9310       bool is_64bit_ms_abi = (TARGET_64BIT
9311 			      && ix86_function_abi (fndecl) == MS_ABI);
9312       char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9313 
9314       /* If there are no caller-saved registers, add all registers
9315 	 that are clobbered by the call which returns.  */
9316       for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9317 	if (!fixed_regs[i]
9318 	    && (ix86_call_used_regs[i] == 1
9319 		|| (ix86_call_used_regs[i] & c_mask))
9320 	    && !STACK_REGNO_P (i)
9321 	    && !MMX_REGNO_P (i))
9322 	  clobber_reg (&use,
9323 		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9324     }
9325   else if (TARGET_64BIT_MS_ABI
9326 	   && (!callarg2 || INTVAL (callarg2) != -2))
9327     {
9328       unsigned i;
9329 
9330       for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9331 	{
9332 	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9333 	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9334 
9335 	  clobber_reg (&use, gen_rtx_REG (mode, regno));
9336 	}
9337 
9338       /* Set here, but it may get cleared later.  */
9339       if (TARGET_CALL_MS2SYSV_XLOGUES)
9340 	{
9341 	  if (!TARGET_SSE)
9342 	    ;
9343 
9344 	  /* Don't break hot-patched functions.  */
9345 	  else if (ix86_function_ms_hook_prologue (current_function_decl))
9346 	    ;
9347 
9348 	  /* TODO: Cases not yet examined.  */
9349 	  else if (flag_split_stack)
9350 	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9351 
9352 	  else
9353 	    {
9354 	      gcc_assert (!reload_completed);
9355 	      cfun->machine->call_ms2sysv = true;
9356 	    }
9357 	}
9358     }
9359 
9360   if (TARGET_MACHO && TARGET_64BIT && !sibcall
9361       && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9362 	  || !fndecl || TREE_PUBLIC (fndecl)))
9363     {
9364       /* We allow public functions defined in a TU to bind locally for PIC
9365 	 code (the default) on 64bit Mach-O.
9366 	 If such functions are not inlined, we cannot tell at compile-time if
9367 	 they will be called via the lazy symbol resolver (this can depend on
9368 	 options given at link-time).  Therefore, we must assume that the lazy
9369 	 resolver could be used which clobbers R11 and R10.  */
9370       clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9371       clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9372     }
9373 
9374   if (vec_len > 1)
9375     call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9376   rtx_insn *call_insn = emit_call_insn (call);
9377   if (use)
9378     CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9379 
9380   return call_insn;
9381 }
9382 
9383 /* Split a simple return that pops POPC bytes from the stack into an indirect
9384    branch with a stack adjustment.  */
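/* The emitted sequence is roughly

     pop %ecx		(return address into %ecx)
     add $POPC, %esp	(release the callee-popped argument area)
     jmp *%ecx

   with CFA notes attached so the unwinder can still find the return
   address.  */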
9385 
9386 void
9387 ix86_split_simple_return_pop_internal (rtx popc)
9388 {
9389   struct machine_function *m = cfun->machine;
9390   rtx ecx = gen_rtx_REG (SImode, CX_REG);
9391   rtx_insn *insn;
9392 
9393   /* There is no "pascal" calling convention in any 64bit ABI.  */
9394   gcc_assert (!TARGET_64BIT);
9395 
9396   insn = emit_insn (gen_pop (ecx));
9397   m->fs.cfa_offset -= UNITS_PER_WORD;
9398   m->fs.sp_offset -= UNITS_PER_WORD;
9399 
9400   rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9401   x = gen_rtx_SET (stack_pointer_rtx, x);
9402   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9403   add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9404   RTX_FRAME_RELATED_P (insn) = 1;
9405 
9406   x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9407   x = gen_rtx_SET (stack_pointer_rtx, x);
9408   insn = emit_insn (x);
9409   add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9410   RTX_FRAME_RELATED_P (insn) = 1;
9411 
9412   /* Now return address is in ECX.  */
9413   emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9414 }
9415 
9416 /* Errors in the source file can cause expand_expr to return const0_rtx
9417    where we expect a vector.  To avoid crashing, use one of the vector
9418    clear instructions.  */
9419 
9420 static rtx
9421 safe_vector_operand (rtx x, machine_mode mode)
9422 {
9423   if (x == const0_rtx)
9424     x = CONST0_RTX (mode);
9425   return x;
9426 }
9427 
9428 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
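/* The two arguments are expanded, coerced to satisfy the insn's operand
   predicates (copied into registers where needed; a SImode value feeding a
   TImode operand is first loaded through sse2_loadd), and a single ICODE
   pattern is emitted into TARGET.  */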
9429 
9430 static rtx
9431 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9432 {
9433   rtx pat;
9434   tree arg0 = CALL_EXPR_ARG (exp, 0);
9435   tree arg1 = CALL_EXPR_ARG (exp, 1);
9436   rtx op0 = expand_normal (arg0);
9437   rtx op1 = expand_normal (arg1);
9438   machine_mode tmode = insn_data[icode].operand[0].mode;
9439   machine_mode mode0 = insn_data[icode].operand[1].mode;
9440   machine_mode mode1 = insn_data[icode].operand[2].mode;
9441 
9442   if (VECTOR_MODE_P (mode0))
9443     op0 = safe_vector_operand (op0, mode0);
9444   if (VECTOR_MODE_P (mode1))
9445     op1 = safe_vector_operand (op1, mode1);
9446 
9447   if (optimize || !target
9448       || GET_MODE (target) != tmode
9449       || !insn_data[icode].operand[0].predicate (target, tmode))
9450     target = gen_reg_rtx (tmode);
9451 
9452   if (GET_MODE (op1) == SImode && mode1 == TImode)
9453     {
9454       rtx x = gen_reg_rtx (V4SImode);
9455       emit_insn (gen_sse2_loadd (x, op1));
9456       op1 = gen_lowpart (TImode, x);
9457     }
9458 
9459   if (!insn_data[icode].operand[1].predicate (op0, mode0))
9460     op0 = copy_to_mode_reg (mode0, op0);
9461   if (!insn_data[icode].operand[2].predicate (op1, mode1))
9462     op1 = copy_to_mode_reg (mode1, op1);
9463 
9464   pat = GEN_FCN (icode) (target, op0, op1);
9465   if (! pat)
9466     return 0;
9467 
9468   emit_insn (pat);
9469 
9470   return target;
9471 }
9472 
9473 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
9474 
9475 static rtx
9476 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9477 			       enum ix86_builtin_func_type m_type,
9478 			       enum rtx_code sub_code)
9479 {
9480   rtx pat;
9481   unsigned int i, nargs;
9482   bool comparison_p = false;
9483   bool tf_p = false;
9484   bool last_arg_constant = false;
9485   int num_memory = 0;
9486   rtx xops[4];
9487 
9488   machine_mode tmode = insn_data[icode].operand[0].mode;
9489 
9490   switch (m_type)
9491     {
9492     case MULTI_ARG_4_DF2_DI_I:
9493     case MULTI_ARG_4_DF2_DI_I1:
9494     case MULTI_ARG_4_SF2_SI_I:
9495     case MULTI_ARG_4_SF2_SI_I1:
9496       nargs = 4;
9497       last_arg_constant = true;
9498       break;
9499 
9500     case MULTI_ARG_3_SF:
9501     case MULTI_ARG_3_DF:
9502     case MULTI_ARG_3_SF2:
9503     case MULTI_ARG_3_DF2:
9504     case MULTI_ARG_3_DI:
9505     case MULTI_ARG_3_SI:
9506     case MULTI_ARG_3_SI_DI:
9507     case MULTI_ARG_3_HI:
9508     case MULTI_ARG_3_HI_SI:
9509     case MULTI_ARG_3_QI:
9510     case MULTI_ARG_3_DI2:
9511     case MULTI_ARG_3_SI2:
9512     case MULTI_ARG_3_HI2:
9513     case MULTI_ARG_3_QI2:
9514       nargs = 3;
9515       break;
9516 
9517     case MULTI_ARG_2_SF:
9518     case MULTI_ARG_2_DF:
9519     case MULTI_ARG_2_DI:
9520     case MULTI_ARG_2_SI:
9521     case MULTI_ARG_2_HI:
9522     case MULTI_ARG_2_QI:
9523       nargs = 2;
9524       break;
9525 
9526     case MULTI_ARG_2_DI_IMM:
9527     case MULTI_ARG_2_SI_IMM:
9528     case MULTI_ARG_2_HI_IMM:
9529     case MULTI_ARG_2_QI_IMM:
9530       nargs = 2;
9531       last_arg_constant = true;
9532       break;
9533 
9534     case MULTI_ARG_1_SF:
9535     case MULTI_ARG_1_DF:
9536     case MULTI_ARG_1_SF2:
9537     case MULTI_ARG_1_DF2:
9538     case MULTI_ARG_1_DI:
9539     case MULTI_ARG_1_SI:
9540     case MULTI_ARG_1_HI:
9541     case MULTI_ARG_1_QI:
9542     case MULTI_ARG_1_SI_DI:
9543     case MULTI_ARG_1_HI_DI:
9544     case MULTI_ARG_1_HI_SI:
9545     case MULTI_ARG_1_QI_DI:
9546     case MULTI_ARG_1_QI_SI:
9547     case MULTI_ARG_1_QI_HI:
9548       nargs = 1;
9549       break;
9550 
9551     case MULTI_ARG_2_DI_CMP:
9552     case MULTI_ARG_2_SI_CMP:
9553     case MULTI_ARG_2_HI_CMP:
9554     case MULTI_ARG_2_QI_CMP:
9555       nargs = 2;
9556       comparison_p = true;
9557       break;
9558 
9559     case MULTI_ARG_2_SF_TF:
9560     case MULTI_ARG_2_DF_TF:
9561     case MULTI_ARG_2_DI_TF:
9562     case MULTI_ARG_2_SI_TF:
9563     case MULTI_ARG_2_HI_TF:
9564     case MULTI_ARG_2_QI_TF:
9565       nargs = 2;
9566       tf_p = true;
9567       break;
9568 
9569     default:
9570       gcc_unreachable ();
9571     }
9572 
9573   if (optimize || !target
9574       || GET_MODE (target) != tmode
9575       || !insn_data[icode].operand[0].predicate (target, tmode))
9576     target = gen_reg_rtx (tmode);
9577   else if (memory_operand (target, tmode))
9578     num_memory++;
9579 
9580   gcc_assert (nargs <= ARRAY_SIZE (xops));
9581 
9582   for (i = 0; i < nargs; i++)
9583     {
9584       tree arg = CALL_EXPR_ARG (exp, i);
9585       rtx op = expand_normal (arg);
9586       int adjust = (comparison_p) ? 1 : 0;
9587       machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9588 
9589       if (last_arg_constant && i == nargs - 1)
9590 	{
9591 	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9592 	    {
9593 	      enum insn_code new_icode = icode;
9594 	      switch (icode)
9595 		{
9596 		case CODE_FOR_xop_vpermil2v2df3:
9597 		case CODE_FOR_xop_vpermil2v4sf3:
9598 		case CODE_FOR_xop_vpermil2v4df3:
9599 		case CODE_FOR_xop_vpermil2v8sf3:
9600 		  error ("the last argument must be a 2-bit immediate");
9601 		  return gen_reg_rtx (tmode);
9602 		case CODE_FOR_xop_rotlv2di3:
9603 		  new_icode = CODE_FOR_rotlv2di3;
9604 		  goto xop_rotl;
9605 		case CODE_FOR_xop_rotlv4si3:
9606 		  new_icode = CODE_FOR_rotlv4si3;
9607 		  goto xop_rotl;
9608 		case CODE_FOR_xop_rotlv8hi3:
9609 		  new_icode = CODE_FOR_rotlv8hi3;
9610 		  goto xop_rotl;
9611 		case CODE_FOR_xop_rotlv16qi3:
9612 		  new_icode = CODE_FOR_rotlv16qi3;
9613 		xop_rotl:
9614 		  if (CONST_INT_P (op))
9615 		    {
9616 		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9617 		      op = GEN_INT (INTVAL (op) & mask);
9618 		      gcc_checking_assert
9619 			(insn_data[icode].operand[i + 1].predicate (op, mode));
9620 		    }
9621 		  else
9622 		    {
9623 		      gcc_checking_assert
9624 			(nargs == 2
9625 			 && insn_data[new_icode].operand[0].mode == tmode
9626 			 && insn_data[new_icode].operand[1].mode == tmode
9627 			 && insn_data[new_icode].operand[2].mode == mode
9628 			 && insn_data[new_icode].operand[0].predicate
9629 			    == insn_data[icode].operand[0].predicate
9630 			 && insn_data[new_icode].operand[1].predicate
9631 			    == insn_data[icode].operand[1].predicate);
9632 		      icode = new_icode;
9633 		      goto non_constant;
9634 		    }
9635 		  break;
9636 		default:
9637 		  gcc_unreachable ();
9638 		}
9639 	    }
9640 	}
9641       else
9642 	{
9643 	non_constant:
9644 	  if (VECTOR_MODE_P (mode))
9645 	    op = safe_vector_operand (op, mode);
9646 
9647 	  /* If we aren't optimizing, only allow one memory operand to be
9648 	     generated.  */
9649 	  if (memory_operand (op, mode))
9650 	    num_memory++;
9651 
9652 	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9653 
9654 	  if (optimize
9655 	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9656 	      || num_memory > 1)
9657 	    op = force_reg (mode, op);
9658 	}
9659 
9660       xops[i] = op;
9661     }
9662 
9663   switch (nargs)
9664     {
9665     case 1:
9666       pat = GEN_FCN (icode) (target, xops[0]);
9667       break;
9668 
9669     case 2:
9670       if (tf_p)
9671 	pat = GEN_FCN (icode) (target, xops[0], xops[1],
9672 			       GEN_INT ((int)sub_code));
9673       else if (! comparison_p)
9674 	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9675       else
9676 	{
9677 	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
9678 				       xops[0], xops[1]);
9679 
9680 	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9681 	}
9682       break;
9683 
9684     case 3:
9685       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9686       break;
9687 
9688     case 4:
9689       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9690       break;
9691 
9692     default:
9693       gcc_unreachable ();
9694     }
9695 
9696   if (! pat)
9697     return 0;
9698 
9699   emit_insn (pat);
9700   return target;
9701 }
9702 
9703 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9704    insns with vec_merge.  */
9705 
9706 static rtx
9707 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9708 				    rtx target)
9709 {
9710   rtx pat;
9711   tree arg0 = CALL_EXPR_ARG (exp, 0);
9712   rtx op1, op0 = expand_normal (arg0);
9713   machine_mode tmode = insn_data[icode].operand[0].mode;
9714   machine_mode mode0 = insn_data[icode].operand[1].mode;
9715 
9716   if (optimize || !target
9717       || GET_MODE (target) != tmode
9718       || !insn_data[icode].operand[0].predicate (target, tmode))
9719     target = gen_reg_rtx (tmode);
9720 
9721   if (VECTOR_MODE_P (mode0))
9722     op0 = safe_vector_operand (op0, mode0);
9723 
9724   if ((optimize && !register_operand (op0, mode0))
9725       || !insn_data[icode].operand[1].predicate (op0, mode0))
9726     op0 = copy_to_mode_reg (mode0, op0);
9727 
9728   op1 = op0;
9729   if (!insn_data[icode].operand[2].predicate (op1, mode0))
9730     op1 = copy_to_mode_reg (mode0, op1);
9731 
9732   pat = GEN_FCN (icode) (target, op0, op1);
9733   if (! pat)
9734     return 0;
9735   emit_insn (pat);
9736   return target;
9737 }
9738 
9739 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
9740 
9741 static rtx
9742 ix86_expand_sse_compare (const struct builtin_description *d,
9743 			 tree exp, rtx target, bool swap)
9744 {
9745   rtx pat;
9746   tree arg0 = CALL_EXPR_ARG (exp, 0);
9747   tree arg1 = CALL_EXPR_ARG (exp, 1);
9748   rtx op0 = expand_normal (arg0);
9749   rtx op1 = expand_normal (arg1);
9750   rtx op2;
9751   machine_mode tmode = insn_data[d->icode].operand[0].mode;
9752   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9753   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9754   enum rtx_code comparison = d->comparison;
9755 
9756   if (VECTOR_MODE_P (mode0))
9757     op0 = safe_vector_operand (op0, mode0);
9758   if (VECTOR_MODE_P (mode1))
9759     op1 = safe_vector_operand (op1, mode1);
9760 
9761   /* Swap operands if we have a comparison that isn't available in
9762      hardware.  */
9763   if (swap)
9764     std::swap (op0, op1);
9765 
9766   if (optimize || !target
9767       || GET_MODE (target) != tmode
9768       || !insn_data[d->icode].operand[0].predicate (target, tmode))
9769     target = gen_reg_rtx (tmode);
9770 
9771   if ((optimize && !register_operand (op0, mode0))
9772       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9773     op0 = copy_to_mode_reg (mode0, op0);
9774   if ((optimize && !register_operand (op1, mode1))
9775       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9776     op1 = copy_to_mode_reg (mode1, op1);
9777 
9778   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9779   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9780   if (! pat)
9781     return 0;
9782   emit_insn (pat);
9783   return target;
9784 }
9785 
9786 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
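/* The comparison insn itself only sets the flags; the result is materialized
   by writing the QImode SUBREG of a zeroed SImode pseudo via STRICT_LOW_PART,
   so the returned SImode register holds 0 or 1.  */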
9787 
9788 static rtx
9789 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
9790 		      rtx target)
9791 {
9792   rtx pat;
9793   tree arg0 = CALL_EXPR_ARG (exp, 0);
9794   tree arg1 = CALL_EXPR_ARG (exp, 1);
9795   rtx op0 = expand_normal (arg0);
9796   rtx op1 = expand_normal (arg1);
9797   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9798   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9799   enum rtx_code comparison = d->comparison;
9800 
9801   if (VECTOR_MODE_P (mode0))
9802     op0 = safe_vector_operand (op0, mode0);
9803   if (VECTOR_MODE_P (mode1))
9804     op1 = safe_vector_operand (op1, mode1);
9805 
9806   target = gen_reg_rtx (SImode);
9807   emit_move_insn (target, const0_rtx);
9808   target = gen_rtx_SUBREG (QImode, target, 0);
9809 
9810   if ((optimize && !register_operand (op0, mode0))
9811       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9812     op0 = copy_to_mode_reg (mode0, op0);
9813   if ((optimize && !register_operand (op1, mode1))
9814       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9815     op1 = copy_to_mode_reg (mode1, op1);
9816 
9817   pat = GEN_FCN (d->icode) (op0, op1);
9818   if (! pat)
9819     return 0;
9820   emit_insn (pat);
9821   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9822 			  gen_rtx_fmt_ee (comparison, QImode,
9823 					  SET_DEST (pat),
9824 					  const0_rtx)));
9825 
9826   return SUBREG_REG (target);
9827 }
9828 
9829 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
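/* For these descriptors D->comparison is reused to carry the rounding
   immediate, which becomes the last operand of the emitted pattern.  */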
9830 
9831 static rtx
9832 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
9833 		       rtx target)
9834 {
9835   rtx pat;
9836   tree arg0 = CALL_EXPR_ARG (exp, 0);
9837   rtx op1, op0 = expand_normal (arg0);
9838   machine_mode tmode = insn_data[d->icode].operand[0].mode;
9839   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9840 
9841   if (optimize || target == 0
9842       || GET_MODE (target) != tmode
9843       || !insn_data[d->icode].operand[0].predicate (target, tmode))
9844     target = gen_reg_rtx (tmode);
9845 
9846   if (VECTOR_MODE_P (mode0))
9847     op0 = safe_vector_operand (op0, mode0);
9848 
9849   if ((optimize && !register_operand (op0, mode0))
9850       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9851     op0 = copy_to_mode_reg (mode0, op0);
9852 
9853   op1 = GEN_INT (d->comparison);
9854 
9855   pat = GEN_FCN (d->icode) (target, op0, op1);
9856   if (! pat)
9857     return 0;
9858   emit_insn (pat);
9859   return target;
9860 }
9861 
9862 static rtx
9863 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
9864 				     tree exp, rtx target)
9865 {
9866   rtx pat;
9867   tree arg0 = CALL_EXPR_ARG (exp, 0);
9868   tree arg1 = CALL_EXPR_ARG (exp, 1);
9869   rtx op0 = expand_normal (arg0);
9870   rtx op1 = expand_normal (arg1);
9871   rtx op2;
9872   machine_mode tmode = insn_data[d->icode].operand[0].mode;
9873   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9874   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9875 
9876   if (optimize || target == 0
9877       || GET_MODE (target) != tmode
9878       || !insn_data[d->icode].operand[0].predicate (target, tmode))
9879     target = gen_reg_rtx (tmode);
9880 
9881   op0 = safe_vector_operand (op0, mode0);
9882   op1 = safe_vector_operand (op1, mode1);
9883 
9884   if ((optimize && !register_operand (op0, mode0))
9885       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9886     op0 = copy_to_mode_reg (mode0, op0);
9887   if ((optimize && !register_operand (op1, mode1))
9888       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9889     op1 = copy_to_mode_reg (mode1, op1);
9890 
9891   op2 = GEN_INT (d->comparison);
9892 
9893   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9894   if (! pat)
9895     return 0;
9896   emit_insn (pat);
9897   return target;
9898 }
9899 
9900 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
9901 
9902 static rtx
9903 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
9904 		       rtx target)
9905 {
9906   rtx pat;
9907   tree arg0 = CALL_EXPR_ARG (exp, 0);
9908   tree arg1 = CALL_EXPR_ARG (exp, 1);
9909   rtx op0 = expand_normal (arg0);
9910   rtx op1 = expand_normal (arg1);
9911   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9912   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9913   enum rtx_code comparison = d->comparison;
9914 
9915   if (VECTOR_MODE_P (mode0))
9916     op0 = safe_vector_operand (op0, mode0);
9917   if (VECTOR_MODE_P (mode1))
9918     op1 = safe_vector_operand (op1, mode1);
9919 
9920   target = gen_reg_rtx (SImode);
9921   emit_move_insn (target, const0_rtx);
9922   target = gen_rtx_SUBREG (QImode, target, 0);
9923 
9924   if ((optimize && !register_operand (op0, mode0))
9925       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9926     op0 = copy_to_mode_reg (mode0, op0);
9927   if ((optimize && !register_operand (op1, mode1))
9928       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9929     op1 = copy_to_mode_reg (mode1, op1);
9930 
9931   pat = GEN_FCN (d->icode) (op0, op1);
9932   if (! pat)
9933     return 0;
9934   emit_insn (pat);
9935   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9936 			  gen_rtx_fmt_ee (comparison, QImode,
9937 					  SET_DEST (pat),
9938 					  const0_rtx)));
9939 
9940   return SUBREG_REG (target);
9941 }
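/* For illustration, a minimal sketch (not part of this file) of a
   ptest-class builtin reaching ix86_expand_sse_ptest: the CC flag produced
   by the ptest pattern is converted to a 0/1 integer exactly as in the comi
   case.  Assumes <smmintrin.h> and SSE4.1.

     #include <smmintrin.h>

     int
     all_bits_clear (__m128i v)
     {
       // Expands via __builtin_ia32_ptestz128; d->comparison selects the
       // flag condition tested (EQ, i.e. ZF, for the testz form).
       return _mm_testz_si128 (v, v);
     }
*/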
9942 
9943 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
9944 
9945 static rtx
9946 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
9947 			  tree exp, rtx target)
9948 {
9949   rtx pat;
9950   tree arg0 = CALL_EXPR_ARG (exp, 0);
9951   tree arg1 = CALL_EXPR_ARG (exp, 1);
9952   tree arg2 = CALL_EXPR_ARG (exp, 2);
9953   tree arg3 = CALL_EXPR_ARG (exp, 3);
9954   tree arg4 = CALL_EXPR_ARG (exp, 4);
9955   rtx scratch0, scratch1;
9956   rtx op0 = expand_normal (arg0);
9957   rtx op1 = expand_normal (arg1);
9958   rtx op2 = expand_normal (arg2);
9959   rtx op3 = expand_normal (arg3);
9960   rtx op4 = expand_normal (arg4);
9961   machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
9962 
9963   tmode0 = insn_data[d->icode].operand[0].mode;
9964   tmode1 = insn_data[d->icode].operand[1].mode;
9965   modev2 = insn_data[d->icode].operand[2].mode;
9966   modei3 = insn_data[d->icode].operand[3].mode;
9967   modev4 = insn_data[d->icode].operand[4].mode;
9968   modei5 = insn_data[d->icode].operand[5].mode;
9969   modeimm = insn_data[d->icode].operand[6].mode;
9970 
9971   if (VECTOR_MODE_P (modev2))
9972     op0 = safe_vector_operand (op0, modev2);
9973   if (VECTOR_MODE_P (modev4))
9974     op2 = safe_vector_operand (op2, modev4);
9975 
9976   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
9977     op0 = copy_to_mode_reg (modev2, op0);
9978   if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
9979     op1 = copy_to_mode_reg (modei3, op1);
9980   if ((optimize && !register_operand (op2, modev4))
9981       || !insn_data[d->icode].operand[4].predicate (op2, modev4))
9982     op2 = copy_to_mode_reg (modev4, op2);
9983   if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
9984     op3 = copy_to_mode_reg (modei5, op3);
9985 
9986   if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
9987     {
9988       error ("the fifth argument must be an 8-bit immediate");
9989       return const0_rtx;
9990     }
9991 
9992   if (d->code == IX86_BUILTIN_PCMPESTRI128)
9993     {
9994       if (optimize || !target
9995 	  || GET_MODE (target) != tmode0
9996 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
9997 	target = gen_reg_rtx (tmode0);
9998 
9999       scratch1 = gen_reg_rtx (tmode1);
10000 
10001       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10002     }
10003   else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10004     {
10005       if (optimize || !target
10006 	  || GET_MODE (target) != tmode1
10007 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10008 	target = gen_reg_rtx (tmode1);
10009 
10010       scratch0 = gen_reg_rtx (tmode0);
10011 
10012       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10013     }
10014   else
10015     {
10016       gcc_assert (d->flag);
10017 
10018       scratch0 = gen_reg_rtx (tmode0);
10019       scratch1 = gen_reg_rtx (tmode1);
10020 
10021       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10022     }
10023 
10024   if (! pat)
10025     return 0;
10026 
10027   emit_insn (pat);
10028 
10029   if (d->flag)
10030     {
10031       target = gen_reg_rtx (SImode);
10032       emit_move_insn (target, const0_rtx);
10033       target = gen_rtx_SUBREG (QImode, target, 0);
10034 
10035       emit_insn
10036 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10037 		      gen_rtx_fmt_ee (EQ, QImode,
10038 				      gen_rtx_REG ((machine_mode) d->flag,
10039 						   FLAGS_REG),
10040 				      const0_rtx)));
10041       return SUBREG_REG (target);
10042     }
10043   else
10044     return target;
10045 }
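/* For illustration, a hypothetical caller (not part of this file) of an
   explicit-length string-compare builtin handled by
   ix86_expand_sse_pcmpestr: the five arguments map to the two vectors,
   their lengths and the 8-bit control immediate validated above.  Assumes
   <nmmintrin.h> and SSE4.2.

     #include <nmmintrin.h>

     int
     find_any_byte (__m128i needle, int nlen, __m128i hay, int hlen)
     {
       // The index (PCMPESTRI) form; the mask and flag forms correspond to
       // the other branches above.
       return _mm_cmpestri (needle, nlen, hay, hlen,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
     }
*/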
10046 
10047 
10048 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
10049 
10050 static rtx
10051 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10052 			  tree exp, rtx target)
10053 {
10054   rtx pat;
10055   tree arg0 = CALL_EXPR_ARG (exp, 0);
10056   tree arg1 = CALL_EXPR_ARG (exp, 1);
10057   tree arg2 = CALL_EXPR_ARG (exp, 2);
10058   rtx scratch0, scratch1;
10059   rtx op0 = expand_normal (arg0);
10060   rtx op1 = expand_normal (arg1);
10061   rtx op2 = expand_normal (arg2);
10062   machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10063 
10064   tmode0 = insn_data[d->icode].operand[0].mode;
10065   tmode1 = insn_data[d->icode].operand[1].mode;
10066   modev2 = insn_data[d->icode].operand[2].mode;
10067   modev3 = insn_data[d->icode].operand[3].mode;
10068   modeimm = insn_data[d->icode].operand[4].mode;
10069 
10070   if (VECTOR_MODE_P (modev2))
10071     op0 = safe_vector_operand (op0, modev2);
10072   if (VECTOR_MODE_P (modev3))
10073     op1 = safe_vector_operand (op1, modev3);
10074 
10075   if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10076     op0 = copy_to_mode_reg (modev2, op0);
10077   if ((optimize && !register_operand (op1, modev3))
10078       || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10079     op1 = copy_to_mode_reg (modev3, op1);
10080 
10081   if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10082     {
10083       error ("the third argument must be an 8-bit immediate");
10084       return const0_rtx;
10085     }
10086 
10087   if (d->code == IX86_BUILTIN_PCMPISTRI128)
10088     {
10089       if (optimize || !target
10090 	  || GET_MODE (target) != tmode0
10091 	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10092 	target = gen_reg_rtx (tmode0);
10093 
10094       scratch1 = gen_reg_rtx (tmode1);
10095 
10096       pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10097     }
10098   else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10099     {
10100       if (optimize || !target
10101 	  || GET_MODE (target) != tmode1
10102 	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10103 	target = gen_reg_rtx (tmode1);
10104 
10105       scratch0 = gen_reg_rtx (tmode0);
10106 
10107       pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10108     }
10109   else
10110     {
10111       gcc_assert (d->flag);
10112 
10113       scratch0 = gen_reg_rtx (tmode0);
10114       scratch1 = gen_reg_rtx (tmode1);
10115 
10116       pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10117     }
10118 
10119   if (! pat)
10120     return 0;
10121 
10122   emit_insn (pat);
10123 
10124   if (d->flag)
10125     {
10126       target = gen_reg_rtx (SImode);
10127       emit_move_insn (target, const0_rtx);
10128       target = gen_rtx_SUBREG (QImode, target, 0);
10129 
10130       emit_insn
10131 	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10132 		      gen_rtx_fmt_ee (EQ, QImode,
10133 				      gen_rtx_REG ((machine_mode) d->flag,
10134 						   FLAGS_REG),
10135 				      const0_rtx)));
10136       return SUBREG_REG (target);
10137     }
10138   else
10139     return target;
10140 }
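/* Similarly, a sketch (not part of this file) for the implicit-length
   variant handled by ix86_expand_sse_pcmpistr: only three arguments are
   passed, the strings being NUL-terminated within the vectors.  Assumes
   <nmmintrin.h> and SSE4.2.

     #include <nmmintrin.h>

     int
     index_of_equal_byte (__m128i a, __m128i b)
     {
       return _mm_cmpistri (a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
     }
*/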
10141 
10142 /* Fixup modeless constants to fit required mode.  */
10143 
10144 static rtx
10145 fixup_modeless_constant (rtx x, machine_mode mode)
10146 {
10147   if (GET_MODE (x) == VOIDmode)
10148     x = convert_to_mode (mode, x, 1);
10149   return x;
10150 }
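/* For illustration (not part of this file): CONST_INTs are modeless in RTL,
   i.e. GET_MODE (GEN_INT (42)) == VOIDmode, so before such a constant is
   used as a mask or vector-element operand its value is canonicalized for
   the required mode by the helper above.  A sketch:

     rtx op = GEN_INT (-1);
     op = fixup_modeless_constant (op, QImode);
     // op now holds the value reduced for QImode (zero-extended, since the
     // conversion above is done unsigned).
*/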
10151 
10152 /* Subroutine of ix86_expand_builtin to take care of insns with
10153    variable number of operands.  */
10154 
10155 static rtx
10156 ix86_expand_args_builtin (const struct builtin_description *d,
10157 			  tree exp, rtx target)
10158 {
10159   rtx pat, real_target;
10160   unsigned int i, nargs;
10161   unsigned int nargs_constant = 0;
10162   unsigned int mask_pos = 0;
10163   int num_memory = 0;
10164   rtx xops[6];
10165   bool second_arg_count = false;
10166   enum insn_code icode = d->icode;
10167   const struct insn_data_d *insn_p = &insn_data[icode];
10168   machine_mode tmode = insn_p->operand[0].mode;
10169   machine_mode rmode = VOIDmode;
10170   bool swap = false;
10171   enum rtx_code comparison = d->comparison;
10172 
10173   switch ((enum ix86_builtin_func_type) d->flag)
10174     {
10175     case V2DF_FTYPE_V2DF_ROUND:
10176     case V4DF_FTYPE_V4DF_ROUND:
10177     case V8DF_FTYPE_V8DF_ROUND:
10178     case V4SF_FTYPE_V4SF_ROUND:
10179     case V8SF_FTYPE_V8SF_ROUND:
10180     case V16SF_FTYPE_V16SF_ROUND:
10181     case V8HF_FTYPE_V8HF_ROUND:
10182     case V16HF_FTYPE_V16HF_ROUND:
10183     case V32HF_FTYPE_V32HF_ROUND:
10184     case V4SI_FTYPE_V4SF_ROUND:
10185     case V8SI_FTYPE_V8SF_ROUND:
10186     case V16SI_FTYPE_V16SF_ROUND:
10187       return ix86_expand_sse_round (d, exp, target);
10188     case V4SI_FTYPE_V2DF_V2DF_ROUND:
10189     case V8SI_FTYPE_V4DF_V4DF_ROUND:
10190     case V16SI_FTYPE_V8DF_V8DF_ROUND:
10191       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10192     case INT_FTYPE_V8SF_V8SF_PTEST:
10193     case INT_FTYPE_V4DI_V4DI_PTEST:
10194     case INT_FTYPE_V4DF_V4DF_PTEST:
10195     case INT_FTYPE_V4SF_V4SF_PTEST:
10196     case INT_FTYPE_V2DI_V2DI_PTEST:
10197     case INT_FTYPE_V2DF_V2DF_PTEST:
10198       return ix86_expand_sse_ptest (d, exp, target);
10199     case FLOAT128_FTYPE_FLOAT128:
10200     case FLOAT_FTYPE_FLOAT:
10201     case INT_FTYPE_INT:
10202     case UINT_FTYPE_UINT:
10203     case UINT16_FTYPE_UINT16:
10204     case UINT64_FTYPE_INT:
10205     case UINT64_FTYPE_UINT64:
10206     case INT64_FTYPE_INT64:
10207     case INT64_FTYPE_V4SF:
10208     case INT64_FTYPE_V2DF:
10209     case INT_FTYPE_V16QI:
10210     case INT_FTYPE_V8QI:
10211     case INT_FTYPE_V8SF:
10212     case INT_FTYPE_V4DF:
10213     case INT_FTYPE_V4SF:
10214     case INT_FTYPE_V2DF:
10215     case INT_FTYPE_V32QI:
10216     case V16QI_FTYPE_V16QI:
10217     case V8SI_FTYPE_V8SF:
10218     case V8SI_FTYPE_V4SI:
10219     case V8HI_FTYPE_V8HI:
10220     case V8HI_FTYPE_V16QI:
10221     case V8QI_FTYPE_V8QI:
10222     case V8SF_FTYPE_V8SF:
10223     case V8SF_FTYPE_V8SI:
10224     case V8SF_FTYPE_V4SF:
10225     case V8SF_FTYPE_V8HI:
10226     case V4SI_FTYPE_V4SI:
10227     case V4SI_FTYPE_V16QI:
10228     case V4SI_FTYPE_V4SF:
10229     case V4SI_FTYPE_V8SI:
10230     case V4SI_FTYPE_V8HI:
10231     case V4SI_FTYPE_V4DF:
10232     case V4SI_FTYPE_V2DF:
10233     case V4HI_FTYPE_V4HI:
10234     case V4DF_FTYPE_V4DF:
10235     case V4DF_FTYPE_V4SI:
10236     case V4DF_FTYPE_V4SF:
10237     case V4DF_FTYPE_V2DF:
10238     case V4SF_FTYPE_V4SF:
10239     case V4SF_FTYPE_V4SI:
10240     case V4SF_FTYPE_V8SF:
10241     case V4SF_FTYPE_V4DF:
10242     case V4SF_FTYPE_V8HI:
10243     case V4SF_FTYPE_V2DF:
10244     case V2DI_FTYPE_V2DI:
10245     case V2DI_FTYPE_V16QI:
10246     case V2DI_FTYPE_V8HI:
10247     case V2DI_FTYPE_V4SI:
10248     case V2DF_FTYPE_V2DF:
10249     case V2DF_FTYPE_V4SI:
10250     case V2DF_FTYPE_V4DF:
10251     case V2DF_FTYPE_V4SF:
10252     case V2DF_FTYPE_V2SI:
10253     case V2SI_FTYPE_V2SI:
10254     case V2SI_FTYPE_V4SF:
10255     case V2SI_FTYPE_V2SF:
10256     case V2SI_FTYPE_V2DF:
10257     case V2SF_FTYPE_V2SF:
10258     case V2SF_FTYPE_V2SI:
10259     case V32QI_FTYPE_V32QI:
10260     case V32QI_FTYPE_V16QI:
10261     case V16HI_FTYPE_V16HI:
10262     case V16HI_FTYPE_V8HI:
10263     case V8SI_FTYPE_V8SI:
10264     case V16HI_FTYPE_V16QI:
10265     case V8SI_FTYPE_V16QI:
10266     case V4DI_FTYPE_V16QI:
10267     case V8SI_FTYPE_V8HI:
10268     case V4DI_FTYPE_V8HI:
10269     case V4DI_FTYPE_V4SI:
10270     case V4DI_FTYPE_V2DI:
10271     case UQI_FTYPE_UQI:
10272     case UHI_FTYPE_UHI:
10273     case USI_FTYPE_USI:
10274     case USI_FTYPE_UQI:
10275     case USI_FTYPE_UHI:
10276     case UDI_FTYPE_UDI:
10277     case UHI_FTYPE_V16QI:
10278     case USI_FTYPE_V32QI:
10279     case UDI_FTYPE_V64QI:
10280     case V16QI_FTYPE_UHI:
10281     case V32QI_FTYPE_USI:
10282     case V64QI_FTYPE_UDI:
10283     case V8HI_FTYPE_UQI:
10284     case V16HI_FTYPE_UHI:
10285     case V32HI_FTYPE_USI:
10286     case V4SI_FTYPE_UQI:
10287     case V8SI_FTYPE_UQI:
10288     case V4SI_FTYPE_UHI:
10289     case V8SI_FTYPE_UHI:
10290     case UQI_FTYPE_V8HI:
10291     case UHI_FTYPE_V16HI:
10292     case USI_FTYPE_V32HI:
10293     case UQI_FTYPE_V4SI:
10294     case UQI_FTYPE_V8SI:
10295     case UHI_FTYPE_V16SI:
10296     case UQI_FTYPE_V2DI:
10297     case UQI_FTYPE_V4DI:
10298     case UQI_FTYPE_V8DI:
10299     case V16SI_FTYPE_UHI:
10300     case V2DI_FTYPE_UQI:
10301     case V4DI_FTYPE_UQI:
10302     case V16SI_FTYPE_INT:
10303     case V16SF_FTYPE_V8SF:
10304     case V16SI_FTYPE_V8SI:
10305     case V16SF_FTYPE_V4SF:
10306     case V16SI_FTYPE_V4SI:
10307     case V16SI_FTYPE_V16SF:
10308     case V16SI_FTYPE_V16SI:
10309     case V64QI_FTYPE_V64QI:
10310     case V32HI_FTYPE_V32HI:
10311     case V16SF_FTYPE_V16SF:
10312     case V8DI_FTYPE_UQI:
10313     case V8DI_FTYPE_V8DI:
10314     case V8DF_FTYPE_V4DF:
10315     case V8DF_FTYPE_V2DF:
10316     case V8DF_FTYPE_V8DF:
10317     case V4DI_FTYPE_V4DI:
10318     case V16HI_FTYPE_V16SF:
10319     case V8HI_FTYPE_V8SF:
10320     case V8HI_FTYPE_V4SF:
10321       nargs = 1;
10322       break;
10323     case V4SF_FTYPE_V4SF_VEC_MERGE:
10324     case V2DF_FTYPE_V2DF_VEC_MERGE:
10325       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10326     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10327     case V16QI_FTYPE_V16QI_V16QI:
10328     case V16QI_FTYPE_V8HI_V8HI:
10329     case V16HF_FTYPE_V16HF_V16HF:
10330     case V16SF_FTYPE_V16SF_V16SF:
10331     case V8QI_FTYPE_V8QI_V8QI:
10332     case V8QI_FTYPE_V4HI_V4HI:
10333     case V8HI_FTYPE_V8HI_V8HI:
10334     case V8HI_FTYPE_V16QI_V16QI:
10335     case V8HI_FTYPE_V4SI_V4SI:
10336     case V8HF_FTYPE_V8HF_V8HF:
10337     case V8SF_FTYPE_V8SF_V8SF:
10338     case V8SF_FTYPE_V8SF_V8SI:
10339     case V8DF_FTYPE_V8DF_V8DF:
10340     case V4SI_FTYPE_V4SI_V4SI:
10341     case V4SI_FTYPE_V8HI_V8HI:
10342     case V4SI_FTYPE_V2DF_V2DF:
10343     case V4HI_FTYPE_V4HI_V4HI:
10344     case V4HI_FTYPE_V8QI_V8QI:
10345     case V4HI_FTYPE_V2SI_V2SI:
10346     case V4DF_FTYPE_V4DF_V4DF:
10347     case V4DF_FTYPE_V4DF_V4DI:
10348     case V4SF_FTYPE_V4SF_V4SF:
10349     case V4SF_FTYPE_V4SF_V4SI:
10350     case V4SF_FTYPE_V4SF_V2SI:
10351     case V4SF_FTYPE_V4SF_V2DF:
10352     case V4SF_FTYPE_V4SF_UINT:
10353     case V4SF_FTYPE_V4SF_DI:
10354     case V4SF_FTYPE_V4SF_SI:
10355     case V2DI_FTYPE_V2DI_V2DI:
10356     case V2DI_FTYPE_V16QI_V16QI:
10357     case V2DI_FTYPE_V4SI_V4SI:
10358     case V2DI_FTYPE_V2DI_V16QI:
10359     case V2SI_FTYPE_V2SI_V2SI:
10360     case V2SI_FTYPE_V4HI_V4HI:
10361     case V2SI_FTYPE_V2SF_V2SF:
10362     case V2DF_FTYPE_V2DF_V2DF:
10363     case V2DF_FTYPE_V2DF_V4SF:
10364     case V2DF_FTYPE_V2DF_V2DI:
10365     case V2DF_FTYPE_V2DF_DI:
10366     case V2DF_FTYPE_V2DF_SI:
10367     case V2DF_FTYPE_V2DF_UINT:
10368     case V2SF_FTYPE_V2SF_V2SF:
10369     case V1DI_FTYPE_V1DI_V1DI:
10370     case V1DI_FTYPE_V8QI_V8QI:
10371     case V1DI_FTYPE_V2SI_V2SI:
10372     case V32QI_FTYPE_V16HI_V16HI:
10373     case V16HI_FTYPE_V8SI_V8SI:
10374     case V64QI_FTYPE_V64QI_V64QI:
10375     case V32QI_FTYPE_V32QI_V32QI:
10376     case V16HI_FTYPE_V32QI_V32QI:
10377     case V16HI_FTYPE_V16HI_V16HI:
10378     case V8SI_FTYPE_V4DF_V4DF:
10379     case V8SI_FTYPE_V8SI_V8SI:
10380     case V8SI_FTYPE_V16HI_V16HI:
10381     case V4DI_FTYPE_V4DI_V4DI:
10382     case V4DI_FTYPE_V8SI_V8SI:
10383     case V8DI_FTYPE_V64QI_V64QI:
10384       if (comparison == UNKNOWN)
10385 	return ix86_expand_binop_builtin (icode, exp, target);
10386       nargs = 2;
10387       break;
10388     case V4SF_FTYPE_V4SF_V4SF_SWAP:
10389     case V2DF_FTYPE_V2DF_V2DF_SWAP:
10390       gcc_assert (comparison != UNKNOWN);
10391       nargs = 2;
10392       swap = true;
10393       break;
10394     case V16HI_FTYPE_V16HI_V8HI_COUNT:
10395     case V16HI_FTYPE_V16HI_SI_COUNT:
10396     case V8SI_FTYPE_V8SI_V4SI_COUNT:
10397     case V8SI_FTYPE_V8SI_SI_COUNT:
10398     case V4DI_FTYPE_V4DI_V2DI_COUNT:
10399     case V4DI_FTYPE_V4DI_INT_COUNT:
10400     case V8HI_FTYPE_V8HI_V8HI_COUNT:
10401     case V8HI_FTYPE_V8HI_SI_COUNT:
10402     case V4SI_FTYPE_V4SI_V4SI_COUNT:
10403     case V4SI_FTYPE_V4SI_SI_COUNT:
10404     case V4HI_FTYPE_V4HI_V4HI_COUNT:
10405     case V4HI_FTYPE_V4HI_SI_COUNT:
10406     case V2DI_FTYPE_V2DI_V2DI_COUNT:
10407     case V2DI_FTYPE_V2DI_SI_COUNT:
10408     case V2SI_FTYPE_V2SI_V2SI_COUNT:
10409     case V2SI_FTYPE_V2SI_SI_COUNT:
10410     case V1DI_FTYPE_V1DI_V1DI_COUNT:
10411     case V1DI_FTYPE_V1DI_SI_COUNT:
10412       nargs = 2;
10413       second_arg_count = true;
10414       break;
10415     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10416     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10417     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10418     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10419     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10420     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10421     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10422     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10423     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10424     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10425     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10426     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10427     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10428     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10429     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10430     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10431     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10432     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10433       nargs = 4;
10434       second_arg_count = true;
10435       break;
10436     case UINT64_FTYPE_UINT64_UINT64:
10437     case UINT_FTYPE_UINT_UINT:
10438     case UINT_FTYPE_UINT_USHORT:
10439     case UINT_FTYPE_UINT_UCHAR:
10440     case UINT16_FTYPE_UINT16_INT:
10441     case UINT8_FTYPE_UINT8_INT:
10442     case UQI_FTYPE_UQI_UQI:
10443     case UHI_FTYPE_UHI_UHI:
10444     case USI_FTYPE_USI_USI:
10445     case UDI_FTYPE_UDI_UDI:
10446     case V16SI_FTYPE_V8DF_V8DF:
10447     case V32HI_FTYPE_V16SF_V16SF:
10448     case V16HI_FTYPE_V8SF_V8SF:
10449     case V8HI_FTYPE_V4SF_V4SF:
10450     case V16HI_FTYPE_V16SF_UHI:
10451     case V8HI_FTYPE_V8SF_UQI:
10452     case V8HI_FTYPE_V4SF_UQI:
10453       nargs = 2;
10454       break;
10455     case V2DI_FTYPE_V2DI_INT_CONVERT:
10456       nargs = 2;
10457       rmode = V1TImode;
10458       nargs_constant = 1;
10459       break;
10460     case V4DI_FTYPE_V4DI_INT_CONVERT:
10461       nargs = 2;
10462       rmode = V2TImode;
10463       nargs_constant = 1;
10464       break;
10465     case V8DI_FTYPE_V8DI_INT_CONVERT:
10466       nargs = 2;
10467       rmode = V4TImode;
10468       nargs_constant = 1;
10469       break;
10470     case V8HI_FTYPE_V8HI_INT:
10471     case V8HI_FTYPE_V8SF_INT:
10472     case V16HI_FTYPE_V16SF_INT:
10473     case V8HI_FTYPE_V4SF_INT:
10474     case V8SF_FTYPE_V8SF_INT:
10475     case V4SF_FTYPE_V16SF_INT:
10476     case V16SF_FTYPE_V16SF_INT:
10477     case V4SI_FTYPE_V4SI_INT:
10478     case V4SI_FTYPE_V8SI_INT:
10479     case V4HI_FTYPE_V4HI_INT:
10480     case V4DF_FTYPE_V4DF_INT:
10481     case V4DF_FTYPE_V8DF_INT:
10482     case V4SF_FTYPE_V4SF_INT:
10483     case V4SF_FTYPE_V8SF_INT:
10484     case V2DI_FTYPE_V2DI_INT:
10485     case V2DF_FTYPE_V2DF_INT:
10486     case V2DF_FTYPE_V4DF_INT:
10487     case V16HI_FTYPE_V16HI_INT:
10488     case V8SI_FTYPE_V8SI_INT:
10489     case V16SI_FTYPE_V16SI_INT:
10490     case V4SI_FTYPE_V16SI_INT:
10491     case V4DI_FTYPE_V4DI_INT:
10492     case V2DI_FTYPE_V4DI_INT:
10493     case V4DI_FTYPE_V8DI_INT:
10494     case UQI_FTYPE_UQI_UQI_CONST:
10495     case UHI_FTYPE_UHI_UQI:
10496     case USI_FTYPE_USI_UQI:
10497     case UDI_FTYPE_UDI_UQI:
10498       nargs = 2;
10499       nargs_constant = 1;
10500       break;
10501     case V16QI_FTYPE_V16QI_V16QI_V16QI:
10502     case V8SF_FTYPE_V8SF_V8SF_V8SF:
10503     case V4DF_FTYPE_V4DF_V4DF_V4DF:
10504     case V4SF_FTYPE_V4SF_V4SF_V4SF:
10505     case V2DF_FTYPE_V2DF_V2DF_V2DF:
10506     case V32QI_FTYPE_V32QI_V32QI_V32QI:
10507     case UHI_FTYPE_V16SI_V16SI_UHI:
10508     case UQI_FTYPE_V8DI_V8DI_UQI:
10509     case V16HI_FTYPE_V16SI_V16HI_UHI:
10510     case V16QI_FTYPE_V16SI_V16QI_UHI:
10511     case V16QI_FTYPE_V8DI_V16QI_UQI:
10512     case V32HF_FTYPE_V32HF_V32HF_USI:
10513     case V16SF_FTYPE_V16SF_V16SF_UHI:
10514     case V16SF_FTYPE_V4SF_V16SF_UHI:
10515     case V16SI_FTYPE_SI_V16SI_UHI:
10516     case V16SI_FTYPE_V16HI_V16SI_UHI:
10517     case V16SI_FTYPE_V16QI_V16SI_UHI:
10518     case V8SF_FTYPE_V4SF_V8SF_UQI:
10519     case V4DF_FTYPE_V2DF_V4DF_UQI:
10520     case V8SI_FTYPE_V4SI_V8SI_UQI:
10521     case V8SI_FTYPE_SI_V8SI_UQI:
10522     case V4SI_FTYPE_V4SI_V4SI_UQI:
10523     case V4SI_FTYPE_SI_V4SI_UQI:
10524     case V4DI_FTYPE_V2DI_V4DI_UQI:
10525     case V4DI_FTYPE_DI_V4DI_UQI:
10526     case V2DI_FTYPE_V2DI_V2DI_UQI:
10527     case V2DI_FTYPE_DI_V2DI_UQI:
10528     case V64QI_FTYPE_V64QI_V64QI_UDI:
10529     case V64QI_FTYPE_V16QI_V64QI_UDI:
10530     case V64QI_FTYPE_QI_V64QI_UDI:
10531     case V32QI_FTYPE_V32QI_V32QI_USI:
10532     case V32QI_FTYPE_V16QI_V32QI_USI:
10533     case V32QI_FTYPE_QI_V32QI_USI:
10534     case V16QI_FTYPE_V16QI_V16QI_UHI:
10535     case V16QI_FTYPE_QI_V16QI_UHI:
10536     case V32HI_FTYPE_V8HI_V32HI_USI:
10537     case V32HI_FTYPE_HI_V32HI_USI:
10538     case V16HI_FTYPE_V8HI_V16HI_UHI:
10539     case V16HI_FTYPE_HI_V16HI_UHI:
10540     case V8HI_FTYPE_V8HI_V8HI_UQI:
10541     case V8HI_FTYPE_HI_V8HI_UQI:
10542     case V16HF_FTYPE_V16HF_V16HF_UHI:
10543     case V8SF_FTYPE_V8HI_V8SF_UQI:
10544     case V4SF_FTYPE_V8HI_V4SF_UQI:
10545     case V8SI_FTYPE_V8HF_V8SI_UQI:
10546     case V8SF_FTYPE_V8HF_V8SF_UQI:
10547     case V8SI_FTYPE_V8SF_V8SI_UQI:
10548     case V4SI_FTYPE_V4SF_V4SI_UQI:
10549     case V4SI_FTYPE_V8HF_V4SI_UQI:
10550     case V4SF_FTYPE_V8HF_V4SF_UQI:
10551     case V4DI_FTYPE_V8HF_V4DI_UQI:
10552     case V4DI_FTYPE_V4SF_V4DI_UQI:
10553     case V2DI_FTYPE_V8HF_V2DI_UQI:
10554     case V2DI_FTYPE_V4SF_V2DI_UQI:
10555     case V8HF_FTYPE_V8HF_V8HF_UQI:
10556     case V8HF_FTYPE_V8HF_V8HF_V8HF:
10557     case V8HF_FTYPE_V8HI_V8HF_UQI:
10558     case V8HF_FTYPE_V8SI_V8HF_UQI:
10559     case V8HF_FTYPE_V8SF_V8HF_UQI:
10560     case V8HF_FTYPE_V4SI_V8HF_UQI:
10561     case V8HF_FTYPE_V4SF_V8HF_UQI:
10562     case V8HF_FTYPE_V4DI_V8HF_UQI:
10563     case V8HF_FTYPE_V4DF_V8HF_UQI:
10564     case V8HF_FTYPE_V2DI_V8HF_UQI:
10565     case V8HF_FTYPE_V2DF_V8HF_UQI:
10566     case V4SF_FTYPE_V4DI_V4SF_UQI:
10567     case V4SF_FTYPE_V2DI_V4SF_UQI:
10568     case V4DF_FTYPE_V4DI_V4DF_UQI:
10569     case V4DF_FTYPE_V8HF_V4DF_UQI:
10570     case V2DF_FTYPE_V8HF_V2DF_UQI:
10571     case V2DF_FTYPE_V2DI_V2DF_UQI:
10572     case V16QI_FTYPE_V8HI_V16QI_UQI:
10573     case V16QI_FTYPE_V16HI_V16QI_UHI:
10574     case V16QI_FTYPE_V4SI_V16QI_UQI:
10575     case V16QI_FTYPE_V8SI_V16QI_UQI:
10576     case V8HI_FTYPE_V8HF_V8HI_UQI:
10577     case V8HI_FTYPE_V4SI_V8HI_UQI:
10578     case V8HI_FTYPE_V8SI_V8HI_UQI:
10579     case V16QI_FTYPE_V2DI_V16QI_UQI:
10580     case V16QI_FTYPE_V4DI_V16QI_UQI:
10581     case V8HI_FTYPE_V2DI_V8HI_UQI:
10582     case V8HI_FTYPE_V4DI_V8HI_UQI:
10583     case V4SI_FTYPE_V2DI_V4SI_UQI:
10584     case V4SI_FTYPE_V4DI_V4SI_UQI:
10585     case V32QI_FTYPE_V32HI_V32QI_USI:
10586     case UHI_FTYPE_V16QI_V16QI_UHI:
10587     case USI_FTYPE_V32QI_V32QI_USI:
10588     case UDI_FTYPE_V64QI_V64QI_UDI:
10589     case UQI_FTYPE_V8HI_V8HI_UQI:
10590     case UHI_FTYPE_V16HI_V16HI_UHI:
10591     case USI_FTYPE_V32HI_V32HI_USI:
10592     case UQI_FTYPE_V4SI_V4SI_UQI:
10593     case UQI_FTYPE_V8SI_V8SI_UQI:
10594     case UQI_FTYPE_V2DI_V2DI_UQI:
10595     case UQI_FTYPE_V4DI_V4DI_UQI:
10596     case V4SF_FTYPE_V2DF_V4SF_UQI:
10597     case V4SF_FTYPE_V4DF_V4SF_UQI:
10598     case V16SI_FTYPE_V16SI_V16SI_UHI:
10599     case V16SI_FTYPE_V4SI_V16SI_UHI:
10600     case V2DI_FTYPE_V4SI_V2DI_UQI:
10601     case V2DI_FTYPE_V8HI_V2DI_UQI:
10602     case V2DI_FTYPE_V16QI_V2DI_UQI:
10603     case V4DI_FTYPE_V4DI_V4DI_UQI:
10604     case V4DI_FTYPE_V4SI_V4DI_UQI:
10605     case V4DI_FTYPE_V8HI_V4DI_UQI:
10606     case V4DI_FTYPE_V16QI_V4DI_UQI:
10607     case V4DI_FTYPE_V4DF_V4DI_UQI:
10608     case V2DI_FTYPE_V2DF_V2DI_UQI:
10609     case V4SI_FTYPE_V4DF_V4SI_UQI:
10610     case V4SI_FTYPE_V2DF_V4SI_UQI:
10611     case V4SI_FTYPE_V8HI_V4SI_UQI:
10612     case V4SI_FTYPE_V16QI_V4SI_UQI:
10613     case V4DI_FTYPE_V4DI_V4DI_V4DI:
10614     case V8DF_FTYPE_V2DF_V8DF_UQI:
10615     case V8DF_FTYPE_V4DF_V8DF_UQI:
10616     case V8DF_FTYPE_V8DF_V8DF_UQI:
10617     case V8SF_FTYPE_V8SF_V8SF_UQI:
10618     case V8SF_FTYPE_V8SI_V8SF_UQI:
10619     case V4DF_FTYPE_V4DF_V4DF_UQI:
10620     case V4SF_FTYPE_V4SF_V4SF_UQI:
10621     case V2DF_FTYPE_V2DF_V2DF_UQI:
10622     case V2DF_FTYPE_V4SF_V2DF_UQI:
10623     case V2DF_FTYPE_V4SI_V2DF_UQI:
10624     case V4SF_FTYPE_V4SI_V4SF_UQI:
10625     case V4DF_FTYPE_V4SF_V4DF_UQI:
10626     case V4DF_FTYPE_V4SI_V4DF_UQI:
10627     case V8SI_FTYPE_V8SI_V8SI_UQI:
10628     case V8SI_FTYPE_V8HI_V8SI_UQI:
10629     case V8SI_FTYPE_V16QI_V8SI_UQI:
10630     case V8DF_FTYPE_V8SI_V8DF_UQI:
10631     case V8DI_FTYPE_DI_V8DI_UQI:
10632     case V16SF_FTYPE_V8SF_V16SF_UHI:
10633     case V16SI_FTYPE_V8SI_V16SI_UHI:
10634     case V16HF_FTYPE_V16HI_V16HF_UHI:
10635     case V16HF_FTYPE_V16HF_V16HF_V16HF:
10636     case V16HI_FTYPE_V16HF_V16HI_UHI:
10637     case V16HI_FTYPE_V16HI_V16HI_UHI:
10638     case V8HI_FTYPE_V16QI_V8HI_UQI:
10639     case V16HI_FTYPE_V16QI_V16HI_UHI:
10640     case V32HI_FTYPE_V32HI_V32HI_USI:
10641     case V32HI_FTYPE_V32QI_V32HI_USI:
10642     case V8DI_FTYPE_V16QI_V8DI_UQI:
10643     case V8DI_FTYPE_V2DI_V8DI_UQI:
10644     case V8DI_FTYPE_V4DI_V8DI_UQI:
10645     case V8DI_FTYPE_V8DI_V8DI_UQI:
10646     case V8DI_FTYPE_V8HI_V8DI_UQI:
10647     case V8DI_FTYPE_V8SI_V8DI_UQI:
10648     case V8HI_FTYPE_V8DI_V8HI_UQI:
10649     case V8SI_FTYPE_V8DI_V8SI_UQI:
10650     case V4SI_FTYPE_V4SI_V4SI_V4SI:
10651     case V16SI_FTYPE_V16SI_V16SI_V16SI:
10652     case V8DI_FTYPE_V8DI_V8DI_V8DI:
10653     case V32HI_FTYPE_V32HI_V32HI_V32HI:
10654     case V2DI_FTYPE_V2DI_V2DI_V2DI:
10655     case V16HI_FTYPE_V16HI_V16HI_V16HI:
10656     case V8SI_FTYPE_V8SI_V8SI_V8SI:
10657     case V8HI_FTYPE_V8HI_V8HI_V8HI:
10658     case V32HI_FTYPE_V16SF_V16SF_USI:
10659     case V16HI_FTYPE_V8SF_V8SF_UHI:
10660     case V8HI_FTYPE_V4SF_V4SF_UQI:
10661     case V16HI_FTYPE_V16SF_V16HI_UHI:
10662     case V8HI_FTYPE_V8SF_V8HI_UQI:
10663     case V8HI_FTYPE_V4SF_V8HI_UQI:
10664     case V16SF_FTYPE_V16SF_V32HI_V32HI:
10665     case V8SF_FTYPE_V8SF_V16HI_V16HI:
10666     case V4SF_FTYPE_V4SF_V8HI_V8HI:
10667       nargs = 3;
10668       break;
10669     case V32QI_FTYPE_V32QI_V32QI_INT:
10670     case V16HI_FTYPE_V16HI_V16HI_INT:
10671     case V16QI_FTYPE_V16QI_V16QI_INT:
10672     case V4DI_FTYPE_V4DI_V4DI_INT:
10673     case V8HI_FTYPE_V8HI_V8HI_INT:
10674     case V8SI_FTYPE_V8SI_V8SI_INT:
10675     case V8SI_FTYPE_V8SI_V4SI_INT:
10676     case V8SF_FTYPE_V8SF_V8SF_INT:
10677     case V8SF_FTYPE_V8SF_V4SF_INT:
10678     case V4SI_FTYPE_V4SI_V4SI_INT:
10679     case V4DF_FTYPE_V4DF_V4DF_INT:
10680     case V16SF_FTYPE_V16SF_V16SF_INT:
10681     case V16SF_FTYPE_V16SF_V4SF_INT:
10682     case V16SI_FTYPE_V16SI_V4SI_INT:
10683     case V4DF_FTYPE_V4DF_V2DF_INT:
10684     case V4SF_FTYPE_V4SF_V4SF_INT:
10685     case V2DI_FTYPE_V2DI_V2DI_INT:
10686     case V4DI_FTYPE_V4DI_V2DI_INT:
10687     case V2DF_FTYPE_V2DF_V2DF_INT:
10688     case UQI_FTYPE_V8DI_V8UDI_INT:
10689     case UQI_FTYPE_V8DF_V8DF_INT:
10690     case UQI_FTYPE_V2DF_V2DF_INT:
10691     case UQI_FTYPE_V4SF_V4SF_INT:
10692     case UHI_FTYPE_V16SI_V16SI_INT:
10693     case UHI_FTYPE_V16SF_V16SF_INT:
10694     case V64QI_FTYPE_V64QI_V64QI_INT:
10695     case V32HI_FTYPE_V32HI_V32HI_INT:
10696     case V16SI_FTYPE_V16SI_V16SI_INT:
10697     case V8DI_FTYPE_V8DI_V8DI_INT:
10698       nargs = 3;
10699       nargs_constant = 1;
10700       break;
10701     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10702       nargs = 3;
10703       rmode = V4DImode;
10704       nargs_constant = 1;
10705       break;
10706     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10707       nargs = 3;
10708       rmode = V2DImode;
10709       nargs_constant = 1;
10710       break;
10711     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10712       nargs = 3;
10713       rmode = DImode;
10714       nargs_constant = 1;
10715       break;
10716     case V2DI_FTYPE_V2DI_UINT_UINT:
10717       nargs = 3;
10718       nargs_constant = 2;
10719       break;
10720     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10721       nargs = 3;
10722       rmode = V8DImode;
10723       nargs_constant = 1;
10724       break;
10725     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10726       nargs = 5;
10727       rmode = V8DImode;
10728       mask_pos = 2;
10729       nargs_constant = 1;
10730       break;
10731     case QI_FTYPE_V8DF_INT_UQI:
10732     case QI_FTYPE_V4DF_INT_UQI:
10733     case QI_FTYPE_V2DF_INT_UQI:
10734     case HI_FTYPE_V16SF_INT_UHI:
10735     case QI_FTYPE_V8SF_INT_UQI:
10736     case QI_FTYPE_V4SF_INT_UQI:
10737     case QI_FTYPE_V8HF_INT_UQI:
10738     case HI_FTYPE_V16HF_INT_UHI:
10739     case SI_FTYPE_V32HF_INT_USI:
10740     case V4SI_FTYPE_V4SI_V4SI_UHI:
10741     case V8SI_FTYPE_V8SI_V8SI_UHI:
10742       nargs = 3;
10743       mask_pos = 1;
10744       nargs_constant = 1;
10745       break;
10746     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
10747       nargs = 5;
10748       rmode = V4DImode;
10749       mask_pos = 2;
10750       nargs_constant = 1;
10751       break;
10752     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
10753       nargs = 5;
10754       rmode = V2DImode;
10755       mask_pos = 2;
10756       nargs_constant = 1;
10757       break;
10758     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
10759     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
10760     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
10761     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
10762     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
10763     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
10764     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
10765     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
10766     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
10767     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
10768     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
10769     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
10770     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
10771     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
10772     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
10773     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
10774     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
10775     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
10776     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
10777     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
10778     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
10779     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
10780     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
10781     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
10782     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
10783     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
10784     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
10785     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
10786     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
10787     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
10788     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
10789     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
10790     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
10791     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
10792     case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
10793     case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
10794     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
10795     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
10796     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
10797     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
10798     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
10799     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
10800     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
10801     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
10802     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
10803     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
10804     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
10805     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
10806     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
10807     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
10808     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
10809     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
10810     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
10811     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
10812     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
10813     case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
10814     case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
10815     case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
10816       nargs = 4;
10817       break;
10818     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
10819     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
10820     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
10821     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
10822     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
10823       nargs = 4;
10824       nargs_constant = 1;
10825       break;
10826     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
10827     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
10828     case QI_FTYPE_V4DF_V4DF_INT_UQI:
10829     case QI_FTYPE_V8SF_V8SF_INT_UQI:
10830     case UHI_FTYPE_V16HF_V16HF_INT_UHI:
10831     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
10832     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
10833     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
10834     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
10835     case UQI_FTYPE_V8HF_V8HF_INT_UQI:
10836     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
10837     case USI_FTYPE_V32QI_V32QI_INT_USI:
10838     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
10839     case USI_FTYPE_V32HI_V32HI_INT_USI:
10840     case USI_FTYPE_V32HF_V32HF_INT_USI:
10841     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
10842     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
10843       nargs = 4;
10844       mask_pos = 1;
10845       nargs_constant = 1;
10846       break;
10847     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
10848       nargs = 4;
10849       nargs_constant = 2;
10850       break;
10851     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
10852     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
10853     case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
10854     case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
10855     case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
10856       nargs = 4;
10857       break;
10858     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
10859     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
10860       mask_pos = 1;
10861       nargs = 4;
10862       nargs_constant = 1;
10863       break;
10864     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
10865     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
10866     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
10867     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
10868     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
10869     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
10870     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
10871     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
10872     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
10873     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
10874     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
10875     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
10876     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
10877     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
10878     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
10879     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
10880     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
10881     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
10882     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
10883     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
10884     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
10885     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
10886     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
10887     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
10888     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
10889     case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
10890     case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
10891     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
10892     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
10893     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
10894     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
10895     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
10896       nargs = 4;
10897       mask_pos = 2;
10898       nargs_constant = 1;
10899       break;
10900     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
10901     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
10902     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
10903     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
10904     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
10905     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
10906     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
10907     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
10908     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
10909     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
10910     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
10911     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
10912     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
10913     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
10914     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
10915     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
10916     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
10917     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
10918     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
10919     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
10920     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
10921     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
10922     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
10923     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
10924     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
10925     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
10926     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
10927       nargs = 5;
10928       mask_pos = 2;
10929       nargs_constant = 1;
10930       break;
10931     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
10932     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
10933     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
10934     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
10935     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
10936     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
10937     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
10938     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
10939     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
10940     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
10941       nargs = 5;
10942       mask_pos = 1;
10943       nargs_constant = 1;
10944       break;
10945     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
10946     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
10947     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
10948     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
10949     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
10950     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
10951     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
10952     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
10953     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
10954     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
10955     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
10956     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
10957       nargs = 5;
10958       mask_pos = 1;
10959       nargs_constant = 2;
10960       break;
10961 
10962     default:
10963       gcc_unreachable ();
10964     }
10965 
10966   gcc_assert (nargs <= ARRAY_SIZE (xops));
10967 
10968   if (comparison != UNKNOWN)
10969     {
10970       gcc_assert (nargs == 2);
10971       return ix86_expand_sse_compare (d, exp, target, swap);
10972     }
10973 
10974   if (rmode == VOIDmode || rmode == tmode)
10975     {
10976       if (optimize
10977 	  || target == 0
10978 	  || GET_MODE (target) != tmode
10979 	  || !insn_p->operand[0].predicate (target, tmode))
10980 	target = gen_reg_rtx (tmode);
10981       else if (memory_operand (target, tmode))
10982 	num_memory++;
10983       real_target = target;
10984     }
10985   else
10986     {
10987       real_target = gen_reg_rtx (tmode);
10988       target = lowpart_subreg (rmode, real_target, tmode);
10989     }
10990 
10991   for (i = 0; i < nargs; i++)
10992     {
10993       tree arg = CALL_EXPR_ARG (exp, i);
10994       rtx op = expand_normal (arg);
10995       machine_mode mode = insn_p->operand[i + 1].mode;
10996       bool match = insn_p->operand[i + 1].predicate (op, mode);
10997 
10998       if (second_arg_count && i == 1)
10999 	{
11000 	  /* SIMD shift insns take either an 8-bit immediate or a
11001 	     register as the count, but the builtin functions take an
11002 	     int.  If the count doesn't match the predicate, put it in
11003 	     a register.  The instructions use a 64-bit count; if op is
11004 	     only 32-bit, zero-extend it, since negative shift counts
11005 	     are undefined behavior and zero-extension is more
11006 	     efficient.  */
11007 	  if (!match)
11008 	    {
11009 	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
11010 		op = convert_modes (mode, GET_MODE (op), op, 1);
11011 	      else
11012 		op = lowpart_subreg (mode, op, GET_MODE (op));
11013 	      if (!insn_p->operand[i + 1].predicate (op, mode))
11014 		op = copy_to_reg (op);
11015 	    }
11016 	}
11017       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11018 	       (!mask_pos && (nargs - i) <= nargs_constant))
11019 	{
11020 	  if (!match)
11021 	    switch (icode)
11022 	      {
11023 	      case CODE_FOR_avx_vinsertf128v4di:
11024 	      case CODE_FOR_avx_vextractf128v4di:
11025 		error ("the last argument must be a 1-bit immediate");
11026 		return const0_rtx;
11027 
11028 	      case CODE_FOR_avx512f_cmpv8di3_mask:
11029 	      case CODE_FOR_avx512f_cmpv16si3_mask:
11030 	      case CODE_FOR_avx512f_ucmpv8di3_mask:
11031 	      case CODE_FOR_avx512f_ucmpv16si3_mask:
11032 	      case CODE_FOR_avx512vl_cmpv4di3_mask:
11033 	      case CODE_FOR_avx512vl_cmpv8si3_mask:
11034 	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
11035 	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
11036 	      case CODE_FOR_avx512vl_cmpv2di3_mask:
11037 	      case CODE_FOR_avx512vl_cmpv4si3_mask:
11038 	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
11039 	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
11040 		error ("the last argument must be a 3-bit immediate");
11041 		return const0_rtx;
11042 
11043 	      case CODE_FOR_sse4_1_roundsd:
11044 	      case CODE_FOR_sse4_1_roundss:
11045 
11046 	      case CODE_FOR_sse4_1_roundpd:
11047 	      case CODE_FOR_sse4_1_roundps:
11048 	      case CODE_FOR_avx_roundpd256:
11049 	      case CODE_FOR_avx_roundps256:
11050 
11051 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11052 	      case CODE_FOR_sse4_1_roundps_sfix:
11053 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11054 	      case CODE_FOR_avx_roundps_sfix256:
11055 
11056 	      case CODE_FOR_sse4_1_blendps:
11057 	      case CODE_FOR_avx_blendpd256:
11058 	      case CODE_FOR_avx_vpermilv4df:
11059 	      case CODE_FOR_avx_vpermilv4df_mask:
11060 	      case CODE_FOR_avx512f_getmantv8df_mask:
11061 	      case CODE_FOR_avx512f_getmantv16sf_mask:
11062 	      case CODE_FOR_avx512vl_getmantv16hf_mask:
11063 	      case CODE_FOR_avx512vl_getmantv8sf_mask:
11064 	      case CODE_FOR_avx512vl_getmantv4df_mask:
11065 	      case CODE_FOR_avx512fp16_getmantv8hf_mask:
11066 	      case CODE_FOR_avx512vl_getmantv4sf_mask:
11067 	      case CODE_FOR_avx512vl_getmantv2df_mask:
11068 	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
11069 	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11070 	      case CODE_FOR_avx512dq_rangepv4df_mask:
11071 	      case CODE_FOR_avx512dq_rangepv8sf_mask:
11072 	      case CODE_FOR_avx512dq_rangepv2df_mask:
11073 	      case CODE_FOR_avx512dq_rangepv4sf_mask:
11074 	      case CODE_FOR_avx_shufpd256_mask:
11075 		error ("the last argument must be a 4-bit immediate");
11076 		return const0_rtx;
11077 
11078 	      case CODE_FOR_sha1rnds4:
11079 	      case CODE_FOR_sse4_1_blendpd:
11080 	      case CODE_FOR_avx_vpermilv2df:
11081 	      case CODE_FOR_avx_vpermilv2df_mask:
11082 	      case CODE_FOR_xop_vpermil2v2df3:
11083 	      case CODE_FOR_xop_vpermil2v4sf3:
11084 	      case CODE_FOR_xop_vpermil2v4df3:
11085 	      case CODE_FOR_xop_vpermil2v8sf3:
11086 	      case CODE_FOR_avx512f_vinsertf32x4_mask:
11087 	      case CODE_FOR_avx512f_vinserti32x4_mask:
11088 	      case CODE_FOR_avx512f_vextractf32x4_mask:
11089 	      case CODE_FOR_avx512f_vextracti32x4_mask:
11090 	      case CODE_FOR_sse2_shufpd:
11091 	      case CODE_FOR_sse2_shufpd_mask:
11092 	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
11093 	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
11094 	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
11095 	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
11096 		error ("the last argument must be a 2-bit immediate");
11097 		return const0_rtx;
11098 
11099 	      case CODE_FOR_avx_vextractf128v4df:
11100 	      case CODE_FOR_avx_vextractf128v8sf:
11101 	      case CODE_FOR_avx_vextractf128v8si:
11102 	      case CODE_FOR_avx_vinsertf128v4df:
11103 	      case CODE_FOR_avx_vinsertf128v8sf:
11104 	      case CODE_FOR_avx_vinsertf128v8si:
11105 	      case CODE_FOR_avx512f_vinsertf64x4_mask:
11106 	      case CODE_FOR_avx512f_vinserti64x4_mask:
11107 	      case CODE_FOR_avx512f_vextractf64x4_mask:
11108 	      case CODE_FOR_avx512f_vextracti64x4_mask:
11109 	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
11110 	      case CODE_FOR_avx512dq_vinserti32x8_mask:
11111 	      case CODE_FOR_avx512vl_vinsertv4df:
11112 	      case CODE_FOR_avx512vl_vinsertv4di:
11113 	      case CODE_FOR_avx512vl_vinsertv8sf:
11114 	      case CODE_FOR_avx512vl_vinsertv8si:
11115 		error ("the last argument must be a 1-bit immediate");
11116 		return const0_rtx;
11117 
11118 	      case CODE_FOR_avx_vmcmpv2df3:
11119 	      case CODE_FOR_avx_vmcmpv4sf3:
11120 	      case CODE_FOR_avx_cmpv2df3:
11121 	      case CODE_FOR_avx_cmpv4sf3:
11122 	      case CODE_FOR_avx_cmpv4df3:
11123 	      case CODE_FOR_avx_cmpv8sf3:
11124 	      case CODE_FOR_avx512f_cmpv8df3_mask:
11125 	      case CODE_FOR_avx512f_cmpv16sf3_mask:
11126 	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
11127 	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11128 	      case CODE_FOR_avx512bw_cmpv32hf3_mask:
11129 	      case CODE_FOR_avx512vl_cmpv16hf3_mask:
11130 	      case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11131 		error ("the last argument must be a 5-bit immediate");
11132 		return const0_rtx;
11133 
11134 	      default:
11135 		switch (nargs_constant)
11136 		  {
11137 		  case 2:
11138 		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11139 			(!mask_pos && (nargs - i) == nargs_constant))
11140 		      {
11141 			error ("the next to last argument must be an 8-bit immediate");
11142 			break;
11143 		      }
11144 		    /* FALLTHRU */
11145 		  case 1:
11146 		    error ("the last argument must be an 8-bit immediate");
11147 		    break;
11148 		  default:
11149 		    gcc_unreachable ();
11150 		  }
11151 		return const0_rtx;
11152 	      }
11153 	}
11154       else
11155 	{
11156 	  if (VECTOR_MODE_P (mode))
11157 	    op = safe_vector_operand (op, mode);
11158 
11159 	  /* If we aren't optimizing, only allow one memory operand to
11160 	     be generated.  */
11161 	  if (memory_operand (op, mode))
11162 	    num_memory++;
11163 
11164 	  op = fixup_modeless_constant (op, mode);
11165 
11166 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11167 	    {
11168 	      if (optimize || !match || num_memory > 1)
11169 		op = copy_to_mode_reg (mode, op);
11170 	    }
11171 	  else
11172 	    {
11173 	      op = copy_to_reg (op);
11174 	      op = lowpart_subreg (mode, op, GET_MODE (op));
11175 	    }
11176 	}
11177 
11178       xops[i] = op;
11179     }
11180 
11181   switch (nargs)
11182     {
11183     case 1:
11184       pat = GEN_FCN (icode) (real_target, xops[0]);
11185       break;
11186     case 2:
11187       pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11188       break;
11189     case 3:
11190       pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11191       break;
11192     case 4:
11193       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11194 			     xops[2], xops[3]);
11195       break;
11196     case 5:
11197       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11198 			     xops[2], xops[3], xops[4]);
11199       break;
11200     case 6:
11201       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11202 			     xops[2], xops[3], xops[4], xops[5]);
11203       break;
11204     default:
11205       gcc_unreachable ();
11206     }
11207 
11208   if (! pat)
11209     return 0;
11210 
11211   emit_insn (pat);
11212   return target;
11213 }
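/* For illustration, a hypothetical caller (not part of this file) of one of
   the _COUNT builtins classified above: at the builtin level the shift
   count is an int, and the second_arg_count path widens or copies it so it
   satisfies the insn's count-operand predicate.  Assumes <emmintrin.h> and
   SSE2.

     #include <emmintrin.h>

     __m128i
     shift_words_left (__m128i v, int n)
     {
       // Expands via __builtin_ia32_psllwi128 (V8HI_FTYPE_V8HI_SI_COUNT);
       // a non-constant n ends up in a register as described above.
       return _mm_slli_epi16 (v, n);
     }
*/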
11214 
11215 /* Transform a pattern of the following layout:
11216      (set A
11217        (unspec [B C]
11218          UNSPEC_EMBEDDED_ROUNDING))
11219    into:
11220      (set A B)  */
11221 
11222 static rtx
11223 ix86_erase_embedded_rounding (rtx pat)
11224 {
11225   if (GET_CODE (pat) == INSN)
11226     pat = PATTERN (pat);
11227 
11228   gcc_assert (GET_CODE (pat) == SET);
11229   rtx src = SET_SRC (pat);
11230   gcc_assert (XVECLEN (src, 0) == 2);
11231   rtx p0 = XVECEXP (src, 0, 0);
11232   gcc_assert (GET_CODE (src) == UNSPEC
11233 	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11234   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11235   return res;
11236 }
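/* For illustration (not part of this file), the shape of the rewrite
   performed above when the rounding operand turns out to be NO_ROUND; the
   exact operand layout depends on the comi_round patterns in sse.md, so
   this is only a sketch:

     before:  (set (reg:CCFP flags)
                   (unspec [(compare:CCFP (reg:V4SF x) (reg:V4SF y))
                            (const_int NO_ROUND)]
                           UNSPEC_EMBEDDED_ROUNDING))
     after:   (set (reg:CCFP flags)
                   (compare:CCFP (reg:V4SF x) (reg:V4SF y)))
*/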
11237 
11238 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11239    with rounding.  */
11240 static rtx
11241 ix86_expand_sse_comi_round (const struct builtin_description *d,
11242 			    tree exp, rtx target)
11243 {
11244   rtx pat, set_dst;
11245   tree arg0 = CALL_EXPR_ARG (exp, 0);
11246   tree arg1 = CALL_EXPR_ARG (exp, 1);
11247   tree arg2 = CALL_EXPR_ARG (exp, 2);
11248   tree arg3 = CALL_EXPR_ARG (exp, 3);
11249   rtx op0 = expand_normal (arg0);
11250   rtx op1 = expand_normal (arg1);
11251   rtx op2 = expand_normal (arg2);
11252   rtx op3 = expand_normal (arg3);
11253   enum insn_code icode = d->icode;
11254   const struct insn_data_d *insn_p = &insn_data[icode];
11255   machine_mode mode0 = insn_p->operand[0].mode;
11256   machine_mode mode1 = insn_p->operand[1].mode;
11257 
11258   /* See avxintrin.h for values.  */
11259   static const enum rtx_code comparisons[32] =
11260     {
11261       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11262       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11263       EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11264       UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11265     };
11266   static const bool ordereds[32] =
11267     {
11268       true,  true,  true,  false, false, false, false, true,
11269       false, false, false, true,  true,  true,  true,  false,
11270       true,  true,  true,  false, false, false, false, true,
11271       false, false, false, true,  true,  true,  true,  false
11272     };
11273   static const bool non_signalings[32] =
11274     {
11275       true,  false, false, true,  true,  false, false, true,
11276       true,  false, false, true,  true,  false, false, true,
11277       false, true,  true,  false, false, true,  true,  false,
11278       false, true,  true,  false, false, true,  true,  false
11279     };
11280 
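  /* For example (illustrative): _CMP_LT_OS is 1, so comparisons[1] is LT,
     ordereds[1] is true and non_signalings[1] is false, i.e. an ordered,
     signaling less-than that is expanded with COMI and swapped to GT
     below.  */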
11281   if (!CONST_INT_P (op2))
11282     {
11283       error ("the third argument must be a comparison constant");
11284       return const0_rtx;
11285     }
11286   if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11287     {
11288       error ("incorrect comparison mode");
11289       return const0_rtx;
11290     }
11291 
11292   if (!insn_p->operand[2].predicate (op3, SImode))
11293     {
11294       error ("incorrect rounding operand");
11295       return const0_rtx;
11296     }
11297 
11298   if (VECTOR_MODE_P (mode0))
11299     op0 = safe_vector_operand (op0, mode0);
11300   if (VECTOR_MODE_P (mode1))
11301     op1 = safe_vector_operand (op1, mode1);
11302 
11303   enum rtx_code comparison = comparisons[INTVAL (op2)];
11304   bool ordered = ordereds[INTVAL (op2)];
11305   bool non_signaling = non_signalings[INTVAL (op2)];
11306   rtx const_val = const0_rtx;
11307 
11308   bool check_unordered = false;
11309   machine_mode mode = CCFPmode;
11310   switch (comparison)
11311     {
11312     case ORDERED:
11313       if (!ordered)
11314 	{
11315 	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
11316 	  if (!non_signaling)
11317 	    ordered = true;
11318 	  mode = CCSmode;
11319 	}
11320       else
11321 	{
11322 	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
11323 	  if (non_signaling)
11324 	    ordered = false;
11325 	  mode = CCPmode;
11326 	}
11327       comparison = NE;
11328       break;
11329     case UNORDERED:
11330       if (ordered)
11331 	{
11332 	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
11333 	  if (non_signaling)
11334 	    ordered = false;
11335 	  mode = CCSmode;
11336 	}
11337       else
11338 	{
11339 	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
11340 	  if (!non_signaling)
11341 	    ordered = true;
11342 	  mode = CCPmode;
11343 	}
11344       comparison = EQ;
11345       break;
11346 
11347     case LE:	/* -> GE  */
11348     case LT:	/* -> GT  */
11349     case UNGE:	/* -> UNLE  */
11350     case UNGT:	/* -> UNLT  */
11351       std::swap (op0, op1);
11352       comparison = swap_condition (comparison);
11353       /* FALLTHRU */
11354     case GT:
11355     case GE:
11356     case UNEQ:
11357     case UNLT:
11358     case UNLE:
11359     case LTGT:
11360       /* These are supported by CCFPmode.  NB: Use ordered/signaling
11361 	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
11362 	 with NAN operands.  */
11363       if (ordered == non_signaling)
11364 	ordered = !ordered;
11365       break;
11366     case EQ:
11367       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
11368 	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
11369       check_unordered = true;
11370       mode = CCZmode;
11371       break;
11372     case NE:
11373       /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
11374 	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
11375       gcc_assert (!ordered);
11376       check_unordered = true;
11377       mode = CCZmode;
11378       const_val = const1_rtx;
11379       break;
11380     default:
11381       gcc_unreachable ();
11382     }
11383 
11384   target = gen_reg_rtx (SImode);
11385   emit_move_insn (target, const_val);
11386   target = gen_rtx_SUBREG (QImode, target, 0);
11387 
11388   if ((optimize && !register_operand (op0, mode0))
11389       || !insn_p->operand[0].predicate (op0, mode0))
11390     op0 = copy_to_mode_reg (mode0, op0);
11391   if ((optimize && !register_operand (op1, mode1))
11392       || !insn_p->operand[1].predicate (op1, mode1))
11393     op1 = copy_to_mode_reg (mode1, op1);
11394 
11395   /*
11396      1. COMI: ordered and signaling.
11397      2. UCOMI: unordered and non-signaling.
11398    */
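  /* Sketch of the selection below, assuming the avxintrin.h predicate names:
     a non-signaling predicate such as _CMP_NEQ_UQ switches to the UCOMI
     variant of the comparison insn, while a signaling one such as _CMP_GE_OS
     keeps the COMI variant named in the builtin description.  */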
11399   if (non_signaling)
11400     icode = (icode == CODE_FOR_sse_comi_round
11401 	     ? CODE_FOR_sse_ucomi_round
11402 	     : CODE_FOR_sse2_ucomi_round);
11403 
11404   pat = GEN_FCN (icode) (op0, op1, op3);
11405   if (! pat)
11406     return 0;
11407 
11408   /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
11409   if (INTVAL (op3) == NO_ROUND)
11410     {
11411       pat = ix86_erase_embedded_rounding (pat);
11412       if (! pat)
11413 	return 0;
11414 
11415       set_dst = SET_DEST (pat);
11416     }
11417   else
11418     {
11419       gcc_assert (GET_CODE (pat) == SET);
11420       set_dst = SET_DEST (pat);
11421     }
11422 
11423   emit_insn (pat);
11424 
11425   rtx_code_label *label = NULL;
11426 
11427   /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
11428      with NAN operands.  */
11429   if (check_unordered)
11430     {
11431       gcc_assert (comparison == EQ || comparison == NE);
11432 
11433       rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
11434       label = gen_label_rtx ();
11435       rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
11436       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
11437 				  gen_rtx_LABEL_REF (VOIDmode, label),
11438 				  pc_rtx);
11439       emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
11440     }
11441 
11442   /* NB: Set CCFPmode and check a different CCmode which is in subset
11443      of CCFPmode.  */
11444   if (GET_MODE (set_dst) != mode)
11445     {
11446       gcc_assert (mode == CCAmode || mode == CCCmode
11447 		  || mode == CCOmode || mode == CCPmode
11448 		  || mode == CCSmode || mode == CCZmode);
11449       set_dst = gen_rtx_REG (mode, FLAGS_REG);
11450     }
11451 
11452   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11453 			  gen_rtx_fmt_ee (comparison, QImode,
11454 					  set_dst,
11455 					  const0_rtx)));
11456 
11457   if (label)
11458     emit_label (label);
11459 
11460   return SUBREG_REG (target);
11461 }
11462 
11463 static rtx
11464 ix86_expand_round_builtin (const struct builtin_description *d,
11465 			   tree exp, rtx target)
11466 {
11467   rtx pat;
11468   unsigned int i, nargs;
11469   rtx xops[6];
11470   enum insn_code icode = d->icode;
11471   const struct insn_data_d *insn_p = &insn_data[icode];
11472   machine_mode tmode = insn_p->operand[0].mode;
11473   unsigned int nargs_constant = 0;
11474   unsigned int redundant_embed_rnd = 0;
11475 
11476   switch ((enum ix86_builtin_func_type) d->flag)
11477     {
11478     case UINT64_FTYPE_V2DF_INT:
11479     case UINT64_FTYPE_V4SF_INT:
11480     case UINT64_FTYPE_V8HF_INT:
11481     case UINT_FTYPE_V2DF_INT:
11482     case UINT_FTYPE_V4SF_INT:
11483     case UINT_FTYPE_V8HF_INT:
11484     case INT64_FTYPE_V2DF_INT:
11485     case INT64_FTYPE_V4SF_INT:
11486     case INT64_FTYPE_V8HF_INT:
11487     case INT_FTYPE_V2DF_INT:
11488     case INT_FTYPE_V4SF_INT:
11489     case INT_FTYPE_V8HF_INT:
11490       nargs = 2;
11491       break;
11492     case V32HF_FTYPE_V32HF_V32HF_INT:
11493     case V8HF_FTYPE_V8HF_V8HF_INT:
11494     case V8HF_FTYPE_V8HF_INT_INT:
11495     case V8HF_FTYPE_V8HF_UINT_INT:
11496     case V8HF_FTYPE_V8HF_INT64_INT:
11497     case V8HF_FTYPE_V8HF_UINT64_INT:
11498     case V4SF_FTYPE_V4SF_UINT_INT:
11499     case V4SF_FTYPE_V4SF_UINT64_INT:
11500     case V2DF_FTYPE_V2DF_UINT64_INT:
11501     case V4SF_FTYPE_V4SF_INT_INT:
11502     case V4SF_FTYPE_V4SF_INT64_INT:
11503     case V2DF_FTYPE_V2DF_INT64_INT:
11504     case V4SF_FTYPE_V4SF_V4SF_INT:
11505     case V2DF_FTYPE_V2DF_V2DF_INT:
11506     case V4SF_FTYPE_V4SF_V2DF_INT:
11507     case V2DF_FTYPE_V2DF_V4SF_INT:
11508       nargs = 3;
11509       break;
11510     case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11511     case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11512     case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11513     case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11514     case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11515     case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11516     case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11517     case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11518     case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11519     case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11520     case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11521     case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11522     case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11523     case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11524     case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11525     case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11526     case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11527     case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11528     case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11529     case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11530     case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11531     case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11532     case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11533     case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11534     case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11535     case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11536     case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11537       nargs = 4;
11538       break;
11539     case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11540     case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11541       nargs_constant = 2;
11542       nargs = 4;
11543       break;
11544     case INT_FTYPE_V4SF_V4SF_INT_INT:
11545     case INT_FTYPE_V2DF_V2DF_INT_INT:
11546       return ix86_expand_sse_comi_round (d, exp, target);
11547     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11548     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11549     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11550     case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11551     case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11552     case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11553     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11554     case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11555     case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11556     case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11557     case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11558     case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11559     case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11560     case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11561     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11562     case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11563     case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11564       nargs = 5;
11565       break;
11566     case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11567     case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11568     case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11569     case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11570     case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11571       nargs_constant = 4;
11572       nargs = 5;
11573       break;
11574     case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11575     case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11576     case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11577     case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11578     case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11579     case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11580       nargs_constant = 3;
11581       nargs = 5;
11582       break;
11583     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11584     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11585     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11586     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11587     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11588     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11589     case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11590       nargs = 6;
11591       nargs_constant = 4;
11592       break;
11593     case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11594     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11595     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11596     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11597       nargs = 6;
11598       nargs_constant = 3;
11599       break;
11600     default:
11601       gcc_unreachable ();
11602     }
11603   gcc_assert (nargs <= ARRAY_SIZE (xops));
11604 
11605   if (optimize
11606       || target == 0
11607       || GET_MODE (target) != tmode
11608       || !insn_p->operand[0].predicate (target, tmode))
11609     target = gen_reg_rtx (tmode);
11610 
11611   for (i = 0; i < nargs; i++)
11612     {
11613       tree arg = CALL_EXPR_ARG (exp, i);
11614       rtx op = expand_normal (arg);
11615       machine_mode mode = insn_p->operand[i + 1].mode;
11616       bool match = insn_p->operand[i + 1].predicate (op, mode);
11617 
11618       if (i == nargs - nargs_constant)
11619 	{
11620 	  if (!match)
11621 	    {
11622 	      switch (icode)
11623 		{
11624 		case CODE_FOR_avx512f_getmantv8df_mask_round:
11625 		case CODE_FOR_avx512f_getmantv16sf_mask_round:
11626 		case CODE_FOR_avx512bw_getmantv32hf_mask_round:
11627 		case CODE_FOR_avx512f_vgetmantv2df_round:
11628 		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11629 		case CODE_FOR_avx512f_vgetmantv4sf_round:
11630 		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
11631 		case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
11632 		  error ("the immediate argument must be a 4-bit immediate");
11633 		  return const0_rtx;
11634 		case CODE_FOR_avx512f_cmpv8df3_mask_round:
11635 		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11636 		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11637 		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
11638 		case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11639 		case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
11640 		  error ("the immediate argument must be a 5-bit immediate");
11641 		  return const0_rtx;
11642 		default:
11643 		  error ("the immediate argument must be an 8-bit immediate");
11644 		  return const0_rtx;
11645 		}
11646 	    }
11647 	}
11648       else if (i == nargs-1)
11649 	{
11650 	  if (!insn_p->operand[nargs].predicate (op, SImode))
11651 	    {
11652 	      error ("incorrect rounding operand");
11653 	      return const0_rtx;
11654 	    }
11655 
11656 	  /* If there is no rounding use normal version of the pattern.  */
11657 	  if (INTVAL (op) == NO_ROUND)
11658 	    {
11659 	      /* Skip erasing embedded rounding for the expanders below, which
11660 		 generate multiple insns.  In ix86_erase_embedded_rounding the
11661 		 pattern will be transformed to a single set, and emit_insn
11662 		 appends that set instead of inserting it into the chain, so the
11663 		 insns emitted inside the define_expand would be ignored.  */
11664 	      switch (icode)
11665 		{
11666 		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11667 		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11668 		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11669 		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11670 		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11671 		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11672 		  redundant_embed_rnd = 0;
11673 		  break;
11674 		default:
11675 		  redundant_embed_rnd = 1;
11676 		  break;
11677 		}
11678 	    }
11679 	}
11680       else
11681 	{
11682 	  if (VECTOR_MODE_P (mode))
11683 	    op = safe_vector_operand (op, mode);
11684 
11685 	  op = fixup_modeless_constant (op, mode);
11686 
11687 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11688 	    {
11689 	      if (optimize || !match)
11690 		op = copy_to_mode_reg (mode, op);
11691 	    }
11692 	  else
11693 	    {
11694 	      op = copy_to_reg (op);
11695 	      op = lowpart_subreg (mode, op, GET_MODE (op));
11696 	    }
11697 	}
11698 
11699       xops[i] = op;
11700     }
11701 
11702   switch (nargs)
11703     {
11704     case 1:
11705       pat = GEN_FCN (icode) (target, xops[0]);
11706       break;
11707     case 2:
11708       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
11709       break;
11710     case 3:
11711       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
11712       break;
11713     case 4:
11714       pat = GEN_FCN (icode) (target, xops[0], xops[1],
11715 			     xops[2], xops[3]);
11716       break;
11717     case 5:
11718       pat = GEN_FCN (icode) (target, xops[0], xops[1],
11719 			     xops[2], xops[3], xops[4]);
11720       break;
11721     case 6:
11722       pat = GEN_FCN (icode) (target, xops[0], xops[1],
11723 			     xops[2], xops[3], xops[4], xops[5]);
11724       break;
11725     default:
11726       gcc_unreachable ();
11727     }
11728 
11729   if (!pat)
11730     return 0;
11731 
11732   if (redundant_embed_rnd)
11733     pat = ix86_erase_embedded_rounding (pat);
11734 
11735   emit_insn (pat);
11736   return target;
11737 }
11738 
11739 /* Subroutine of ix86_expand_builtin to take care of special insns
11740    with variable number of operands.  */
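/* For illustration (assuming the usual builtin descriptions): a non-temporal
   store like __builtin_ia32_movntdq, declared VOID_FTYPE_PV2DI_V2DI with
   icode CODE_FOR_sse2_movntv2di, is treated below as a one-argument
   "store"-class builtin whose memory operand must be aligned, so the
   expander raises MEM_ALIGN on the destination to GET_MODE_ALIGNMENT of
   the vector mode.  */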
11741 
11742 static rtx
11743 ix86_expand_special_args_builtin (const struct builtin_description *d,
11744 				  tree exp, rtx target)
11745 {
11746   tree arg;
11747   rtx pat, op;
11748   unsigned int i, nargs, arg_adjust, memory;
11749   bool aligned_mem = false;
11750   rtx xops[3];
11751   enum insn_code icode = d->icode;
11752   const struct insn_data_d *insn_p = &insn_data[icode];
11753   machine_mode tmode = insn_p->operand[0].mode;
11754   enum { load, store } klass;
11755 
11756   switch ((enum ix86_builtin_func_type) d->flag)
11757     {
11758     case VOID_FTYPE_VOID:
11759       emit_insn (GEN_FCN (icode) (target));
11760       return 0;
11761     case VOID_FTYPE_UINT64:
11762     case VOID_FTYPE_UNSIGNED:
11763       nargs = 0;
11764       klass = store;
11765       memory = 0;
11766       break;
11767 
11768     case INT_FTYPE_VOID:
11769     case USHORT_FTYPE_VOID:
11770     case UINT64_FTYPE_VOID:
11771     case UINT_FTYPE_VOID:
11772     case UINT8_FTYPE_VOID:
11773     case UNSIGNED_FTYPE_VOID:
11774       nargs = 0;
11775       klass = load;
11776       memory = 0;
11777       break;
11778     case UINT64_FTYPE_PUNSIGNED:
11779     case V2DI_FTYPE_PV2DI:
11780     case V4DI_FTYPE_PV4DI:
11781     case V32QI_FTYPE_PCCHAR:
11782     case V16QI_FTYPE_PCCHAR:
11783     case V8SF_FTYPE_PCV4SF:
11784     case V8SF_FTYPE_PCFLOAT:
11785     case V4SF_FTYPE_PCFLOAT:
11786     case V4DF_FTYPE_PCV2DF:
11787     case V4DF_FTYPE_PCDOUBLE:
11788     case V2DF_FTYPE_PCDOUBLE:
11789     case VOID_FTYPE_PVOID:
11790     case V8DI_FTYPE_PV8DI:
11791       nargs = 1;
11792       klass = load;
11793       memory = 0;
11794       switch (icode)
11795 	{
11796 	case CODE_FOR_sse4_1_movntdqa:
11797 	case CODE_FOR_avx2_movntdqa:
11798 	case CODE_FOR_avx512f_movntdqa:
11799 	  aligned_mem = true;
11800 	  break;
11801 	default:
11802 	  break;
11803 	}
11804       break;
11805     case VOID_FTYPE_PV2SF_V4SF:
11806     case VOID_FTYPE_PV8DI_V8DI:
11807     case VOID_FTYPE_PV4DI_V4DI:
11808     case VOID_FTYPE_PV2DI_V2DI:
11809     case VOID_FTYPE_PCHAR_V32QI:
11810     case VOID_FTYPE_PCHAR_V16QI:
11811     case VOID_FTYPE_PFLOAT_V16SF:
11812     case VOID_FTYPE_PFLOAT_V8SF:
11813     case VOID_FTYPE_PFLOAT_V4SF:
11814     case VOID_FTYPE_PDOUBLE_V8DF:
11815     case VOID_FTYPE_PDOUBLE_V4DF:
11816     case VOID_FTYPE_PDOUBLE_V2DF:
11817     case VOID_FTYPE_PLONGLONG_LONGLONG:
11818     case VOID_FTYPE_PULONGLONG_ULONGLONG:
11819     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
11820     case VOID_FTYPE_PINT_INT:
11821       nargs = 1;
11822       klass = store;
11823       /* Reserve memory operand for target.  */
11824       memory = ARRAY_SIZE (xops);
11825       switch (icode)
11826 	{
11827 	/* These builtins and instructions require the memory
11828 	   to be properly aligned.  */
11829 	case CODE_FOR_avx_movntv4di:
11830 	case CODE_FOR_sse2_movntv2di:
11831 	case CODE_FOR_avx_movntv8sf:
11832 	case CODE_FOR_sse_movntv4sf:
11833 	case CODE_FOR_sse4a_vmmovntv4sf:
11834 	case CODE_FOR_avx_movntv4df:
11835 	case CODE_FOR_sse2_movntv2df:
11836 	case CODE_FOR_sse4a_vmmovntv2df:
11837 	case CODE_FOR_sse2_movntidi:
11838 	case CODE_FOR_sse_movntq:
11839 	case CODE_FOR_sse2_movntisi:
11840 	case CODE_FOR_avx512f_movntv16sf:
11841 	case CODE_FOR_avx512f_movntv8df:
11842 	case CODE_FOR_avx512f_movntv8di:
11843 	  aligned_mem = true;
11844 	  break;
11845 	default:
11846 	  break;
11847 	}
11848       break;
11849     case VOID_FTYPE_PVOID_PCVOID:
11850       nargs = 1;
11851       klass = store;
11852       memory = 0;
11853 
11854       break;
11855     case V4SF_FTYPE_V4SF_PCV2SF:
11856     case V2DF_FTYPE_V2DF_PCDOUBLE:
11857       nargs = 2;
11858       klass = load;
11859       memory = 1;
11860       break;
11861     case V8SF_FTYPE_PCV8SF_V8SI:
11862     case V4DF_FTYPE_PCV4DF_V4DI:
11863     case V4SF_FTYPE_PCV4SF_V4SI:
11864     case V2DF_FTYPE_PCV2DF_V2DI:
11865     case V8SI_FTYPE_PCV8SI_V8SI:
11866     case V4DI_FTYPE_PCV4DI_V4DI:
11867     case V4SI_FTYPE_PCV4SI_V4SI:
11868     case V2DI_FTYPE_PCV2DI_V2DI:
11869     case VOID_FTYPE_INT_INT64:
11870       nargs = 2;
11871       klass = load;
11872       memory = 0;
11873       break;
11874     case VOID_FTYPE_PV8DF_V8DF_UQI:
11875     case VOID_FTYPE_PV4DF_V4DF_UQI:
11876     case VOID_FTYPE_PV2DF_V2DF_UQI:
11877     case VOID_FTYPE_PV16SF_V16SF_UHI:
11878     case VOID_FTYPE_PV8SF_V8SF_UQI:
11879     case VOID_FTYPE_PV4SF_V4SF_UQI:
11880     case VOID_FTYPE_PV8DI_V8DI_UQI:
11881     case VOID_FTYPE_PV4DI_V4DI_UQI:
11882     case VOID_FTYPE_PV2DI_V2DI_UQI:
11883     case VOID_FTYPE_PV16SI_V16SI_UHI:
11884     case VOID_FTYPE_PV8SI_V8SI_UQI:
11885     case VOID_FTYPE_PV4SI_V4SI_UQI:
11886     case VOID_FTYPE_PV64QI_V64QI_UDI:
11887     case VOID_FTYPE_PV32HI_V32HI_USI:
11888     case VOID_FTYPE_PV32QI_V32QI_USI:
11889     case VOID_FTYPE_PV16QI_V16QI_UHI:
11890     case VOID_FTYPE_PV16HI_V16HI_UHI:
11891     case VOID_FTYPE_PV8HI_V8HI_UQI:
11892       switch (icode)
11893 	{
11894 	/* These builtins and instructions require the memory
11895 	   to be properly aligned.  */
11896 	case CODE_FOR_avx512f_storev16sf_mask:
11897 	case CODE_FOR_avx512f_storev16si_mask:
11898 	case CODE_FOR_avx512f_storev8df_mask:
11899 	case CODE_FOR_avx512f_storev8di_mask:
11900 	case CODE_FOR_avx512vl_storev8sf_mask:
11901 	case CODE_FOR_avx512vl_storev8si_mask:
11902 	case CODE_FOR_avx512vl_storev4df_mask:
11903 	case CODE_FOR_avx512vl_storev4di_mask:
11904 	case CODE_FOR_avx512vl_storev4sf_mask:
11905 	case CODE_FOR_avx512vl_storev4si_mask:
11906 	case CODE_FOR_avx512vl_storev2df_mask:
11907 	case CODE_FOR_avx512vl_storev2di_mask:
11908 	  aligned_mem = true;
11909 	  break;
11910 	default:
11911 	  break;
11912 	}
11913       /* FALLTHRU */
11914     case VOID_FTYPE_PV8SF_V8SI_V8SF:
11915     case VOID_FTYPE_PV4DF_V4DI_V4DF:
11916     case VOID_FTYPE_PV4SF_V4SI_V4SF:
11917     case VOID_FTYPE_PV2DF_V2DI_V2DF:
11918     case VOID_FTYPE_PV8SI_V8SI_V8SI:
11919     case VOID_FTYPE_PV4DI_V4DI_V4DI:
11920     case VOID_FTYPE_PV4SI_V4SI_V4SI:
11921     case VOID_FTYPE_PV2DI_V2DI_V2DI:
11922     case VOID_FTYPE_PV8SI_V8DI_UQI:
11923     case VOID_FTYPE_PV8HI_V8DI_UQI:
11924     case VOID_FTYPE_PV16HI_V16SI_UHI:
11925     case VOID_FTYPE_PUDI_V8DI_UQI:
11926     case VOID_FTYPE_PV16QI_V16SI_UHI:
11927     case VOID_FTYPE_PV4SI_V4DI_UQI:
11928     case VOID_FTYPE_PUDI_V2DI_UQI:
11929     case VOID_FTYPE_PUDI_V4DI_UQI:
11930     case VOID_FTYPE_PUSI_V2DI_UQI:
11931     case VOID_FTYPE_PV8HI_V8SI_UQI:
11932     case VOID_FTYPE_PUDI_V4SI_UQI:
11933     case VOID_FTYPE_PUSI_V4DI_UQI:
11934     case VOID_FTYPE_PUHI_V2DI_UQI:
11935     case VOID_FTYPE_PUDI_V8SI_UQI:
11936     case VOID_FTYPE_PUSI_V4SI_UQI:
11937     case VOID_FTYPE_PCHAR_V64QI_UDI:
11938     case VOID_FTYPE_PCHAR_V32QI_USI:
11939     case VOID_FTYPE_PCHAR_V16QI_UHI:
11940     case VOID_FTYPE_PSHORT_V32HI_USI:
11941     case VOID_FTYPE_PSHORT_V16HI_UHI:
11942     case VOID_FTYPE_PSHORT_V8HI_UQI:
11943     case VOID_FTYPE_PINT_V16SI_UHI:
11944     case VOID_FTYPE_PINT_V8SI_UQI:
11945     case VOID_FTYPE_PINT_V4SI_UQI:
11946     case VOID_FTYPE_PINT64_V8DI_UQI:
11947     case VOID_FTYPE_PINT64_V4DI_UQI:
11948     case VOID_FTYPE_PINT64_V2DI_UQI:
11949     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
11950     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
11951     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
11952     case VOID_FTYPE_PFLOAT_V16SF_UHI:
11953     case VOID_FTYPE_PFLOAT_V8SF_UQI:
11954     case VOID_FTYPE_PFLOAT_V4SF_UQI:
11955     case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
11956     case VOID_FTYPE_PV32QI_V32HI_USI:
11957     case VOID_FTYPE_PV16QI_V16HI_UHI:
11958     case VOID_FTYPE_PUDI_V8HI_UQI:
11959       nargs = 2;
11960       klass = store;
11961       /* Reserve memory operand for target.  */
11962       memory = ARRAY_SIZE (xops);
11963       break;
11964     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
11965     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
11966     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
11967     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
11968     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
11969     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
11970     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
11971     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
11972     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
11973     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
11974     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
11975     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
11976     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
11977     case V32HI_FTYPE_PCV32HI_V32HI_USI:
11978     case V32QI_FTYPE_PCV32QI_V32QI_USI:
11979     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
11980     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
11981     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
11982       switch (icode)
11983 	{
11984 	/* These builtins and instructions require the memory
11985 	   to be properly aligned.  */
11986 	case CODE_FOR_avx512f_loadv16sf_mask:
11987 	case CODE_FOR_avx512f_loadv16si_mask:
11988 	case CODE_FOR_avx512f_loadv8df_mask:
11989 	case CODE_FOR_avx512f_loadv8di_mask:
11990 	case CODE_FOR_avx512vl_loadv8sf_mask:
11991 	case CODE_FOR_avx512vl_loadv8si_mask:
11992 	case CODE_FOR_avx512vl_loadv4df_mask:
11993 	case CODE_FOR_avx512vl_loadv4di_mask:
11994 	case CODE_FOR_avx512vl_loadv4sf_mask:
11995 	case CODE_FOR_avx512vl_loadv4si_mask:
11996 	case CODE_FOR_avx512vl_loadv2df_mask:
11997 	case CODE_FOR_avx512vl_loadv2di_mask:
11998 	case CODE_FOR_avx512bw_loadv64qi_mask:
11999 	case CODE_FOR_avx512vl_loadv32qi_mask:
12000 	case CODE_FOR_avx512vl_loadv16qi_mask:
12001 	case CODE_FOR_avx512bw_loadv32hi_mask:
12002 	case CODE_FOR_avx512vl_loadv16hi_mask:
12003 	case CODE_FOR_avx512vl_loadv8hi_mask:
12004 	  aligned_mem = true;
12005 	  break;
12006 	default:
12007 	  break;
12008 	}
12009       /* FALLTHRU */
12010     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12011     case V32QI_FTYPE_PCCHAR_V32QI_USI:
12012     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12013     case V32HI_FTYPE_PCSHORT_V32HI_USI:
12014     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12015     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12016     case V16SI_FTYPE_PCINT_V16SI_UHI:
12017     case V8SI_FTYPE_PCINT_V8SI_UQI:
12018     case V4SI_FTYPE_PCINT_V4SI_UQI:
12019     case V8DI_FTYPE_PCINT64_V8DI_UQI:
12020     case V4DI_FTYPE_PCINT64_V4DI_UQI:
12021     case V2DI_FTYPE_PCINT64_V2DI_UQI:
12022     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12023     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12024     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12025     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12026     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12027     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12028     case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12029       nargs = 3;
12030       klass = load;
12031       memory = 0;
12032       break;
12033     default:
12034       gcc_unreachable ();
12035     }
12036 
12037   gcc_assert (nargs <= ARRAY_SIZE (xops));
12038 
12039   if (klass == store)
12040     {
12041       arg = CALL_EXPR_ARG (exp, 0);
12042       op = expand_normal (arg);
12043       gcc_assert (target == 0);
12044       if (memory)
12045 	{
12046 	  op = ix86_zero_extend_to_Pmode (op);
12047 	  target = gen_rtx_MEM (tmode, op);
12048 	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12049 	     on it.  Try to improve it using get_pointer_alignment,
12050 	     and if the special builtin is one that requires strict
12051 	     mode alignment, also from its GET_MODE_ALIGNMENT.
12052 	     Failure to do so could lead to ix86_legitimate_combined_insn
12053 	     rejecting all changes to such insns.  */
12054 	  unsigned int align = get_pointer_alignment (arg);
12055 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12056 	    align = GET_MODE_ALIGNMENT (tmode);
12057 	  if (MEM_ALIGN (target) < align)
12058 	    set_mem_align (target, align);
12059 	}
12060       else
12061 	target = force_reg (tmode, op);
12062       arg_adjust = 1;
12063     }
12064   else
12065     {
12066       arg_adjust = 0;
12067       if (optimize
12068 	  || target == 0
12069 	  || !register_operand (target, tmode)
12070 	  || GET_MODE (target) != tmode)
12071 	target = gen_reg_rtx (tmode);
12072     }
12073 
12074   for (i = 0; i < nargs; i++)
12075     {
12076       machine_mode mode = insn_p->operand[i + 1].mode;
12077 
12078       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12079       op = expand_normal (arg);
12080 
12081       if (i == memory)
12082 	{
12083 	  /* This must be the memory operand.  */
12084 	  op = ix86_zero_extend_to_Pmode (op);
12085 	  op = gen_rtx_MEM (mode, op);
12086 	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12087 	     on it.  Try to improve it using get_pointer_alignment,
12088 	     and if the special builtin is one that requires strict
12089 	     mode alignment, also from its GET_MODE_ALIGNMENT.
12090 	     Failure to do so could lead to ix86_legitimate_combined_insn
12091 	     rejecting all changes to such insns.  */
12092 	  unsigned int align = get_pointer_alignment (arg);
12093 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12094 	    align = GET_MODE_ALIGNMENT (mode);
12095 	  if (MEM_ALIGN (op) < align)
12096 	    set_mem_align (op, align);
12097 	}
12098       else
12099 	{
12100 	  /* This must be register.  */
12101 	  if (VECTOR_MODE_P (mode))
12102 	    op = safe_vector_operand (op, mode);
12103 
12104 	  op = fixup_modeless_constant (op, mode);
12105 
12106 	  /* NB: A 3-operand load implies a mask load or v{p}expand*,
12107 	     and that mask operand should be at the end.
12108 	     Keep an all-ones mask, which would be simplified by the expander.  */
12109 	  if (nargs == 3 && i == 2 && klass == load
12110 	      && constm1_operand (op, mode)
12111 	      && insn_p->operand[i].predicate (op, mode))
12112 	    ;
12113 	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12114 	    op = copy_to_mode_reg (mode, op);
12115 	  else
12116 	    {
12117 	      op = copy_to_reg (op);
12118 	      op = lowpart_subreg (mode, op, GET_MODE (op));
12119 	    }
12120 	}
12121 
12122       xops[i] = op;
12123     }
12124 
12125   switch (nargs)
12126     {
12127     case 0:
12128       pat = GEN_FCN (icode) (target);
12129       break;
12130     case 1:
12131       pat = GEN_FCN (icode) (target, xops[0]);
12132       break;
12133     case 2:
12134       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12135       break;
12136     case 3:
12137       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12138       break;
12139     default:
12140       gcc_unreachable ();
12141     }
12142 
12143   if (! pat)
12144     return 0;
12145 
12146   emit_insn (pat);
12147   return klass == store ? 0 : target;
12148 }
12149 
12150 /* Return the integer constant in ARG.  Constrain it to be in the range
12151    of the subparts of VEC_TYPE; issue an error if not.  */
12152 
12153 static int
12154 get_element_number (tree vec_type, tree arg)
12155 {
12156   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12157 
12158   if (!tree_fits_uhwi_p (arg)
12159       || (elt = tree_to_uhwi (arg), elt > max))
12160     {
12161       error ("selector must be an integer constant in the range "
12162 	     "[0, %wi]", max);
12163       return 0;
12164     }
12165 
12166   return elt;
12167 }
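/* Usage sketch: for a vector type with four subparts, such as V4SF, valid
   selectors are 0 through 3; an out-of-range or non-constant selector
   triggers the "selector must be an integer constant" error above and the
   caller falls back to element 0.  */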
12168 
12169 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
12170    ix86_expand_vector_init.  We DO have language-level syntax for this, in
12171    the form of  (type){ init-list }.  Except that since we can't place emms
12172    instructions from inside the compiler, we can't allow the use of MMX
12173    registers unless the user explicitly asks for it.  So we do *not* define
12174    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
12175    we have builtins invoked by mmintrin.h that gives us license to emit
12176    these sorts of instructions.  */
12177 
12178 static rtx
12179 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12180 {
12181   machine_mode tmode = TYPE_MODE (type);
12182   machine_mode inner_mode = GET_MODE_INNER (tmode);
12183   int i, n_elt = GET_MODE_NUNITS (tmode);
12184   rtvec v = rtvec_alloc (n_elt);
12185 
12186   gcc_assert (VECTOR_MODE_P (tmode));
12187   gcc_assert (call_expr_nargs (exp) == n_elt);
12188 
12189   for (i = 0; i < n_elt; ++i)
12190     {
12191       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12192       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12193     }
12194 
12195   if (!target || !register_operand (target, tmode))
12196     target = gen_reg_rtx (tmode);
12197 
12198   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12199   return target;
12200 }
12201 
12202 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
12203    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
12204    had a language-level syntax for referencing vector elements.  */
12205 
12206 static rtx
12207 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12208 {
12209   machine_mode tmode, mode0;
12210   tree arg0, arg1;
12211   int elt;
12212   rtx op0;
12213 
12214   arg0 = CALL_EXPR_ARG (exp, 0);
12215   arg1 = CALL_EXPR_ARG (exp, 1);
12216 
12217   op0 = expand_normal (arg0);
12218   elt = get_element_number (TREE_TYPE (arg0), arg1);
12219 
12220   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12221   mode0 = TYPE_MODE (TREE_TYPE (arg0));
12222   gcc_assert (VECTOR_MODE_P (mode0));
12223 
12224   op0 = force_reg (mode0, op0);
12225 
12226   if (optimize || !target || !register_operand (target, tmode))
12227     target = gen_reg_rtx (tmode);
12228 
12229   ix86_expand_vector_extract (true, target, op0, elt);
12230 
12231   return target;
12232 }
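/* Usage sketch (assuming the usual intrinsic mapping): an extraction such as
   __builtin_ia32_vec_ext_v4hi, used by _mm_extract_pi16, arrives here and is
   lowered through ix86_expand_vector_extract once get_element_number has
   validated the selector against the vector's subparts.  */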
12233 
12234 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
12235    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
12236    a language-level syntax for referencing vector elements.  */
12237 
12238 static rtx
12239 ix86_expand_vec_set_builtin (tree exp)
12240 {
12241   machine_mode tmode, mode1;
12242   tree arg0, arg1, arg2;
12243   int elt;
12244   rtx op0, op1, target;
12245 
12246   arg0 = CALL_EXPR_ARG (exp, 0);
12247   arg1 = CALL_EXPR_ARG (exp, 1);
12248   arg2 = CALL_EXPR_ARG (exp, 2);
12249 
12250   tmode = TYPE_MODE (TREE_TYPE (arg0));
12251   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12252   gcc_assert (VECTOR_MODE_P (tmode));
12253 
12254   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12255   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12256   elt = get_element_number (TREE_TYPE (arg0), arg2);
12257 
12258   if (GET_MODE (op1) != mode1)
12259     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12260 
12261   op0 = force_reg (tmode, op0);
12262   op1 = force_reg (mode1, op1);
12263 
12264   /* OP0 is the source of these builtin functions and shouldn't be
12265      modified.  Create a copy, use it and return it as target.  */
12266   target = gen_reg_rtx (tmode);
12267   emit_move_insn (target, op0);
12268   ix86_expand_vector_set (true, target, op1, elt);
12269 
12270   return target;
12271 }
12272 
12273 /* Return true if the necessary isa options for this builtin exist,
12274    else false.
12275    fcode = DECL_MD_FUNCTION_CODE (fndecl);  */
12276 bool
12277 ix86_check_builtin_isa_match (unsigned int fcode,
12278 			      HOST_WIDE_INT* pbisa,
12279 			      HOST_WIDE_INT* pbisa2)
12280 {
12281   HOST_WIDE_INT isa = ix86_isa_flags;
12282   HOST_WIDE_INT isa2 = ix86_isa_flags2;
12283   HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12284   HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12285   /* The general case is we require all the ISAs specified in bisa{,2}
12286      to be enabled.
12287      The exceptions are:
12288      OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12289      OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12290      OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12291      (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12292        OPTION_MASK_ISA2_AVXVNNI
12293      where for each such pair it is sufficient if either of the ISAs is
12294      enabled, plus, if ORed with other options, those other options as well.
12295      OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
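  /* As a concrete reading of the AVX512VNNI/AVXVNNI pair handled below: a
     builtin whose description asks for AVX512VNNI | AVX512VL is also accepted
     when only AVXVNNI is enabled, and one asking for AVXVNNI is accepted when
     both AVX512VNNI and AVX512VL are enabled.  */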
12296   if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12297        == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12298       && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12299     isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
12300 
12301   if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12302        == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12303       && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12304     isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
12305 
12306   if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12307        == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12308       && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12309     isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
12310 
12311   if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12312 	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12313        || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12314       && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12315 	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12316 	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12317     {
12318       isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12319       isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12320     }
12321 
12322   if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12323       /* __builtin_ia32_maskmovq requires MMX registers.  */
12324       && fcode != IX86_BUILTIN_MASKMOVQ)
12325     {
12326       bisa &= ~OPTION_MASK_ISA_MMX;
12327       bisa |= OPTION_MASK_ISA_SSE2;
12328     }
12329 
12330   if (pbisa)
12331     *pbisa = bisa;
12332   if (pbisa2)
12333     *pbisa2 = bisa2;
12334 
12335   return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12336 }
12337 
12338 /* Expand an expression EXP that calls a built-in function,
12339    with result going to TARGET if that's convenient
12340    (and in mode MODE if that's convenient).
12341    SUBTARGET may be used as the target for computing one of EXP's operands.
12342    IGNORE is nonzero if the value is to be ignored.  */
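/* Dispatch sketch: CPU feature builtins are folded first, then the required
   ISA bits are checked via ix86_check_builtin_isa_match (reporting the
   "needs isa option" error on mismatch), and only then are the individual
   IX86_BUILTIN_* cases expanded.  */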
12343 
12344 rtx
12345 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12346 		     machine_mode mode, int ignore)
12347 {
12348   size_t i;
12349   enum insn_code icode, icode2;
12350   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12351   tree arg0, arg1, arg2, arg3, arg4;
12352   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12353   machine_mode mode0, mode1, mode2, mode3, mode4;
12354   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12355   HOST_WIDE_INT bisa, bisa2;
12356 
12357   /* For CPU builtins that can be folded, fold first and expand the fold.  */
12358   switch (fcode)
12359     {
12360     case IX86_BUILTIN_CPU_INIT:
12361       {
12362 	/* Make it call __cpu_indicator_init in libgcc.  */
12363 	tree call_expr, fndecl, type;
12364 	type = build_function_type_list (integer_type_node, NULL_TREE);
12365 	fndecl = build_fn_decl ("__cpu_indicator_init", type);
12366 	call_expr = build_call_expr (fndecl, 0);
12367 	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12368       }
12369     case IX86_BUILTIN_CPU_IS:
12370     case IX86_BUILTIN_CPU_SUPPORTS:
12371       {
12372 	tree arg0 = CALL_EXPR_ARG (exp, 0);
12373 	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12374 	gcc_assert (fold_expr != NULL_TREE);
12375 	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12376       }
12377     }
12378 
12379   if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12380     {
12381       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12382       if (TARGET_ABI_X32)
12383 	bisa |= OPTION_MASK_ABI_X32;
12384       else
12385 	bisa |= OPTION_MASK_ABI_64;
12386       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12387 				       (enum fpmath_unit) 0,
12388 				       (enum prefer_vector_width) 0,
12389 				       PVW_NONE, PVW_NONE,
12390 				       false, add_abi_p);
12391       if (!opts)
12392 	error ("%qE needs unknown isa option", fndecl);
12393       else
12394 	{
12395 	  gcc_assert (opts != NULL);
12396 	  error ("%qE needs isa option %s", fndecl, opts);
12397 	  free (opts);
12398 	}
12399       return expand_call (exp, target, ignore);
12400     }
12401 
12402   switch (fcode)
12403     {
12404     case IX86_BUILTIN_MASKMOVQ:
12405     case IX86_BUILTIN_MASKMOVDQU:
12406       icode = (fcode == IX86_BUILTIN_MASKMOVQ
12407 	       ? CODE_FOR_mmx_maskmovq
12408 	       : CODE_FOR_sse2_maskmovdqu);
12409       /* Note the arg order is different from the operand order.  */
12410       arg1 = CALL_EXPR_ARG (exp, 0);
12411       arg2 = CALL_EXPR_ARG (exp, 1);
12412       arg0 = CALL_EXPR_ARG (exp, 2);
12413       op0 = expand_normal (arg0);
12414       op1 = expand_normal (arg1);
12415       op2 = expand_normal (arg2);
12416       mode0 = insn_data[icode].operand[0].mode;
12417       mode1 = insn_data[icode].operand[1].mode;
12418       mode2 = insn_data[icode].operand[2].mode;
12419 
12420       op0 = ix86_zero_extend_to_Pmode (op0);
12421       op0 = gen_rtx_MEM (mode1, op0);
12422 
12423       if (!insn_data[icode].operand[0].predicate (op0, mode0))
12424 	op0 = copy_to_mode_reg (mode0, op0);
12425       if (!insn_data[icode].operand[1].predicate (op1, mode1))
12426 	op1 = copy_to_mode_reg (mode1, op1);
12427       if (!insn_data[icode].operand[2].predicate (op2, mode2))
12428 	op2 = copy_to_mode_reg (mode2, op2);
12429       pat = GEN_FCN (icode) (op0, op1, op2);
12430       if (! pat)
12431 	return 0;
12432       emit_insn (pat);
12433       return 0;
12434 
12435     case IX86_BUILTIN_LDMXCSR:
12436       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12437       target = assign_386_stack_local (SImode, SLOT_TEMP);
12438       emit_move_insn (target, op0);
12439       emit_insn (gen_sse_ldmxcsr (target));
12440       return 0;
12441 
12442     case IX86_BUILTIN_STMXCSR:
12443       target = assign_386_stack_local (SImode, SLOT_TEMP);
12444       emit_insn (gen_sse_stmxcsr (target));
12445       return copy_to_mode_reg (SImode, target);
12446 
12447     case IX86_BUILTIN_CLFLUSH:
12448 	arg0 = CALL_EXPR_ARG (exp, 0);
12449 	op0 = expand_normal (arg0);
12450 	icode = CODE_FOR_sse2_clflush;
12451 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12452 	  op0 = ix86_zero_extend_to_Pmode (op0);
12453 
12454 	emit_insn (gen_sse2_clflush (op0));
12455 	return 0;
12456 
12457     case IX86_BUILTIN_CLWB:
12458 	arg0 = CALL_EXPR_ARG (exp, 0);
12459 	op0 = expand_normal (arg0);
12460 	icode = CODE_FOR_clwb;
12461 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12462 	  op0 = ix86_zero_extend_to_Pmode (op0);
12463 
12464 	emit_insn (gen_clwb (op0));
12465 	return 0;
12466 
12467     case IX86_BUILTIN_CLFLUSHOPT:
12468 	arg0 = CALL_EXPR_ARG (exp, 0);
12469 	op0 = expand_normal (arg0);
12470 	icode = CODE_FOR_clflushopt;
12471 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12472 	  op0 = ix86_zero_extend_to_Pmode (op0);
12473 
12474 	emit_insn (gen_clflushopt (op0));
12475 	return 0;
12476 
12477     case IX86_BUILTIN_MONITOR:
12478     case IX86_BUILTIN_MONITORX:
12479       arg0 = CALL_EXPR_ARG (exp, 0);
12480       arg1 = CALL_EXPR_ARG (exp, 1);
12481       arg2 = CALL_EXPR_ARG (exp, 2);
12482       op0 = expand_normal (arg0);
12483       op1 = expand_normal (arg1);
12484       op2 = expand_normal (arg2);
12485       if (!REG_P (op0))
12486 	op0 = ix86_zero_extend_to_Pmode (op0);
12487       if (!REG_P (op1))
12488 	op1 = copy_to_mode_reg (SImode, op1);
12489       if (!REG_P (op2))
12490 	op2 = copy_to_mode_reg (SImode, op2);
12491 
12492       emit_insn (fcode == IX86_BUILTIN_MONITOR
12493 		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12494 		 : gen_monitorx (Pmode, op0, op1, op2));
12495       return 0;
12496 
12497     case IX86_BUILTIN_MWAIT:
12498       arg0 = CALL_EXPR_ARG (exp, 0);
12499       arg1 = CALL_EXPR_ARG (exp, 1);
12500       op0 = expand_normal (arg0);
12501       op1 = expand_normal (arg1);
12502       if (!REG_P (op0))
12503 	op0 = copy_to_mode_reg (SImode, op0);
12504       if (!REG_P (op1))
12505 	op1 = copy_to_mode_reg (SImode, op1);
12506       emit_insn (gen_sse3_mwait (op0, op1));
12507       return 0;
12508 
12509     case IX86_BUILTIN_MWAITX:
12510       arg0 = CALL_EXPR_ARG (exp, 0);
12511       arg1 = CALL_EXPR_ARG (exp, 1);
12512       arg2 = CALL_EXPR_ARG (exp, 2);
12513       op0 = expand_normal (arg0);
12514       op1 = expand_normal (arg1);
12515       op2 = expand_normal (arg2);
12516       if (!REG_P (op0))
12517 	op0 = copy_to_mode_reg (SImode, op0);
12518       if (!REG_P (op1))
12519 	op1 = copy_to_mode_reg (SImode, op1);
12520       if (!REG_P (op2))
12521 	op2 = copy_to_mode_reg (SImode, op2);
12522       emit_insn (gen_mwaitx (op0, op1, op2));
12523       return 0;
12524 
12525     case IX86_BUILTIN_UMONITOR:
12526       arg0 = CALL_EXPR_ARG (exp, 0);
12527       op0 = expand_normal (arg0);
12528 
12529       op0 = ix86_zero_extend_to_Pmode (op0);
12530       emit_insn (gen_umonitor (Pmode, op0));
12531       return 0;
12532 
12533     case IX86_BUILTIN_UMWAIT:
12534     case IX86_BUILTIN_TPAUSE:
12535       arg0 = CALL_EXPR_ARG (exp, 0);
12536       arg1 = CALL_EXPR_ARG (exp, 1);
12537       op0 = expand_normal (arg0);
12538       op1 = expand_normal (arg1);
12539 
12540       if (!REG_P (op0))
12541 	op0 = copy_to_mode_reg (SImode, op0);
12542 
12543       op1 = force_reg (DImode, op1);
12544 
12545       if (TARGET_64BIT)
12546 	{
12547 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12548 				     NULL, 1, OPTAB_DIRECT);
12549 	  switch (fcode)
12550 	    {
12551 	    case IX86_BUILTIN_UMWAIT:
12552 	      icode = CODE_FOR_umwait_rex64;
12553 	      break;
12554 	    case IX86_BUILTIN_TPAUSE:
12555 	      icode = CODE_FOR_tpause_rex64;
12556 	      break;
12557 	    default:
12558 	      gcc_unreachable ();
12559 	    }
12560 
12561 	  op2 = gen_lowpart (SImode, op2);
12562 	  op1 = gen_lowpart (SImode, op1);
12563 	  pat = GEN_FCN (icode) (op0, op1, op2);
12564 	}
12565       else
12566 	{
12567 	  switch (fcode)
12568 	    {
12569 	    case IX86_BUILTIN_UMWAIT:
12570 	      icode = CODE_FOR_umwait;
12571 	      break;
12572 	    case IX86_BUILTIN_TPAUSE:
12573 	      icode = CODE_FOR_tpause;
12574 	      break;
12575 	    default:
12576 	      gcc_unreachable ();
12577 	    }
12578 	  pat = GEN_FCN (icode) (op0, op1);
12579 	}
12580 
12581       if (!pat)
12582 	return 0;
12583 
12584       emit_insn (pat);
12585 
12586       if (target == 0
12587 	  || !register_operand (target, QImode))
12588 	target = gen_reg_rtx (QImode);
12589 
12590       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12591 			const0_rtx);
12592       emit_insn (gen_rtx_SET (target, pat));
12593 
12594       return target;
12595 
12596     case IX86_BUILTIN_TESTUI:
12597       emit_insn (gen_testui ());
12598 
12599       if (target == 0
12600 	  || !register_operand (target, QImode))
12601 	target = gen_reg_rtx (QImode);
12602 
12603       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12604 			 const0_rtx);
12605       emit_insn (gen_rtx_SET (target, pat));
12606 
12607       return target;
12608 
12609     case IX86_BUILTIN_CLZERO:
12610       arg0 = CALL_EXPR_ARG (exp, 0);
12611       op0 = expand_normal (arg0);
12612       if (!REG_P (op0))
12613 	op0 = ix86_zero_extend_to_Pmode (op0);
12614       emit_insn (gen_clzero (Pmode, op0));
12615       return 0;
12616 
12617     case IX86_BUILTIN_CLDEMOTE:
12618       arg0 = CALL_EXPR_ARG (exp, 0);
12619       op0 = expand_normal (arg0);
12620       icode = CODE_FOR_cldemote;
12621       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12622 	op0 = ix86_zero_extend_to_Pmode (op0);
12623 
12624       emit_insn (gen_cldemote (op0));
12625       return 0;
12626 
12627     case IX86_BUILTIN_LOADIWKEY:
12628       {
12629 	arg0 = CALL_EXPR_ARG (exp, 0);
12630 	arg1 = CALL_EXPR_ARG (exp, 1);
12631 	arg2 = CALL_EXPR_ARG (exp, 2);
12632 	arg3 = CALL_EXPR_ARG (exp, 3);
12633 
12634 	op0 = expand_normal (arg0);
12635 	op1 = expand_normal (arg1);
12636 	op2 = expand_normal (arg2);
12637 	op3 = expand_normal (arg3);
12638 
12639 	if (!REG_P (op0))
12640 	  op0 = copy_to_mode_reg (V2DImode, op0);
12641 	if (!REG_P (op1))
12642 	  op1 = copy_to_mode_reg (V2DImode, op1);
12643 	if (!REG_P (op2))
12644 	  op2 = copy_to_mode_reg (V2DImode, op2);
12645 	if (!REG_P (op3))
12646 	  op3 = copy_to_mode_reg (SImode, op3);
12647 
12648 	emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12649 
12650 	return 0;
12651       }
12652 
12653     case IX86_BUILTIN_AESDEC128KLU8:
12654       icode = CODE_FOR_aesdec128klu8;
12655       goto aesdecenc_expand;
12656 
12657     case IX86_BUILTIN_AESDEC256KLU8:
12658       icode = CODE_FOR_aesdec256klu8;
12659       goto aesdecenc_expand;
12660 
12661     case IX86_BUILTIN_AESENC128KLU8:
12662       icode = CODE_FOR_aesenc128klu8;
12663       goto aesdecenc_expand;
12664 
12665     case IX86_BUILTIN_AESENC256KLU8:
12666       icode = CODE_FOR_aesenc256klu8;
12667 
12668     aesdecenc_expand:
12669 
12670       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12671       arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12672       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12673 
12674       op0 = expand_normal (arg0);
12675       op1 = expand_normal (arg1);
12676       op2 = expand_normal (arg2);
12677 
12678       if (!address_operand (op0, V2DImode))
12679 	{
12680 	  op0 = convert_memory_address (Pmode, op0);
12681 	  op0 = copy_addr_to_reg (op0);
12682 	}
12683       op0 = gen_rtx_MEM (V2DImode, op0);
12684 
12685       if (!REG_P (op1))
12686 	op1 = copy_to_mode_reg (V2DImode, op1);
12687 
12688       if (!address_operand (op2, VOIDmode))
12689 	{
12690 	  op2 = convert_memory_address (Pmode, op2);
12691 	  op2 = copy_addr_to_reg (op2);
12692 	}
12693       op2 = gen_rtx_MEM (BLKmode, op2);
12694 
12695       emit_insn (GEN_FCN (icode) (op1, op1, op2));
12696 
12697       if (target == 0)
12698 	target = gen_reg_rtx (QImode);
12699 
12700       /* NB: For the aesenc/aesdec Key Locker insns, ZF will be set when a
12701 	 runtime error occurs; the output should then be cleared for safety.  */
12702       rtx_code_label *ok_label;
12703       rtx tmp;
12704 
12705       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12706       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12707       ok_label = gen_label_rtx ();
12708       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12709 			       true, ok_label);
12710       /* The runtime error seldom occurs, so predict the OK path as hot
12711 	 so that it is laid out as the fallthrough block.  */
12712       predict_jump (REG_BR_PROB_BASE * 90 / 100);
12713 
12714       emit_insn (gen_rtx_SET (op1, const0_rtx));
12715 
12716       emit_label (ok_label);
12717       emit_insn (gen_rtx_SET (target, pat));
12718       emit_insn (gen_rtx_SET (op0, op1));
12719 
12720       return target;
12721 
12722     case IX86_BUILTIN_AESDECWIDE128KLU8:
12723       icode = CODE_FOR_aesdecwide128klu8;
12724       goto wideaesdecenc_expand;
12725 
12726     case IX86_BUILTIN_AESDECWIDE256KLU8:
12727       icode = CODE_FOR_aesdecwide256klu8;
12728       goto wideaesdecenc_expand;
12729 
12730     case IX86_BUILTIN_AESENCWIDE128KLU8:
12731       icode = CODE_FOR_aesencwide128klu8;
12732       goto wideaesdecenc_expand;
12733 
12734     case IX86_BUILTIN_AESENCWIDE256KLU8:
12735       icode = CODE_FOR_aesencwide256klu8;
12736 
12737     wideaesdecenc_expand:
12738 
12739       rtx xmm_regs[8];
12740       rtx op;
12741 
12742       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
12743       arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
12744       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12745 
12746       op0 = expand_normal (arg0);
12747       op1 = expand_normal (arg1);
12748       op2 = expand_normal (arg2);
12749 
12750       if (!address_operand (op2, VOIDmode))
12751 	{
12752 	  op2 = convert_memory_address (Pmode, op2);
12753 	  op2 = copy_addr_to_reg (op2);
12754 	}
12755       op2 = gen_rtx_MEM (BLKmode, op2);
12756 
12757       for (i = 0; i < 8; i++)
12758 	{
12759 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12760 
12761 	  op = gen_rtx_MEM (V2DImode,
12762 			    plus_constant (Pmode, op1, (i * 16)));
12763 
12764 	  emit_move_insn (xmm_regs[i], op);
12765 	}
12766 
12767       emit_insn (GEN_FCN (icode) (op2));
12768 
12769       if (target == 0)
12770 	target = gen_reg_rtx (QImode);
12771 
12772       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12773       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12774       ok_label = gen_label_rtx ();
12775       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12776 			       true, ok_label);
12777       predict_jump (REG_BR_PROB_BASE * 90 / 100);
12778 
12779       for (i = 0; i < 8; i++)
12780 	emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
12781 
12782       emit_label (ok_label);
12783       emit_insn (gen_rtx_SET (target, pat));
12784 
12785       for (i = 0; i < 8; i++)
12786 	{
12787 	  op = gen_rtx_MEM (V2DImode,
12788 			    plus_constant (Pmode, op0, (i * 16)));
12789 	  emit_move_insn (op, xmm_regs[i]);
12790 	}
12791 
12792       return target;
12793 
12794     case IX86_BUILTIN_ENCODEKEY128U32:
12795       {
12796 	rtx op, xmm_regs[7];
12797 
12798 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12799 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
12800 	arg2 = CALL_EXPR_ARG (exp, 2); // void *h
12801 
12802 	op0 = expand_normal (arg0);
12803 	op1 = expand_normal (arg1);
12804 	op2 = expand_normal (arg2);
12805 
12806 	if (!REG_P (op0))
12807 	  op0 = copy_to_mode_reg (SImode, op0);
12808 
12809 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12810 	emit_move_insn (op, op1);
12811 
12812 	for (i = 0; i < 3; i++)
12813 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12814 
12815 	if (target == 0)
12816 	  target = gen_reg_rtx (SImode);
12817 
12818 	emit_insn (gen_encodekey128u32 (target, op0));
12819 
12820 	for (i = 0; i < 3; i++)
12821 	  {
12822 	    op = gen_rtx_MEM (V2DImode,
12823 			      plus_constant (Pmode, op2, (i * 16)));
12824 	    emit_move_insn (op, xmm_regs[i]);
12825 	  }
12826 
12827 	return target;
12828       }
12829     case IX86_BUILTIN_ENCODEKEY256U32:
12830       {
12831 	rtx op, xmm_regs[7];
12832 
12833 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12834 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
12835 	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
12836 	arg3 = CALL_EXPR_ARG (exp, 3); // void *h
12837 
12838 	op0 = expand_normal (arg0);
12839 	op1 = expand_normal (arg1);
12840 	op2 = expand_normal (arg2);
12841 	op3 = expand_normal (arg3);
12842 
12843 	if (!REG_P (op0))
12844 	  op0 = copy_to_mode_reg (SImode, op0);
12845 
12846 	/* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
12847 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12848 	emit_move_insn (op, op1);
12849 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
12850 	emit_move_insn (op, op2);
12851 
12852 	for (i = 0; i < 4; i++)
12853 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12854 
12855 	if (target == 0)
12856 	  target = gen_reg_rtx (SImode);
12857 
12858 	emit_insn (gen_encodekey256u32 (target, op0));
12859 
12860 	for (i = 0; i < 4; i++)
12861 	  {
12862 	    op = gen_rtx_MEM (V2DImode,
12863 			      plus_constant (Pmode, op3, (i * 16)));
12864 	    emit_move_insn (op, xmm_regs[i]);
12865 	  }
12866 
12867 	return target;
12868       }
12869 
12870     case IX86_BUILTIN_VEC_INIT_V2SI:
12871     case IX86_BUILTIN_VEC_INIT_V4HI:
12872     case IX86_BUILTIN_VEC_INIT_V8QI:
12873       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
12874 
12875     case IX86_BUILTIN_VEC_EXT_V2DF:
12876     case IX86_BUILTIN_VEC_EXT_V2DI:
12877     case IX86_BUILTIN_VEC_EXT_V4SF:
12878     case IX86_BUILTIN_VEC_EXT_V4SI:
12879     case IX86_BUILTIN_VEC_EXT_V8HI:
12880     case IX86_BUILTIN_VEC_EXT_V2SI:
12881     case IX86_BUILTIN_VEC_EXT_V4HI:
12882     case IX86_BUILTIN_VEC_EXT_V16QI:
12883       return ix86_expand_vec_ext_builtin (exp, target);
12884 
12885     case IX86_BUILTIN_VEC_SET_V2DI:
12886     case IX86_BUILTIN_VEC_SET_V4SF:
12887     case IX86_BUILTIN_VEC_SET_V4SI:
12888     case IX86_BUILTIN_VEC_SET_V8HI:
12889     case IX86_BUILTIN_VEC_SET_V4HI:
12890     case IX86_BUILTIN_VEC_SET_V16QI:
12891       return ix86_expand_vec_set_builtin (exp);
12892 
12893     case IX86_BUILTIN_NANQ:
12894     case IX86_BUILTIN_NANSQ:
12895       return expand_call (exp, target, ignore);
12896 
12897     case IX86_BUILTIN_RDPID:
12898 
12899       op0 = gen_reg_rtx (word_mode);
12900 
12901       if (TARGET_64BIT)
12902 	{
12903 	  insn = gen_rdpid_rex64 (op0);
12904 	  op0 = convert_to_mode (SImode, op0, 1);
12905 	}
12906       else
12907 	insn = gen_rdpid (op0);
12908 
12909       emit_insn (insn);
12910 
12911       if (target == 0
12912 	  || !register_operand (target, SImode))
12913 	target = gen_reg_rtx (SImode);
12914 
12915       emit_move_insn (target, op0);
12916       return target;
12917 
12918     case IX86_BUILTIN_2INTERSECTD512:
12919     case IX86_BUILTIN_2INTERSECTQ512:
12920     case IX86_BUILTIN_2INTERSECTD256:
12921     case IX86_BUILTIN_2INTERSECTQ256:
12922     case IX86_BUILTIN_2INTERSECTD128:
12923     case IX86_BUILTIN_2INTERSECTQ128:
12924       arg0 = CALL_EXPR_ARG (exp, 0);
12925       arg1 = CALL_EXPR_ARG (exp, 1);
12926       arg2 = CALL_EXPR_ARG (exp, 2);
12927       arg3 = CALL_EXPR_ARG (exp, 3);
12928       op0 = expand_normal (arg0);
12929       op1 = expand_normal (arg1);
12930       op2 = expand_normal (arg2);
12931       op3 = expand_normal (arg3);
12932 
12933       if (!address_operand (op0, VOIDmode))
12934 	{
12935 	  op0 = convert_memory_address (Pmode, op0);
12936 	  op0 = copy_addr_to_reg (op0);
12937 	}
12938       if (!address_operand (op1, VOIDmode))
12939 	{
12940 	  op1 = convert_memory_address (Pmode, op1);
12941 	  op1 = copy_addr_to_reg (op1);
12942 	}
12943 
12944       switch (fcode)
12945 	{
12946 	case IX86_BUILTIN_2INTERSECTD512:
12947 	  mode4 = P2HImode;
12948 	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
12949 	  break;
12950 	case IX86_BUILTIN_2INTERSECTQ512:
12951 	  mode4 = P2QImode;
12952 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
12953 	  break;
12954 	case IX86_BUILTIN_2INTERSECTD256:
12955 	  mode4 = P2QImode;
12956 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
12957 	  break;
12958 	case IX86_BUILTIN_2INTERSECTQ256:
12959 	  mode4 = P2QImode;
12960 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
12961 	  break;
12962 	case IX86_BUILTIN_2INTERSECTD128:
12963 	  mode4 = P2QImode;
12964 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
12965 	  break;
12966 	case IX86_BUILTIN_2INTERSECTQ128:
12967 	  mode4 = P2QImode;
12968 	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
12969 	  break;
12970 	default:
12971 	  gcc_unreachable ();
12972 	}
12973 
12974       mode2 = insn_data[icode].operand[1].mode;
12975       mode3 = insn_data[icode].operand[2].mode;
12976       if (!insn_data[icode].operand[1].predicate (op2, mode2))
12977 	op2 = copy_to_mode_reg (mode2, op2);
12978       if (!insn_data[icode].operand[2].predicate (op3, mode3))
12979 	op3 = copy_to_mode_reg (mode3, op3);
12980 
12981       op4 = gen_reg_rtx (mode4);
12982       emit_insn (GEN_FCN (icode) (op4, op2, op3));
12983       mode0 = mode4 == P2HImode ? HImode : QImode;
12984       emit_move_insn (gen_rtx_MEM (mode0, op0),
12985 		      gen_lowpart (mode0, op4));
12986       emit_move_insn (gen_rtx_MEM (mode0, op1),
12987 		      gen_highpart (mode0, op4));
12988 
12989       return 0;
12990 
12991     case IX86_BUILTIN_RDPMC:
12992     case IX86_BUILTIN_RDTSC:
12993     case IX86_BUILTIN_RDTSCP:
12994     case IX86_BUILTIN_XGETBV:
12995 
12996       op0 = gen_reg_rtx (DImode);
12997       op1 = gen_reg_rtx (DImode);
12998 
12999       if (fcode == IX86_BUILTIN_RDPMC)
13000 	{
13001 	  arg0 = CALL_EXPR_ARG (exp, 0);
13002 	  op2 = expand_normal (arg0);
13003 	  if (!register_operand (op2, SImode))
13004 	    op2 = copy_to_mode_reg (SImode, op2);
13005 
13006 	  insn = (TARGET_64BIT
13007 		  ? gen_rdpmc_rex64 (op0, op1, op2)
13008 		  : gen_rdpmc (op0, op2));
13009 	  emit_insn (insn);
13010 	}
13011       else if (fcode == IX86_BUILTIN_XGETBV)
13012 	{
13013 	  arg0 = CALL_EXPR_ARG (exp, 0);
13014 	  op2 = expand_normal (arg0);
13015 	  if (!register_operand (op2, SImode))
13016 	    op2 = copy_to_mode_reg (SImode, op2);
13017 
13018 	  insn = (TARGET_64BIT
13019 		  ? gen_xgetbv_rex64 (op0, op1, op2)
13020 		  : gen_xgetbv (op0, op2));
13021 	  emit_insn (insn);
13022 	}
13023       else if (fcode == IX86_BUILTIN_RDTSC)
13024 	{
13025 	  insn = (TARGET_64BIT
13026 		  ? gen_rdtsc_rex64 (op0, op1)
13027 		  : gen_rdtsc (op0));
13028 	  emit_insn (insn);
13029 	}
13030       else
13031 	{
13032 	  op2 = gen_reg_rtx (SImode);
13033 
13034 	  insn = (TARGET_64BIT
13035 		  ? gen_rdtscp_rex64 (op0, op1, op2)
13036 		  : gen_rdtscp (op0, op2));
13037 	  emit_insn (insn);
13038 
13039 	  arg0 = CALL_EXPR_ARG (exp, 0);
13040 	  op4 = expand_normal (arg0);
13041 	  if (!address_operand (op4, VOIDmode))
13042 	    {
13043 	      op4 = convert_memory_address (Pmode, op4);
13044 	      op4 = copy_addr_to_reg (op4);
13045 	    }
13046 	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13047 	}
13048 
13049       if (target == 0
13050 	  || !register_operand (target, DImode))
13051         target = gen_reg_rtx (DImode);
13052 
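      /* On 64-bit targets the two 32-bit result halves arrive
	 zero-extended in DImode registers; shift the high half up and
	 OR it in to form the full 64-bit return value.  */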
13053       if (TARGET_64BIT)
13054 	{
13055 	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13056 				     op1, 1, OPTAB_DIRECT);
13057 	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
13058 				     op0, 1, OPTAB_DIRECT);
13059 	}
13060 
13061       emit_move_insn (target, op0);
13062       return target;
13063 
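    /* These builtins move 64 bytes (an XImode MEM) from *arg1 to the
       destination address in arg0.  MOVDIR64B returns nothing; ENQCMD and
       ENQCMDS report their status in ZF, which is copied into the low
       byte of the result below.  */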
13064     case IX86_BUILTIN_ENQCMD:
13065     case IX86_BUILTIN_ENQCMDS:
13066     case IX86_BUILTIN_MOVDIR64B:
13067 
13068       arg0 = CALL_EXPR_ARG (exp, 0);
13069       arg1 = CALL_EXPR_ARG (exp, 1);
13070       op0 = expand_normal (arg0);
13071       op1 = expand_normal (arg1);
13072 
13073       op0 = ix86_zero_extend_to_Pmode (op0);
13074       if (!address_operand (op1, VOIDmode))
13075 	{
13076 	  op1 = convert_memory_address (Pmode, op1);
13077 	  op1 = copy_addr_to_reg (op1);
13078 	}
13079       op1 = gen_rtx_MEM (XImode, op1);
13080 
13081       if (fcode == IX86_BUILTIN_MOVDIR64B)
13082 	{
13083 	  emit_insn (gen_movdir64b (Pmode, op0, op1));
13084 	  return 0;
13085 	}
13086       else
13087 	{
13088 	  if (target == 0
13089 	      || !register_operand (target, SImode))
13090 	    target = gen_reg_rtx (SImode);
13091 
13092 	  emit_move_insn (target, const0_rtx);
13093 	  target = gen_rtx_SUBREG (QImode, target, 0);
13094 
13095 	  int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13096 			 ? UNSPECV_ENQCMD
13097 			 : UNSPECV_ENQCMDS);
13098 	  icode = code_for_enqcmd (unspecv, Pmode);
13099 	  emit_insn (GEN_FCN (icode) (op0, op1));
13100 
13101 	  emit_insn
13102 	    (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13103 			  gen_rtx_fmt_ee (EQ, QImode,
13104 					  gen_rtx_REG (CCZmode, FLAGS_REG),
13105 					  const0_rtx)));
13106 	  return SUBREG_REG (target);
13107 	}
13108 
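    /* Each of these takes a single pointer argument; the operand is
       wrapped in a BLKmode MEM because the save area is an opaque,
       hardware-defined block rather than a scalar object.  */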
13109     case IX86_BUILTIN_FXSAVE:
13110     case IX86_BUILTIN_FXRSTOR:
13111     case IX86_BUILTIN_FXSAVE64:
13112     case IX86_BUILTIN_FXRSTOR64:
13113     case IX86_BUILTIN_FNSTENV:
13114     case IX86_BUILTIN_FLDENV:
13115       mode0 = BLKmode;
13116       switch (fcode)
13117 	{
13118 	case IX86_BUILTIN_FXSAVE:
13119 	  icode = CODE_FOR_fxsave;
13120 	  break;
13121 	case IX86_BUILTIN_FXRSTOR:
13122 	  icode = CODE_FOR_fxrstor;
13123 	  break;
13124 	case IX86_BUILTIN_FXSAVE64:
13125 	  icode = CODE_FOR_fxsave64;
13126 	  break;
13127 	case IX86_BUILTIN_FXRSTOR64:
13128 	  icode = CODE_FOR_fxrstor64;
13129 	  break;
13130 	case IX86_BUILTIN_FNSTENV:
13131 	  icode = CODE_FOR_fnstenv;
13132 	  break;
13133 	case IX86_BUILTIN_FLDENV:
13134 	  icode = CODE_FOR_fldenv;
13135 	  break;
13136 	default:
13137 	  gcc_unreachable ();
13138 	}
13139 
13140       arg0 = CALL_EXPR_ARG (exp, 0);
13141       op0 = expand_normal (arg0);
13142 
13143       if (!address_operand (op0, VOIDmode))
13144 	{
13145 	  op0 = convert_memory_address (Pmode, op0);
13146 	  op0 = copy_addr_to_reg (op0);
13147 	}
13148       op0 = gen_rtx_MEM (mode0, op0);
13149 
13150       pat = GEN_FCN (icode) (op0);
13151       if (pat)
13152 	emit_insn (pat);
13153       return 0;
13154 
13155     case IX86_BUILTIN_XSETBV:
13156       arg0 = CALL_EXPR_ARG (exp, 0);
13157       arg1 = CALL_EXPR_ARG (exp, 1);
13158       op0 = expand_normal (arg0);
13159       op1 = expand_normal (arg1);
13160 
13161       if (!REG_P (op0))
13162 	op0 = copy_to_mode_reg (SImode, op0);
13163 
13164       op1 = force_reg (DImode, op1);
13165 
13166       if (TARGET_64BIT)
13167 	{
13168 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13169 				     NULL, 1, OPTAB_DIRECT);
13170 
13171 	  icode = CODE_FOR_xsetbv_rex64;
13172 
13173 	  op2 = gen_lowpart (SImode, op2);
13174 	  op1 = gen_lowpart (SImode, op1);
13175 	  pat = GEN_FCN (icode) (op0, op1, op2);
13176 	}
13177       else
13178 	{
13179 	  icode = CODE_FOR_xsetbv;
13180 
13181 	  pat = GEN_FCN (icode) (op0, op1);
13182 	}
13183       if (pat)
13184 	emit_insn (pat);
13185       return 0;
13186 
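    /* For the 64-bit patterns the DImode feature mask in arg1 is split
       into two SImode halves (EDX:EAX); the 32-bit patterns consume the
       DImode mask directly.  */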
13187     case IX86_BUILTIN_XSAVE:
13188     case IX86_BUILTIN_XRSTOR:
13189     case IX86_BUILTIN_XSAVE64:
13190     case IX86_BUILTIN_XRSTOR64:
13191     case IX86_BUILTIN_XSAVEOPT:
13192     case IX86_BUILTIN_XSAVEOPT64:
13193     case IX86_BUILTIN_XSAVES:
13194     case IX86_BUILTIN_XRSTORS:
13195     case IX86_BUILTIN_XSAVES64:
13196     case IX86_BUILTIN_XRSTORS64:
13197     case IX86_BUILTIN_XSAVEC:
13198     case IX86_BUILTIN_XSAVEC64:
13199       arg0 = CALL_EXPR_ARG (exp, 0);
13200       arg1 = CALL_EXPR_ARG (exp, 1);
13201       op0 = expand_normal (arg0);
13202       op1 = expand_normal (arg1);
13203 
13204       if (!address_operand (op0, VOIDmode))
13205 	{
13206 	  op0 = convert_memory_address (Pmode, op0);
13207 	  op0 = copy_addr_to_reg (op0);
13208 	}
13209       op0 = gen_rtx_MEM (BLKmode, op0);
13210 
13211       op1 = force_reg (DImode, op1);
13212 
13213       if (TARGET_64BIT)
13214 	{
13215 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13216 				     NULL, 1, OPTAB_DIRECT);
13217 	  switch (fcode)
13218 	    {
13219 	    case IX86_BUILTIN_XSAVE:
13220 	      icode = CODE_FOR_xsave_rex64;
13221 	      break;
13222 	    case IX86_BUILTIN_XRSTOR:
13223 	      icode = CODE_FOR_xrstor_rex64;
13224 	      break;
13225 	    case IX86_BUILTIN_XSAVE64:
13226 	      icode = CODE_FOR_xsave64;
13227 	      break;
13228 	    case IX86_BUILTIN_XRSTOR64:
13229 	      icode = CODE_FOR_xrstor64;
13230 	      break;
13231 	    case IX86_BUILTIN_XSAVEOPT:
13232 	      icode = CODE_FOR_xsaveopt_rex64;
13233 	      break;
13234 	    case IX86_BUILTIN_XSAVEOPT64:
13235 	      icode = CODE_FOR_xsaveopt64;
13236 	      break;
13237 	    case IX86_BUILTIN_XSAVES:
13238 	      icode = CODE_FOR_xsaves_rex64;
13239 	      break;
13240 	    case IX86_BUILTIN_XRSTORS:
13241 	      icode = CODE_FOR_xrstors_rex64;
13242 	      break;
13243 	    case IX86_BUILTIN_XSAVES64:
13244 	      icode = CODE_FOR_xsaves64;
13245 	      break;
13246 	    case IX86_BUILTIN_XRSTORS64:
13247 	      icode = CODE_FOR_xrstors64;
13248 	      break;
13249 	    case IX86_BUILTIN_XSAVEC:
13250 	      icode = CODE_FOR_xsavec_rex64;
13251 	      break;
13252 	    case IX86_BUILTIN_XSAVEC64:
13253 	      icode = CODE_FOR_xsavec64;
13254 	      break;
13255 	    default:
13256 	      gcc_unreachable ();
13257 	    }
13258 
13259 	  op2 = gen_lowpart (SImode, op2);
13260 	  op1 = gen_lowpart (SImode, op1);
13261 	  pat = GEN_FCN (icode) (op0, op1, op2);
13262 	}
13263       else
13264 	{
13265 	  switch (fcode)
13266 	    {
13267 	    case IX86_BUILTIN_XSAVE:
13268 	      icode = CODE_FOR_xsave;
13269 	      break;
13270 	    case IX86_BUILTIN_XRSTOR:
13271 	      icode = CODE_FOR_xrstor;
13272 	      break;
13273 	    case IX86_BUILTIN_XSAVEOPT:
13274 	      icode = CODE_FOR_xsaveopt;
13275 	      break;
13276 	    case IX86_BUILTIN_XSAVES:
13277 	      icode = CODE_FOR_xsaves;
13278 	      break;
13279 	    case IX86_BUILTIN_XRSTORS:
13280 	      icode = CODE_FOR_xrstors;
13281 	      break;
13282 	    case IX86_BUILTIN_XSAVEC:
13283 	      icode = CODE_FOR_xsavec;
13284 	      break;
13285 	    default:
13286 	      gcc_unreachable ();
13287 	    }
13288 	  pat = GEN_FCN (icode) (op0, op1);
13289 	}
13290 
13291       if (pat)
13292 	emit_insn (pat);
13293       return 0;
13294 
13295     case IX86_BUILTIN_LDTILECFG:
13296     case IX86_BUILTIN_STTILECFG:
13297       arg0 = CALL_EXPR_ARG (exp, 0);
13298       op0 = expand_normal (arg0);
13299 
13300       if (!address_operand (op0, VOIDmode))
13301 	{
13302 	  op0 = convert_memory_address (Pmode, op0);
13303 	  op0 = copy_addr_to_reg (op0);
13304 	}
13305       op0 = gen_rtx_MEM (XImode, op0);
13306       if (fcode == IX86_BUILTIN_LDTILECFG)
13307 	icode = CODE_FOR_ldtilecfg;
13308       else
13309 	icode = CODE_FOR_sttilecfg;
13310       pat = GEN_FCN (icode) (op0);
13311       emit_insn (pat);
13312       return 0;
13313 
13314     case IX86_BUILTIN_LLWPCB:
13315       arg0 = CALL_EXPR_ARG (exp, 0);
13316       op0 = expand_normal (arg0);
13317 
13318       if (!register_operand (op0, Pmode))
13319 	op0 = ix86_zero_extend_to_Pmode (op0);
13320       emit_insn (gen_lwp_llwpcb (Pmode, op0));
13321       return 0;
13322 
13323     case IX86_BUILTIN_SLWPCB:
13324       if (!target
13325 	  || !register_operand (target, Pmode))
13326 	target = gen_reg_rtx (Pmode);
13327       emit_insn (gen_lwp_slwpcb (Pmode, target));
13328       return target;
13329 
13330     case IX86_BUILTIN_LWPVAL32:
13331     case IX86_BUILTIN_LWPVAL64:
13332     case IX86_BUILTIN_LWPINS32:
13333     case IX86_BUILTIN_LWPINS64:
13334       mode = ((fcode == IX86_BUILTIN_LWPVAL32
13335 	       || fcode == IX86_BUILTIN_LWPINS32)
13336 	      ? SImode : DImode);
13337 
13338       if (fcode == IX86_BUILTIN_LWPVAL32
13339 	  || fcode == IX86_BUILTIN_LWPVAL64)
13340 	icode = code_for_lwp_lwpval (mode);
13341       else
13342 	icode = code_for_lwp_lwpins (mode);
13343 
13344       arg0 = CALL_EXPR_ARG (exp, 0);
13345       arg1 = CALL_EXPR_ARG (exp, 1);
13346       arg2 = CALL_EXPR_ARG (exp, 2);
13347       op0 = expand_normal (arg0);
13348       op1 = expand_normal (arg1);
13349       op2 = expand_normal (arg2);
13350       mode0 = insn_data[icode].operand[0].mode;
13351 
13352       if (!insn_data[icode].operand[0].predicate (op0, mode0))
13353 	op0 = copy_to_mode_reg (mode0, op0);
13354       if (!insn_data[icode].operand[1].predicate (op1, SImode))
13355 	op1 = copy_to_mode_reg (SImode, op1);
13356 
13357       if (!CONST_INT_P (op2))
13358 	{
13359 	  error ("the last argument must be a 32-bit immediate");
13360 	  return const0_rtx;
13361 	}
13362 
13363       emit_insn (GEN_FCN (icode) (op0, op1, op2));
13364 
13365       if (fcode == IX86_BUILTIN_LWPINS32
13366 	  || fcode == IX86_BUILTIN_LWPINS64)
13367 	{
13368 	  if (target == 0
13369 	      || !nonimmediate_operand (target, QImode))
13370 	    target = gen_reg_rtx (QImode);
13371 
13372 	  pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13373 			    const0_rtx);
13374 	  emit_insn (gen_rtx_SET (target, pat));
13375 
13376 	  return target;
13377 	}
13378       else
13379 	return 0;
13380 
13381     case IX86_BUILTIN_BEXTRI32:
13382     case IX86_BUILTIN_BEXTRI64:
13383       mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13384 
13385       arg0 = CALL_EXPR_ARG (exp, 0);
13386       arg1 = CALL_EXPR_ARG (exp, 1);
13387       op0 = expand_normal (arg0);
13388       op1 = expand_normal (arg1);
13389 
13390       if (!CONST_INT_P (op1))
13391 	{
13392 	  error ("last argument must be an immediate");
13393 	  return const0_rtx;
13394 	}
13395       else
13396 	{
13397 	  unsigned char lsb_index = UINTVAL (op1);
13398 	  unsigned char length = UINTVAL (op1) >> 8;
13399 
13400 	  unsigned char bitsize = GET_MODE_BITSIZE (mode);
13401 
13402 	  icode = code_for_tbm_bextri (mode);
13403 
13404 	  mode1 = insn_data[icode].operand[1].mode;
13405 	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
13406 	    op0 = copy_to_mode_reg (mode1, op0);
13407 
13408 	  mode0 = insn_data[icode].operand[0].mode;
13409 	  if (target == 0
13410 	      || !register_operand (target, mode0))
13411 	    target = gen_reg_rtx (mode0);
13412 
13413 	  if (length == 0 || lsb_index >= bitsize)
13414 	    {
13415 	      emit_move_insn (target, const0_rtx);
13416 	      return target;
13417 	    }
13418 
13419 	  if (length + lsb_index > bitsize)
13420 	    length = bitsize - lsb_index;
13421 
13422 	  op1 = GEN_INT (length);
13423 	  op2 = GEN_INT (lsb_index);
13424 
13425 	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13426 	  return target;
13427 	}
13428 
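    /* The rdrand*_step builtins emit rdrand into a temporary, store the
       value through the user pointer and return a success flag: when CF
       is set the conditional move below yields 1, and when rdrand fails
       it is specified to write zero, so returning the (zero) value itself
       gives 0 without needing a separate setcc.  */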
13429     case IX86_BUILTIN_RDRAND16_STEP:
13430       mode = HImode;
13431       goto rdrand_step;
13432 
13433     case IX86_BUILTIN_RDRAND32_STEP:
13434       mode = SImode;
13435       goto rdrand_step;
13436 
13437     case IX86_BUILTIN_RDRAND64_STEP:
13438       mode = DImode;
13439 
13440 rdrand_step:
13441       arg0 = CALL_EXPR_ARG (exp, 0);
13442       op1 = expand_normal (arg0);
13443       if (!address_operand (op1, VOIDmode))
13444 	{
13445 	  op1 = convert_memory_address (Pmode, op1);
13446 	  op1 = copy_addr_to_reg (op1);
13447 	}
13448 
13449       op0 = gen_reg_rtx (mode);
13450       emit_insn (gen_rdrand (mode, op0));
13451 
13452       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13453 
13454       op1 = force_reg (SImode, const1_rtx);
13455 
13456       /* Emit SImode conditional move.  */
13457       if (mode == HImode)
13458 	{
13459 	  if (TARGET_ZERO_EXTEND_WITH_AND
13460 	      && optimize_function_for_speed_p (cfun))
13461 	    {
13462 	      op2 = force_reg (SImode, const0_rtx);
13463 
13464 	      emit_insn (gen_movstricthi
13465 			 (gen_lowpart (HImode, op2), op0));
13466 	    }
13467 	  else
13468 	    {
13469 	      op2 = gen_reg_rtx (SImode);
13470 
13471 	      emit_insn (gen_zero_extendhisi2 (op2, op0));
13472 	    }
13473 	}
13474       else if (mode == SImode)
13475 	op2 = op0;
13476       else
13477 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
13478 
13479       if (target == 0
13480 	  || !register_operand (target, SImode))
13481 	target = gen_reg_rtx (SImode);
13482 
13483       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13484 			 const0_rtx);
13485       emit_insn (gen_rtx_SET (target,
13486 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13487       return target;
13488 
13489     case IX86_BUILTIN_RDSEED16_STEP:
13490       mode = HImode;
13491       goto rdseed_step;
13492 
13493     case IX86_BUILTIN_RDSEED32_STEP:
13494       mode = SImode;
13495       goto rdseed_step;
13496 
13497     case IX86_BUILTIN_RDSEED64_STEP:
13498       mode = DImode;
13499 
13500 rdseed_step:
13501       arg0 = CALL_EXPR_ARG (exp, 0);
13502       op1 = expand_normal (arg0);
13503       if (!address_operand (op1, VOIDmode))
13504 	{
13505 	  op1 = convert_memory_address (Pmode, op1);
13506 	  op1 = copy_addr_to_reg (op1);
13507 	}
13508 
13509       op0 = gen_reg_rtx (mode);
13510       emit_insn (gen_rdseed (mode, op0));
13511 
13512       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13513 
13514       op2 = gen_reg_rtx (QImode);
13515 
13516       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13517                          const0_rtx);
13518       emit_insn (gen_rtx_SET (op2, pat));
13519 
13520       if (target == 0
13521 	  || !register_operand (target, SImode))
13522         target = gen_reg_rtx (SImode);
13523 
13524       emit_insn (gen_zero_extendqisi2 (target, op2));
13525       return target;
13526 
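    /* addcarry/subborrow builtins: a carry-in byte, two operands and a
       pointer for the result.  With a literal zero carry-in the plain
       add/sub pattern (icode2) suffices; otherwise CF is regenerated from
       the carry-in byte (adding -1 sets CF iff the byte was nonzero) and
       the carry-consuming pattern is used.  The return value is the
       resulting CF, read back with LTU on the flags register.  */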
13527     case IX86_BUILTIN_SBB32:
13528       icode = CODE_FOR_subborrowsi;
13529       icode2 = CODE_FOR_subborrowsi_0;
13530       mode0 = SImode;
13531       mode1 = DImode;
13532       mode2 = CCmode;
13533       goto handlecarry;
13534 
13535     case IX86_BUILTIN_SBB64:
13536       icode = CODE_FOR_subborrowdi;
13537       icode2 = CODE_FOR_subborrowdi_0;
13538       mode0 = DImode;
13539       mode1 = TImode;
13540       mode2 = CCmode;
13541       goto handlecarry;
13542 
13543     case IX86_BUILTIN_ADDCARRYX32:
13544       icode = CODE_FOR_addcarrysi;
13545       icode2 = CODE_FOR_addcarrysi_0;
13546       mode0 = SImode;
13547       mode1 = DImode;
13548       mode2 = CCCmode;
13549       goto handlecarry;
13550 
13551     case IX86_BUILTIN_ADDCARRYX64:
13552       icode = CODE_FOR_addcarrydi;
13553       icode2 = CODE_FOR_addcarrydi_0;
13554       mode0 = DImode;
13555       mode1 = TImode;
13556       mode2 = CCCmode;
13557 
13558     handlecarry:
13559       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
13560       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
13561       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
13562       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
13563 
13564       op1 = expand_normal (arg0);
13565       if (!integer_zerop (arg0))
13566 	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13567 
13568       op2 = expand_normal (arg1);
13569       if (!register_operand (op2, mode0))
13570 	op2 = copy_to_mode_reg (mode0, op2);
13571 
13572       op3 = expand_normal (arg2);
13573       if (!register_operand (op3, mode0))
13574 	op3 = copy_to_mode_reg (mode0, op3);
13575 
13576       op4 = expand_normal (arg3);
13577       if (!address_operand (op4, VOIDmode))
13578 	{
13579 	  op4 = convert_memory_address (Pmode, op4);
13580 	  op4 = copy_addr_to_reg (op4);
13581 	}
13582 
13583       op0 = gen_reg_rtx (mode0);
13584       if (integer_zerop (arg0))
13585 	{
13586 	  /* If arg0 is 0, optimize right away into an add or sub
13587 	     instruction that sets the CCCmode flags.  */
13588 	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
13589 	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13590 	}
13591       else
13592 	{
13593 	  /* Generate CF from input operand.  */
13594 	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
13595 
13596 	  /* Generate instruction that consumes CF.  */
13597 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13598 	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13599 	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13600 	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13601 	}
13602 
13603       /* Return current CF value.  */
13604       if (target == 0)
13605         target = gen_reg_rtx (QImode);
13606 
13607       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13608       emit_insn (gen_rtx_SET (target, pat));
13609 
13610       /* Store the result.  */
13611       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13612 
13613       return target;
13614 
13615     case IX86_BUILTIN_READ_FLAGS:
13616       if (ignore)
13617 	return const0_rtx;
13618 
13619       emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
13620 
13621       if (optimize
13622 	  || target == NULL_RTX
13623 	  || !nonimmediate_operand (target, word_mode)
13624 	  || GET_MODE (target) != word_mode)
13625 	target = gen_reg_rtx (word_mode);
13626 
13627       emit_insn (gen_pop (target));
13628       return target;
13629 
13630     case IX86_BUILTIN_WRITE_FLAGS:
13631 
13632       arg0 = CALL_EXPR_ARG (exp, 0);
13633       op0 = expand_normal (arg0);
13634       if (!general_no_elim_operand (op0, word_mode))
13635 	op0 = copy_to_mode_reg (word_mode, op0);
13636 
13637       emit_insn (gen_push (op0));
13638       emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
13639       return 0;
13640 
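    /* For the ktest/kortest builtins mode3 selects which flag the result
       is read from: CCCmode for the *C (carry) variants and CCZmode for
       the *Z (zero) variants; the shared tail emits the test and a setcc
       on that flag.  */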
13641     case IX86_BUILTIN_KTESTC8:
13642       icode = CODE_FOR_ktestqi;
13643       mode3 = CCCmode;
13644       goto kortest;
13645 
13646     case IX86_BUILTIN_KTESTZ8:
13647       icode = CODE_FOR_ktestqi;
13648       mode3 = CCZmode;
13649       goto kortest;
13650 
13651     case IX86_BUILTIN_KTESTC16:
13652       icode = CODE_FOR_ktesthi;
13653       mode3 = CCCmode;
13654       goto kortest;
13655 
13656     case IX86_BUILTIN_KTESTZ16:
13657       icode = CODE_FOR_ktesthi;
13658       mode3 = CCZmode;
13659       goto kortest;
13660 
13661     case IX86_BUILTIN_KTESTC32:
13662       icode = CODE_FOR_ktestsi;
13663       mode3 = CCCmode;
13664       goto kortest;
13665 
13666     case IX86_BUILTIN_KTESTZ32:
13667       icode = CODE_FOR_ktestsi;
13668       mode3 = CCZmode;
13669       goto kortest;
13670 
13671     case IX86_BUILTIN_KTESTC64:
13672       icode = CODE_FOR_ktestdi;
13673       mode3 = CCCmode;
13674       goto kortest;
13675 
13676     case IX86_BUILTIN_KTESTZ64:
13677       icode = CODE_FOR_ktestdi;
13678       mode3 = CCZmode;
13679       goto kortest;
13680 
13681     case IX86_BUILTIN_KORTESTC8:
13682       icode = CODE_FOR_kortestqi;
13683       mode3 = CCCmode;
13684       goto kortest;
13685 
13686     case IX86_BUILTIN_KORTESTZ8:
13687       icode = CODE_FOR_kortestqi;
13688       mode3 = CCZmode;
13689       goto kortest;
13690 
13691     case IX86_BUILTIN_KORTESTC16:
13692       icode = CODE_FOR_kortesthi;
13693       mode3 = CCCmode;
13694       goto kortest;
13695 
13696     case IX86_BUILTIN_KORTESTZ16:
13697       icode = CODE_FOR_kortesthi;
13698       mode3 = CCZmode;
13699       goto kortest;
13700 
13701     case IX86_BUILTIN_KORTESTC32:
13702       icode = CODE_FOR_kortestsi;
13703       mode3 = CCCmode;
13704       goto kortest;
13705 
13706     case IX86_BUILTIN_KORTESTZ32:
13707       icode = CODE_FOR_kortestsi;
13708       mode3 = CCZmode;
13709       goto kortest;
13710 
13711     case IX86_BUILTIN_KORTESTC64:
13712       icode = CODE_FOR_kortestdi;
13713       mode3 = CCCmode;
13714       goto kortest;
13715 
13716     case IX86_BUILTIN_KORTESTZ64:
13717       icode = CODE_FOR_kortestdi;
13718       mode3 = CCZmode;
13719 
13720     kortest:
13721       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
13722       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
13723       op0 = expand_normal (arg0);
13724       op1 = expand_normal (arg1);
13725 
13726       mode0 = insn_data[icode].operand[0].mode;
13727       mode1 = insn_data[icode].operand[1].mode;
13728 
13729       if (GET_MODE (op0) != VOIDmode)
13730 	op0 = force_reg (GET_MODE (op0), op0);
13731 
13732       op0 = gen_lowpart (mode0, op0);
13733 
13734       if (!insn_data[icode].operand[0].predicate (op0, mode0))
13735 	op0 = copy_to_mode_reg (mode0, op0);
13736 
13737       if (GET_MODE (op1) != VOIDmode)
13738 	op1 = force_reg (GET_MODE (op1), op1);
13739 
13740       op1 = gen_lowpart (mode1, op1);
13741 
13742       if (!insn_data[icode].operand[1].predicate (op1, mode1))
13743 	op1 = copy_to_mode_reg (mode1, op1);
13744 
13745       target = gen_reg_rtx (QImode);
13746 
13747       /* Emit kortest.  */
13748       emit_insn (GEN_FCN (icode) (op0, op1));
13749       /* And use setcc to return result from flags.  */
13750       ix86_expand_setcc (target, EQ,
13751 			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
13752       return target;
13753 
13754     case IX86_BUILTIN_GATHERSIV2DF:
13755       icode = CODE_FOR_avx2_gathersiv2df;
13756       goto gather_gen;
13757     case IX86_BUILTIN_GATHERSIV4DF:
13758       icode = CODE_FOR_avx2_gathersiv4df;
13759       goto gather_gen;
13760     case IX86_BUILTIN_GATHERDIV2DF:
13761       icode = CODE_FOR_avx2_gatherdiv2df;
13762       goto gather_gen;
13763     case IX86_BUILTIN_GATHERDIV4DF:
13764       icode = CODE_FOR_avx2_gatherdiv4df;
13765       goto gather_gen;
13766     case IX86_BUILTIN_GATHERSIV4SF:
13767       icode = CODE_FOR_avx2_gathersiv4sf;
13768       goto gather_gen;
13769     case IX86_BUILTIN_GATHERSIV8SF:
13770       icode = CODE_FOR_avx2_gathersiv8sf;
13771       goto gather_gen;
13772     case IX86_BUILTIN_GATHERDIV4SF:
13773       icode = CODE_FOR_avx2_gatherdiv4sf;
13774       goto gather_gen;
13775     case IX86_BUILTIN_GATHERDIV8SF:
13776       icode = CODE_FOR_avx2_gatherdiv8sf;
13777       goto gather_gen;
13778     case IX86_BUILTIN_GATHERSIV2DI:
13779       icode = CODE_FOR_avx2_gathersiv2di;
13780       goto gather_gen;
13781     case IX86_BUILTIN_GATHERSIV4DI:
13782       icode = CODE_FOR_avx2_gathersiv4di;
13783       goto gather_gen;
13784     case IX86_BUILTIN_GATHERDIV2DI:
13785       icode = CODE_FOR_avx2_gatherdiv2di;
13786       goto gather_gen;
13787     case IX86_BUILTIN_GATHERDIV4DI:
13788       icode = CODE_FOR_avx2_gatherdiv4di;
13789       goto gather_gen;
13790     case IX86_BUILTIN_GATHERSIV4SI:
13791       icode = CODE_FOR_avx2_gathersiv4si;
13792       goto gather_gen;
13793     case IX86_BUILTIN_GATHERSIV8SI:
13794       icode = CODE_FOR_avx2_gathersiv8si;
13795       goto gather_gen;
13796     case IX86_BUILTIN_GATHERDIV4SI:
13797       icode = CODE_FOR_avx2_gatherdiv4si;
13798       goto gather_gen;
13799     case IX86_BUILTIN_GATHERDIV8SI:
13800       icode = CODE_FOR_avx2_gatherdiv8si;
13801       goto gather_gen;
13802     case IX86_BUILTIN_GATHERALTSIV4DF:
13803       icode = CODE_FOR_avx2_gathersiv4df;
13804       goto gather_gen;
13805     case IX86_BUILTIN_GATHERALTDIV8SF:
13806       icode = CODE_FOR_avx2_gatherdiv8sf;
13807       goto gather_gen;
13808     case IX86_BUILTIN_GATHERALTSIV4DI:
13809       icode = CODE_FOR_avx2_gathersiv4di;
13810       goto gather_gen;
13811     case IX86_BUILTIN_GATHERALTDIV8SI:
13812       icode = CODE_FOR_avx2_gatherdiv8si;
13813       goto gather_gen;
13814     case IX86_BUILTIN_GATHER3SIV16SF:
13815       icode = CODE_FOR_avx512f_gathersiv16sf;
13816       goto gather_gen;
13817     case IX86_BUILTIN_GATHER3SIV8DF:
13818       icode = CODE_FOR_avx512f_gathersiv8df;
13819       goto gather_gen;
13820     case IX86_BUILTIN_GATHER3DIV16SF:
13821       icode = CODE_FOR_avx512f_gatherdiv16sf;
13822       goto gather_gen;
13823     case IX86_BUILTIN_GATHER3DIV8DF:
13824       icode = CODE_FOR_avx512f_gatherdiv8df;
13825       goto gather_gen;
13826     case IX86_BUILTIN_GATHER3SIV16SI:
13827       icode = CODE_FOR_avx512f_gathersiv16si;
13828       goto gather_gen;
13829     case IX86_BUILTIN_GATHER3SIV8DI:
13830       icode = CODE_FOR_avx512f_gathersiv8di;
13831       goto gather_gen;
13832     case IX86_BUILTIN_GATHER3DIV16SI:
13833       icode = CODE_FOR_avx512f_gatherdiv16si;
13834       goto gather_gen;
13835     case IX86_BUILTIN_GATHER3DIV8DI:
13836       icode = CODE_FOR_avx512f_gatherdiv8di;
13837       goto gather_gen;
13838     case IX86_BUILTIN_GATHER3ALTSIV8DF:
13839       icode = CODE_FOR_avx512f_gathersiv8df;
13840       goto gather_gen;
13841     case IX86_BUILTIN_GATHER3ALTDIV16SF:
13842       icode = CODE_FOR_avx512f_gatherdiv16sf;
13843       goto gather_gen;
13844     case IX86_BUILTIN_GATHER3ALTSIV8DI:
13845       icode = CODE_FOR_avx512f_gathersiv8di;
13846       goto gather_gen;
13847     case IX86_BUILTIN_GATHER3ALTDIV16SI:
13848       icode = CODE_FOR_avx512f_gatherdiv16si;
13849       goto gather_gen;
13850     case IX86_BUILTIN_GATHER3SIV2DF:
13851       icode = CODE_FOR_avx512vl_gathersiv2df;
13852       goto gather_gen;
13853     case IX86_BUILTIN_GATHER3SIV4DF:
13854       icode = CODE_FOR_avx512vl_gathersiv4df;
13855       goto gather_gen;
13856     case IX86_BUILTIN_GATHER3DIV2DF:
13857       icode = CODE_FOR_avx512vl_gatherdiv2df;
13858       goto gather_gen;
13859     case IX86_BUILTIN_GATHER3DIV4DF:
13860       icode = CODE_FOR_avx512vl_gatherdiv4df;
13861       goto gather_gen;
13862     case IX86_BUILTIN_GATHER3SIV4SF:
13863       icode = CODE_FOR_avx512vl_gathersiv4sf;
13864       goto gather_gen;
13865     case IX86_BUILTIN_GATHER3SIV8SF:
13866       icode = CODE_FOR_avx512vl_gathersiv8sf;
13867       goto gather_gen;
13868     case IX86_BUILTIN_GATHER3DIV4SF:
13869       icode = CODE_FOR_avx512vl_gatherdiv4sf;
13870       goto gather_gen;
13871     case IX86_BUILTIN_GATHER3DIV8SF:
13872       icode = CODE_FOR_avx512vl_gatherdiv8sf;
13873       goto gather_gen;
13874     case IX86_BUILTIN_GATHER3SIV2DI:
13875       icode = CODE_FOR_avx512vl_gathersiv2di;
13876       goto gather_gen;
13877     case IX86_BUILTIN_GATHER3SIV4DI:
13878       icode = CODE_FOR_avx512vl_gathersiv4di;
13879       goto gather_gen;
13880     case IX86_BUILTIN_GATHER3DIV2DI:
13881       icode = CODE_FOR_avx512vl_gatherdiv2di;
13882       goto gather_gen;
13883     case IX86_BUILTIN_GATHER3DIV4DI:
13884       icode = CODE_FOR_avx512vl_gatherdiv4di;
13885       goto gather_gen;
13886     case IX86_BUILTIN_GATHER3SIV4SI:
13887       icode = CODE_FOR_avx512vl_gathersiv4si;
13888       goto gather_gen;
13889     case IX86_BUILTIN_GATHER3SIV8SI:
13890       icode = CODE_FOR_avx512vl_gathersiv8si;
13891       goto gather_gen;
13892     case IX86_BUILTIN_GATHER3DIV4SI:
13893       icode = CODE_FOR_avx512vl_gatherdiv4si;
13894       goto gather_gen;
13895     case IX86_BUILTIN_GATHER3DIV8SI:
13896       icode = CODE_FOR_avx512vl_gatherdiv8si;
13897       goto gather_gen;
13898     case IX86_BUILTIN_GATHER3ALTSIV4DF:
13899       icode = CODE_FOR_avx512vl_gathersiv4df;
13900       goto gather_gen;
13901     case IX86_BUILTIN_GATHER3ALTDIV8SF:
13902       icode = CODE_FOR_avx512vl_gatherdiv8sf;
13903       goto gather_gen;
13904     case IX86_BUILTIN_GATHER3ALTSIV4DI:
13905       icode = CODE_FOR_avx512vl_gathersiv4di;
13906       goto gather_gen;
13907     case IX86_BUILTIN_GATHER3ALTDIV8SI:
13908       icode = CODE_FOR_avx512vl_gatherdiv8si;
13909       goto gather_gen;
13910     case IX86_BUILTIN_SCATTERSIV16SF:
13911       icode = CODE_FOR_avx512f_scattersiv16sf;
13912       goto scatter_gen;
13913     case IX86_BUILTIN_SCATTERSIV8DF:
13914       icode = CODE_FOR_avx512f_scattersiv8df;
13915       goto scatter_gen;
13916     case IX86_BUILTIN_SCATTERDIV16SF:
13917       icode = CODE_FOR_avx512f_scatterdiv16sf;
13918       goto scatter_gen;
13919     case IX86_BUILTIN_SCATTERDIV8DF:
13920       icode = CODE_FOR_avx512f_scatterdiv8df;
13921       goto scatter_gen;
13922     case IX86_BUILTIN_SCATTERSIV16SI:
13923       icode = CODE_FOR_avx512f_scattersiv16si;
13924       goto scatter_gen;
13925     case IX86_BUILTIN_SCATTERSIV8DI:
13926       icode = CODE_FOR_avx512f_scattersiv8di;
13927       goto scatter_gen;
13928     case IX86_BUILTIN_SCATTERDIV16SI:
13929       icode = CODE_FOR_avx512f_scatterdiv16si;
13930       goto scatter_gen;
13931     case IX86_BUILTIN_SCATTERDIV8DI:
13932       icode = CODE_FOR_avx512f_scatterdiv8di;
13933       goto scatter_gen;
13934     case IX86_BUILTIN_SCATTERSIV8SF:
13935       icode = CODE_FOR_avx512vl_scattersiv8sf;
13936       goto scatter_gen;
13937     case IX86_BUILTIN_SCATTERSIV4SF:
13938       icode = CODE_FOR_avx512vl_scattersiv4sf;
13939       goto scatter_gen;
13940     case IX86_BUILTIN_SCATTERSIV4DF:
13941       icode = CODE_FOR_avx512vl_scattersiv4df;
13942       goto scatter_gen;
13943     case IX86_BUILTIN_SCATTERSIV2DF:
13944       icode = CODE_FOR_avx512vl_scattersiv2df;
13945       goto scatter_gen;
13946     case IX86_BUILTIN_SCATTERDIV8SF:
13947       icode = CODE_FOR_avx512vl_scatterdiv8sf;
13948       goto scatter_gen;
13949     case IX86_BUILTIN_SCATTERDIV4SF:
13950       icode = CODE_FOR_avx512vl_scatterdiv4sf;
13951       goto scatter_gen;
13952     case IX86_BUILTIN_SCATTERDIV4DF:
13953       icode = CODE_FOR_avx512vl_scatterdiv4df;
13954       goto scatter_gen;
13955     case IX86_BUILTIN_SCATTERDIV2DF:
13956       icode = CODE_FOR_avx512vl_scatterdiv2df;
13957       goto scatter_gen;
13958     case IX86_BUILTIN_SCATTERSIV8SI:
13959       icode = CODE_FOR_avx512vl_scattersiv8si;
13960       goto scatter_gen;
13961     case IX86_BUILTIN_SCATTERSIV4SI:
13962       icode = CODE_FOR_avx512vl_scattersiv4si;
13963       goto scatter_gen;
13964     case IX86_BUILTIN_SCATTERSIV4DI:
13965       icode = CODE_FOR_avx512vl_scattersiv4di;
13966       goto scatter_gen;
13967     case IX86_BUILTIN_SCATTERSIV2DI:
13968       icode = CODE_FOR_avx512vl_scattersiv2di;
13969       goto scatter_gen;
13970     case IX86_BUILTIN_SCATTERDIV8SI:
13971       icode = CODE_FOR_avx512vl_scatterdiv8si;
13972       goto scatter_gen;
13973     case IX86_BUILTIN_SCATTERDIV4SI:
13974       icode = CODE_FOR_avx512vl_scatterdiv4si;
13975       goto scatter_gen;
13976     case IX86_BUILTIN_SCATTERDIV4DI:
13977       icode = CODE_FOR_avx512vl_scatterdiv4di;
13978       goto scatter_gen;
13979     case IX86_BUILTIN_SCATTERDIV2DI:
13980       icode = CODE_FOR_avx512vl_scatterdiv2di;
13981       goto scatter_gen;
13982     case IX86_BUILTIN_GATHERPFDPD:
13983       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
13984       goto vec_prefetch_gen;
13985     case IX86_BUILTIN_SCATTERALTSIV8DF:
13986       icode = CODE_FOR_avx512f_scattersiv8df;
13987       goto scatter_gen;
13988     case IX86_BUILTIN_SCATTERALTDIV16SF:
13989       icode = CODE_FOR_avx512f_scatterdiv16sf;
13990       goto scatter_gen;
13991     case IX86_BUILTIN_SCATTERALTSIV8DI:
13992       icode = CODE_FOR_avx512f_scattersiv8di;
13993       goto scatter_gen;
13994     case IX86_BUILTIN_SCATTERALTDIV16SI:
13995       icode = CODE_FOR_avx512f_scatterdiv16si;
13996       goto scatter_gen;
13997     case IX86_BUILTIN_SCATTERALTSIV4DF:
13998       icode = CODE_FOR_avx512vl_scattersiv4df;
13999       goto scatter_gen;
14000     case IX86_BUILTIN_SCATTERALTDIV8SF:
14001       icode = CODE_FOR_avx512vl_scatterdiv8sf;
14002       goto scatter_gen;
14003     case IX86_BUILTIN_SCATTERALTSIV4DI:
14004       icode = CODE_FOR_avx512vl_scattersiv4di;
14005       goto scatter_gen;
14006     case IX86_BUILTIN_SCATTERALTDIV8SI:
14007       icode = CODE_FOR_avx512vl_scatterdiv8si;
14008       goto scatter_gen;
14009     case IX86_BUILTIN_SCATTERALTSIV2DF:
14010       icode = CODE_FOR_avx512vl_scattersiv2df;
14011       goto scatter_gen;
14012     case IX86_BUILTIN_SCATTERALTDIV4SF:
14013       icode = CODE_FOR_avx512vl_scatterdiv4sf;
14014       goto scatter_gen;
14015     case IX86_BUILTIN_SCATTERALTSIV2DI:
14016       icode = CODE_FOR_avx512vl_scattersiv2di;
14017       goto scatter_gen;
14018     case IX86_BUILTIN_SCATTERALTDIV4SI:
14019       icode = CODE_FOR_avx512vl_scatterdiv4si;
14020       goto scatter_gen;
14021     case IX86_BUILTIN_GATHERPFDPS:
14022       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14023       goto vec_prefetch_gen;
14024     case IX86_BUILTIN_GATHERPFQPD:
14025       icode = CODE_FOR_avx512pf_gatherpfv8didf;
14026       goto vec_prefetch_gen;
14027     case IX86_BUILTIN_GATHERPFQPS:
14028       icode = CODE_FOR_avx512pf_gatherpfv8disf;
14029       goto vec_prefetch_gen;
14030     case IX86_BUILTIN_SCATTERPFDPD:
14031       icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14032       goto vec_prefetch_gen;
14033     case IX86_BUILTIN_SCATTERPFDPS:
14034       icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14035       goto vec_prefetch_gen;
14036     case IX86_BUILTIN_SCATTERPFQPD:
14037       icode = CODE_FOR_avx512pf_scatterpfv8didf;
14038       goto vec_prefetch_gen;
14039     case IX86_BUILTIN_SCATTERPFQPS:
14040       icode = CODE_FOR_avx512pf_scatterpfv8disf;
14041       goto vec_prefetch_gen;
14042 
14043     gather_gen:
14044       rtx half;
14045       rtx (*gen) (rtx, rtx);
14046 
14047       arg0 = CALL_EXPR_ARG (exp, 0);
14048       arg1 = CALL_EXPR_ARG (exp, 1);
14049       arg2 = CALL_EXPR_ARG (exp, 2);
14050       arg3 = CALL_EXPR_ARG (exp, 3);
14051       arg4 = CALL_EXPR_ARG (exp, 4);
14052       op0 = expand_normal (arg0);
14053       op1 = expand_normal (arg1);
14054       op2 = expand_normal (arg2);
14055       op3 = expand_normal (arg3);
14056       op4 = expand_normal (arg4);
14057       /* Note the arg order is different from the operand order.  */
14058       mode0 = insn_data[icode].operand[1].mode;
14059       mode2 = insn_data[icode].operand[3].mode;
14060       mode3 = insn_data[icode].operand[4].mode;
14061       mode4 = insn_data[icode].operand[5].mode;
14062 
14063       if (target == NULL_RTX
14064 	  || GET_MODE (target) != insn_data[icode].operand[0].mode
14065 	  || !insn_data[icode].operand[0].predicate (target,
14066 						     GET_MODE (target)))
14067 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14068       else
14069 	subtarget = target;
14070 
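      /* For the *ALT* gather variants the index vector and the
	 destination have different element counts; extract the low half
	 of the wider operand (and narrow the mask where needed) so the
	 operands match the underlying gather pattern.  */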
14071       switch (fcode)
14072 	{
14073 	case IX86_BUILTIN_GATHER3ALTSIV8DF:
14074 	case IX86_BUILTIN_GATHER3ALTSIV8DI:
14075 	  half = gen_reg_rtx (V8SImode);
14076 	  if (!nonimmediate_operand (op2, V16SImode))
14077 	    op2 = copy_to_mode_reg (V16SImode, op2);
14078 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
14079 	  op2 = half;
14080 	  break;
14081 	case IX86_BUILTIN_GATHER3ALTSIV4DF:
14082 	case IX86_BUILTIN_GATHER3ALTSIV4DI:
14083 	case IX86_BUILTIN_GATHERALTSIV4DF:
14084 	case IX86_BUILTIN_GATHERALTSIV4DI:
14085 	  half = gen_reg_rtx (V4SImode);
14086 	  if (!nonimmediate_operand (op2, V8SImode))
14087 	    op2 = copy_to_mode_reg (V8SImode, op2);
14088 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
14089 	  op2 = half;
14090 	  break;
14091 	case IX86_BUILTIN_GATHER3ALTDIV16SF:
14092 	case IX86_BUILTIN_GATHER3ALTDIV16SI:
14093 	  half = gen_reg_rtx (mode0);
14094 	  if (mode0 == V8SFmode)
14095 	    gen = gen_vec_extract_lo_v16sf;
14096 	  else
14097 	    gen = gen_vec_extract_lo_v16si;
14098 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
14099 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14100 	  emit_insn (gen (half, op0));
14101 	  op0 = half;
14102 	  op3 = lowpart_subreg (QImode, op3, HImode);
14103 	  break;
14104 	case IX86_BUILTIN_GATHER3ALTDIV8SF:
14105 	case IX86_BUILTIN_GATHER3ALTDIV8SI:
14106 	case IX86_BUILTIN_GATHERALTDIV8SF:
14107 	case IX86_BUILTIN_GATHERALTDIV8SI:
14108 	  half = gen_reg_rtx (mode0);
14109 	  if (mode0 == V4SFmode)
14110 	    gen = gen_vec_extract_lo_v8sf;
14111 	  else
14112 	    gen = gen_vec_extract_lo_v8si;
14113 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
14114 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14115 	  emit_insn (gen (half, op0));
14116 	  op0 = half;
14117 	  if (VECTOR_MODE_P (GET_MODE (op3)))
14118 	    {
14119 	      half = gen_reg_rtx (mode0);
14120 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
14121 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14122 	      emit_insn (gen (half, op3));
14123 	      op3 = half;
14124 	    }
14125 	  break;
14126 	default:
14127 	  break;
14128 	}
14129 
14130       /* Force the memory operand to be addressed through a base register
14131 	 here.  We don't want to do this for the memory operands of other
14132 	 builtin functions.  */
14133       op1 = ix86_zero_extend_to_Pmode (op1);
14134 
14135       if (!insn_data[icode].operand[1].predicate (op0, mode0))
14136 	op0 = copy_to_mode_reg (mode0, op0);
14137       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14138 	op1 = copy_to_mode_reg (Pmode, op1);
14139       if (!insn_data[icode].operand[3].predicate (op2, mode2))
14140 	op2 = copy_to_mode_reg (mode2, op2);
14141 
14142       op3 = fixup_modeless_constant (op3, mode3);
14143 
14144       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14145 	{
14146 	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
14147 	    op3 = copy_to_mode_reg (mode3, op3);
14148 	}
14149       else
14150 	{
14151 	  op3 = copy_to_reg (op3);
14152 	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14153 	}
14154       if (!insn_data[icode].operand[5].predicate (op4, mode4))
14155 	{
14156           error ("the last argument must be scale 1, 2, 4, 8");
14157           return const0_rtx;
14158 	}
14159 
14160       /* Optimize.  If the mask is known to have the high (sign) bit set
14161 	 in every element, replace op0 with pc_rtx to signal that the
14162 	 instruction overwrites the whole destination and doesn't use its
14163 	 previous contents.  */
14164       if (optimize)
14165 	{
14166 	  if (TREE_CODE (arg3) == INTEGER_CST)
14167 	    {
14168 	      if (integer_all_onesp (arg3))
14169 		op0 = pc_rtx;
14170 	    }
14171 	  else if (TREE_CODE (arg3) == VECTOR_CST)
14172 	    {
14173 	      unsigned int negative = 0;
14174 	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14175 		{
14176 		  tree cst = VECTOR_CST_ELT (arg3, i);
14177 		  if (TREE_CODE (cst) == INTEGER_CST
14178 		      && tree_int_cst_sign_bit (cst))
14179 		    negative++;
14180 		  else if (TREE_CODE (cst) == REAL_CST
14181 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14182 		    negative++;
14183 		}
14184 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14185 		op0 = pc_rtx;
14186 	    }
14187 	  else if (TREE_CODE (arg3) == SSA_NAME
14188 		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14189 	    {
14190 	      /* Recognize also when mask is like:
14191 		 __v2df src = _mm_setzero_pd ();
14192 		 __v2df mask = _mm_cmpeq_pd (src, src);
14193 		 or
14194 		 __v8sf src = _mm256_setzero_ps ();
14195 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14196 		 as that is a cheaper way to load all ones into
14197 		 a register than having to load a constant from
14198 		 memory.  */
14199 	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14200 	      if (is_gimple_call (def_stmt))
14201 		{
14202 		  tree fndecl = gimple_call_fndecl (def_stmt);
14203 		  if (fndecl
14204 		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14205 		    switch (DECL_MD_FUNCTION_CODE (fndecl))
14206 		      {
14207 		      case IX86_BUILTIN_CMPPD:
14208 		      case IX86_BUILTIN_CMPPS:
14209 		      case IX86_BUILTIN_CMPPD256:
14210 		      case IX86_BUILTIN_CMPPS256:
14211 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14212 			  break;
14213 			/* FALLTHRU */
14214 		      case IX86_BUILTIN_CMPEQPD:
14215 		      case IX86_BUILTIN_CMPEQPS:
14216 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14217 			    && initializer_zerop (gimple_call_arg (def_stmt,
14218 								   1)))
14219 			  op0 = pc_rtx;
14220 			break;
14221 		      default:
14222 			break;
14223 		      }
14224 		}
14225 	    }
14226 	}
14227 
14228       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14229       if (! pat)
14230 	return const0_rtx;
14231       emit_insn (pat);
14232 
14233       switch (fcode)
14234 	{
14235 	case IX86_BUILTIN_GATHER3DIV16SF:
14236 	  if (target == NULL_RTX)
14237 	    target = gen_reg_rtx (V8SFmode);
14238 	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14239 	  break;
14240 	case IX86_BUILTIN_GATHER3DIV16SI:
14241 	  if (target == NULL_RTX)
14242 	    target = gen_reg_rtx (V8SImode);
14243 	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14244 	  break;
14245 	case IX86_BUILTIN_GATHER3DIV8SF:
14246 	case IX86_BUILTIN_GATHERDIV8SF:
14247 	  if (target == NULL_RTX)
14248 	    target = gen_reg_rtx (V4SFmode);
14249 	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14250 	  break;
14251 	case IX86_BUILTIN_GATHER3DIV8SI:
14252 	case IX86_BUILTIN_GATHERDIV8SI:
14253 	  if (target == NULL_RTX)
14254 	    target = gen_reg_rtx (V4SImode);
14255 	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14256 	  break;
14257 	default:
14258 	  target = subtarget;
14259 	  break;
14260 	}
14261       return target;
14262 
14263     scatter_gen:
14264       arg0 = CALL_EXPR_ARG (exp, 0);
14265       arg1 = CALL_EXPR_ARG (exp, 1);
14266       arg2 = CALL_EXPR_ARG (exp, 2);
14267       arg3 = CALL_EXPR_ARG (exp, 3);
14268       arg4 = CALL_EXPR_ARG (exp, 4);
14269       op0 = expand_normal (arg0);
14270       op1 = expand_normal (arg1);
14271       op2 = expand_normal (arg2);
14272       op3 = expand_normal (arg3);
14273       op4 = expand_normal (arg4);
14274       mode1 = insn_data[icode].operand[1].mode;
14275       mode2 = insn_data[icode].operand[2].mode;
14276       mode3 = insn_data[icode].operand[3].mode;
14277       mode4 = insn_data[icode].operand[4].mode;
14278 
14279       /* Scatter instructions store operand op3 to memory using
14280 	 indices from op2 and scale from op4 under writemask op1.
14281 	 If the index operand op2 has more elements than the source
14282 	 operand op3, only its low half is used, and vice versa.  */
14283       switch (fcode)
14284 	{
14285 	case IX86_BUILTIN_SCATTERALTSIV8DF:
14286 	case IX86_BUILTIN_SCATTERALTSIV8DI:
14287 	  half = gen_reg_rtx (V8SImode);
14288 	  if (!nonimmediate_operand (op2, V16SImode))
14289 	    op2 = copy_to_mode_reg (V16SImode, op2);
14290 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
14291 	  op2 = half;
14292 	  break;
14293 	case IX86_BUILTIN_SCATTERALTDIV16SF:
14294 	case IX86_BUILTIN_SCATTERALTDIV16SI:
14295 	  half = gen_reg_rtx (mode3);
14296 	  if (mode3 == V8SFmode)
14297 	    gen = gen_vec_extract_lo_v16sf;
14298 	  else
14299 	    gen = gen_vec_extract_lo_v16si;
14300 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
14301 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14302 	  emit_insn (gen (half, op3));
14303 	  op3 = half;
14304 	  break;
14305 	case IX86_BUILTIN_SCATTERALTSIV4DF:
14306 	case IX86_BUILTIN_SCATTERALTSIV4DI:
14307 	  half = gen_reg_rtx (V4SImode);
14308 	  if (!nonimmediate_operand (op2, V8SImode))
14309 	    op2 = copy_to_mode_reg (V8SImode, op2);
14310 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
14311 	  op2 = half;
14312 	  break;
14313 	case IX86_BUILTIN_SCATTERALTDIV8SF:
14314 	case IX86_BUILTIN_SCATTERALTDIV8SI:
14315 	  half = gen_reg_rtx (mode3);
14316 	  if (mode3 == V4SFmode)
14317 	    gen = gen_vec_extract_lo_v8sf;
14318 	  else
14319 	    gen = gen_vec_extract_lo_v8si;
14320 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
14321 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14322 	  emit_insn (gen (half, op3));
14323 	  op3 = half;
14324 	  break;
14325 	case IX86_BUILTIN_SCATTERALTSIV2DF:
14326 	case IX86_BUILTIN_SCATTERALTSIV2DI:
14327 	  if (!nonimmediate_operand (op2, V4SImode))
14328 	    op2 = copy_to_mode_reg (V4SImode, op2);
14329 	  break;
14330 	case IX86_BUILTIN_SCATTERALTDIV4SF:
14331 	case IX86_BUILTIN_SCATTERALTDIV4SI:
14332 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
14333 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14334 	  break;
14335 	default:
14336 	  break;
14337 	}
14338 
14339       /* Force the memory operand to be addressed through a base register
14340 	 here.  We don't want to do this for the memory operands of other
14341 	 builtin functions.  */
14342       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14343 
14344       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14345 	op0 = copy_to_mode_reg (Pmode, op0);
14346 
14347       op1 = fixup_modeless_constant (op1, mode1);
14348 
14349       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14350 	{
14351 	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
14352 	    op1 = copy_to_mode_reg (mode1, op1);
14353 	}
14354       else
14355 	{
14356 	  op1 = copy_to_reg (op1);
14357 	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14358 	}
14359 
14360       if (!insn_data[icode].operand[2].predicate (op2, mode2))
14361 	op2 = copy_to_mode_reg (mode2, op2);
14362 
14363       if (!insn_data[icode].operand[3].predicate (op3, mode3))
14364 	op3 = copy_to_mode_reg (mode3, op3);
14365 
14366       if (!insn_data[icode].operand[4].predicate (op4, mode4))
14367 	{
14368 	  error ("the last argument must be scale 1, 2, 4, 8");
14369 	  return const0_rtx;
14370 	}
14371 
14372       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14373       if (! pat)
14374 	return const0_rtx;
14375 
14376       emit_insn (pat);
14377       return 0;
14378 
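    /* Gather/scatter prefetch builtins: operand 0 is the mask, operand 1
       the index vector, operand 2 the base address, operand 3 the scale
       and operand 4 the locality hint.  */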
14379     vec_prefetch_gen:
14380       arg0 = CALL_EXPR_ARG (exp, 0);
14381       arg1 = CALL_EXPR_ARG (exp, 1);
14382       arg2 = CALL_EXPR_ARG (exp, 2);
14383       arg3 = CALL_EXPR_ARG (exp, 3);
14384       arg4 = CALL_EXPR_ARG (exp, 4);
14385       op0 = expand_normal (arg0);
14386       op1 = expand_normal (arg1);
14387       op2 = expand_normal (arg2);
14388       op3 = expand_normal (arg3);
14389       op4 = expand_normal (arg4);
14390       mode0 = insn_data[icode].operand[0].mode;
14391       mode1 = insn_data[icode].operand[1].mode;
14392       mode3 = insn_data[icode].operand[3].mode;
14393       mode4 = insn_data[icode].operand[4].mode;
14394 
14395       op0 = fixup_modeless_constant (op0, mode0);
14396 
14397       if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14398 	{
14399 	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
14400 	    op0 = copy_to_mode_reg (mode0, op0);
14401 	}
14402       else
14403 	{
14404 	  op0 = copy_to_reg (op0);
14405 	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14406 	}
14407 
14408       if (!insn_data[icode].operand[1].predicate (op1, mode1))
14409 	op1 = copy_to_mode_reg (mode1, op1);
14410 
14411       /* Force the memory operand to be addressed through a base register
14412 	 here.  We don't want to do this for the memory operands of other
14413 	 builtin functions.  */
14414       op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14415 
14416       if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14417 	op2 = copy_to_mode_reg (Pmode, op2);
14418 
14419       if (!insn_data[icode].operand[3].predicate (op3, mode3))
14420 	{
14421 	  error ("the fourth argument must be scale 1, 2, 4, 8");
14422 	  return const0_rtx;
14423 	}
14424 
14425       if (!insn_data[icode].operand[4].predicate (op4, mode4))
14426 	{
14427 	  error ("incorrect hint operand");
14428 	  return const0_rtx;
14429 	}
14430 
14431       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14432       if (! pat)
14433 	return const0_rtx;
14434 
14435       emit_insn (pat);
14436 
14437       return 0;
14438 
14439     case IX86_BUILTIN_XABORT:
14440       icode = CODE_FOR_xabort;
14441       arg0 = CALL_EXPR_ARG (exp, 0);
14442       op0 = expand_normal (arg0);
14443       mode0 = insn_data[icode].operand[0].mode;
14444       if (!insn_data[icode].operand[0].predicate (op0, mode0))
14445 	{
14446 	  error ("the argument to %<xabort%> intrinsic must "
14447 		 "be an 8-bit immediate");
14448 	  return const0_rtx;
14449 	}
14450       emit_insn (gen_xabort (op0));
14451       return 0;
14452 
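    /* rdssp only modifies its destination when CET shadow stacks are
       active (it executes as a NOP otherwise), so the source operand is
       pre-loaded with zero and the builtin then returns 0 on parts
       without CET.  */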
14453     case IX86_BUILTIN_RDSSPD:
14454     case IX86_BUILTIN_RDSSPQ:
14455       mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14456 
14457       if (target == 0
14458 	  || !register_operand (target, mode))
14459 	target = gen_reg_rtx (mode);
14460 
14461       op0 = force_reg (mode, const0_rtx);
14462 
14463       emit_insn (gen_rdssp (mode, target, op0));
14464       return target;
14465 
14466     case IX86_BUILTIN_INCSSPD:
14467     case IX86_BUILTIN_INCSSPQ:
14468       mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14469 
14470       arg0 = CALL_EXPR_ARG (exp, 0);
14471       op0 = expand_normal (arg0);
14472 
14473       op0 = force_reg (mode, op0);
14474 
14475       emit_insn (gen_incssp (mode, op0));
14476       return 0;
14477 
14478     case IX86_BUILTIN_HRESET:
14479       icode = CODE_FOR_hreset;
14480       arg0 = CALL_EXPR_ARG (exp, 0);
14481       op0 = expand_normal (arg0);
14482       op0 = force_reg (SImode, op0);
14483       emit_insn (gen_hreset (op0));
14484       return 0;
14485 
14486     case IX86_BUILTIN_RSTORSSP:
14487     case IX86_BUILTIN_CLRSSBSY:
14488       arg0 = CALL_EXPR_ARG (exp, 0);
14489       op0 = expand_normal (arg0);
14490       icode = (fcode == IX86_BUILTIN_RSTORSSP
14491 	       ? CODE_FOR_rstorssp
14492 	       : CODE_FOR_clrssbsy);
14493 
14494       if (!address_operand (op0, VOIDmode))
14495 	{
14496 	  op0 = convert_memory_address (Pmode, op0);
14497 	  op0 = copy_addr_to_reg (op0);
14498 	}
14499       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
14500       return 0;
14501 
14502     case IX86_BUILTIN_WRSSD:
14503     case IX86_BUILTIN_WRSSQ:
14504     case IX86_BUILTIN_WRUSSD:
14505     case IX86_BUILTIN_WRUSSQ:
14506       mode = ((fcode == IX86_BUILTIN_WRSSD
14507 	       || fcode == IX86_BUILTIN_WRUSSD)
14508 	      ? SImode : DImode);
14509 
14510       arg0 = CALL_EXPR_ARG (exp, 0);
14511       op0 = expand_normal (arg0);
14512       arg1 = CALL_EXPR_ARG (exp, 1);
14513       op1 = expand_normal (arg1);
14514 
14515       op0 = force_reg (mode, op0);
14516 
14517       if (!address_operand (op1, VOIDmode))
14518 	{
14519 	  op1 = convert_memory_address (Pmode, op1);
14520 	  op1 = copy_addr_to_reg (op1);
14521 	}
14522       op1 = gen_rtx_MEM (mode, op1);
14523 
14524       icode = ((fcode == IX86_BUILTIN_WRSSD
14525 		|| fcode == IX86_BUILTIN_WRSSQ)
14526 	       ? code_for_wrss (mode)
14527 	       : code_for_wruss (mode));
14528       emit_insn (GEN_FCN (icode) (op0, op1));
14529 
14530       return 0;
14531 
14532     default:
14533       break;
14534     }
14535 
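  /* Anything not handled explicitly above is expanded from the builtin
     description tables below, dispatched by function-code range.  */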
14536   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14537       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14538     {
14539       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14540       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14541 					       target);
14542     }
14543 
14544   if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14545       && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14546     {
14547       i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14548       return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14549 					       target);
14550     }
14551 
14552   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14553       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14554     {
14555       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14556       rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14557       rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14558       rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14559       int masked = 1;
14560       machine_mode mode, wide_mode, nar_mode;
14561 
14562       nar_mode  = V4SFmode;
14563       mode      = V16SFmode;
14564       wide_mode = V64SFmode;
14565       fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
14566       fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14567 
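      /* The AVX512_4FMAPS/4VNNIW builtins operate on a group of four
	 512-bit source registers; the four operands are packed into one
	 V64SFmode/V64SImode pseudo (one 512-bit piece per subreg) so the
	 pattern can require consecutive hard registers.  The masked
	 variants choose between zero- and merge-masking based on the
	 merge operand.  */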
14568       switch (fcode)
14569 	{
14570 	case IX86_BUILTIN_4FMAPS:
14571 	  fcn = gen_avx5124fmaddps_4fmaddps;
14572 	  masked = 0;
14573 	  goto v4fma_expand;
14574 
14575 	case IX86_BUILTIN_4DPWSSD:
14576 	  nar_mode  = V4SImode;
14577 	  mode      = V16SImode;
14578 	  wide_mode = V64SImode;
14579 	  fcn = gen_avx5124vnniw_vp4dpwssd;
14580 	  masked = 0;
14581 	  goto v4fma_expand;
14582 
14583 	case IX86_BUILTIN_4DPWSSDS:
14584 	  nar_mode  = V4SImode;
14585 	  mode      = V16SImode;
14586 	  wide_mode = V64SImode;
14587 	  fcn = gen_avx5124vnniw_vp4dpwssds;
14588 	  masked = 0;
14589 	  goto v4fma_expand;
14590 
14591 	case IX86_BUILTIN_4FNMAPS:
14592 	  fcn = gen_avx5124fmaddps_4fnmaddps;
14593 	  masked = 0;
14594 	  goto v4fma_expand;
14595 
14596 	case IX86_BUILTIN_4FNMAPS_MASK:
14597 	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
14598 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14599 	  goto v4fma_expand;
14600 
14601 	case IX86_BUILTIN_4DPWSSD_MASK:
14602 	  nar_mode  = V4SImode;
14603 	  mode      = V16SImode;
14604 	  wide_mode = V64SImode;
14605 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
14606 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14607 	  goto v4fma_expand;
14608 
14609 	case IX86_BUILTIN_4DPWSSDS_MASK:
14610 	  nar_mode  = V4SImode;
14611 	  mode      = V16SImode;
14612 	  wide_mode = V64SImode;
14613 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
14614 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14615 	  goto v4fma_expand;
14616 
14617 	case IX86_BUILTIN_4FMAPS_MASK:
14618 	  {
14619 	    tree args[4];
14620 	    rtx ops[4];
14621 	    rtx wide_reg;
14622 	    rtx accum;
14623 	    rtx addr;
14624 	    rtx mem;
14625 
14626 v4fma_expand:
14627 	    wide_reg = gen_reg_rtx (wide_mode);
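	    /* Pack the four 512-bit sources into WIDE_REG; operand I goes
	       into the 64-byte lane at offset I * 64.  */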
14628 	    for (i = 0; i < 4; i++)
14629 	      {
14630 		args[i] = CALL_EXPR_ARG (exp, i);
14631 		ops[i] = expand_normal (args[i]);
14632 
14633 		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14634 				ops[i]);
14635 	      }
14636 
14637 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14638 	    accum = force_reg (mode, accum);
14639 
14640 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14641 	    addr = force_reg (Pmode, addr);
14642 
14643 	    mem = gen_rtx_MEM (nar_mode, addr);
14644 
14645 	    target = gen_reg_rtx (mode);
14646 
14647 	    emit_move_insn (target, accum);
14648 
14649 	    if (! masked)
14650 	      emit_insn (fcn (target, accum, wide_reg, mem));
14651 	    else
14652 	      {
14653 		rtx merge, mask;
14654 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14655 
14656 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14657 
14658 		if (CONST_INT_P (mask))
14659 		  mask = fixup_modeless_constant (mask, HImode);
14660 
14661 		mask = force_reg (HImode, mask);
14662 
14663 		if (GET_MODE (mask) != HImode)
14664 		  mask = gen_rtx_SUBREG (HImode, mask, 0);
14665 
14666 		/* If merge is 0 then we're about to emit the z-masked variant.  */
14667 		if (const0_operand (merge, mode))
14668 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14669 		/* If merge is the same as accum then emit the merge-masked variant.  */
14670 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14671 		  {
14672 		    merge = force_reg (mode, merge);
14673 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14674 		  }
14675 		/* Merging with an unknown value can happen if we z-mask with -O0.  */
14676 		else
14677 		  {
14678 		    target = gen_reg_rtx (mode);
14679 		    emit_move_insn (target, merge);
14680 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14681 		  }
14682 	      }
14683 	    return target;
14684 	  }
14685 
14686 	case IX86_BUILTIN_4FNMASS:
14687 	  fcn = gen_avx5124fmaddps_4fnmaddss;
14688 	  masked = 0;
14689 	  goto s4fma_expand;
14690 
14691 	case IX86_BUILTIN_4FMASS:
14692 	  fcn = gen_avx5124fmaddps_4fmaddss;
14693 	  masked = 0;
14694 	  goto s4fma_expand;
14695 
14696 	case IX86_BUILTIN_4FNMASS_MASK:
14697 	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
14698 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
14699 	  goto s4fma_expand;
14700 
14701 	case IX86_BUILTIN_4FMASS_MASK:
14702 	  {
14703 	    tree args[4];
14704 	    rtx ops[4];
14705 	    rtx wide_reg;
14706 	    rtx accum;
14707 	    rtx addr;
14708 	    rtx mem;
14709 
14710 	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
14711 	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
14712 
14713 s4fma_expand:
14714 	    mode = V4SFmode;
14715 	    wide_reg = gen_reg_rtx (V64SFmode);
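	    /* Only the low scalar element of each source is used: copy it
	       into an SFmode pseudo and place it in the corresponding
	       64-byte lane of WIDE_REG through a paradoxical SUBREG.  */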
14716 	    for (i = 0; i < 4; i++)
14717 	      {
14718 		rtx tmp;
14719 		args[i] = CALL_EXPR_ARG (exp, i);
14720 		ops[i] = expand_normal (args[i]);
14721 
14722 		tmp = gen_reg_rtx (SFmode);
14723 		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
14724 
14725 		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
14726 				gen_rtx_SUBREG (V16SFmode, tmp, 0));
14727 	      }
14728 
14729 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14730 	    accum = force_reg (V4SFmode, accum);
14731 
14732 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14733 	    addr = force_reg (Pmode, addr);
14734 
14735 	    mem = gen_rtx_MEM (V4SFmode, addr);
14736 
14737 	    target = gen_reg_rtx (V4SFmode);
14738 
14739 	    emit_move_insn (target, accum);
14740 
14741 	    if (! masked)
14742 	      emit_insn (fcn (target, accum, wide_reg, mem));
14743 	    else
14744 	      {
14745 		rtx merge, mask;
14746 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14747 
14748 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14749 
14750 		if (CONST_INT_P (mask))
14751 		  mask = fixup_modeless_constant (mask, QImode);
14752 
14753 		mask = force_reg (QImode, mask);
14754 
14755 		if (GET_MODE (mask) != QImode)
14756 		  mask = gen_rtx_SUBREG (QImode, mask, 0);
14757 
14758 		/* If merge is 0 then we're about to emit the z-masked variant.  */
14759 		if (const0_operand (merge, mode))
14760 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14761 		/* If merge is the same as accum then emit the merge-masked
14762 		   variant.  */
14763 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14764 		  {
14765 		    merge = force_reg (mode, merge);
14766 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14767 		  }
14768 		/* Merging with an unknown value can happen if we z-mask
14769 		   with -O0.  */
14770 		else
14771 		  {
14772 		    target = gen_reg_rtx (mode);
14773 		    emit_move_insn (target, merge);
14774 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14775 		  }
14776 	      }
14777 	    return target;
14778 	  }
14779 	case IX86_BUILTIN_RDPID:
14780 	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
14781 						   target);
14782 	case IX86_BUILTIN_FABSQ:
14783 	case IX86_BUILTIN_COPYSIGNQ:
14784 	  if (!TARGET_SSE)
14785 	    /* Emit a normal call if SSE isn't available.  */
14786 	    return expand_call (exp, target, ignore);
14787 	  /* FALLTHRU */
14788 	default:
14789 	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
14790 	}
14791     }
14792 
14793   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
14794       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
14795     {
14796       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
14797       return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
14798     }
14799 
14800   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
14801       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
14802     {
14803       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
14804       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
14805     }
14806 
14807   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
14808       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
14809     {
14810       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
14811       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
14812     }
14813 
14814   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
14815       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
14816     {
14817       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
14818       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
14819     }
14820 
14821   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
14822       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
14823     {
14824       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
14825       const struct builtin_description *d = bdesc_multi_arg + i;
14826       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
14827 					    (enum ix86_builtin_func_type)
14828 					    d->flag, d->comparison);
14829     }
14830 
14831   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
14832       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
14833     {
14834       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
14835       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
14836 					       target);
14837     }
14838 
14839   gcc_unreachable ();
14840 }
14841 
14842 /* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
14843    fill target with val via vec_duplicate.  */
14844 
14845 static bool
14846 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
14847 {
14848   bool ok;
14849   rtx_insn *insn;
14850   rtx dup;
14851 
14852   /* First attempt to recognize VAL as-is.  */
14853   dup = gen_vec_duplicate (mode, val);
14854   insn = emit_insn (gen_rtx_SET (target, dup));
14855   if (recog_memoized (insn) < 0)
14856     {
14857       rtx_insn *seq;
14858       machine_mode innermode = GET_MODE_INNER (mode);
14859       rtx reg;
14860 
14861       /* If that fails, force VAL into a register.  */
14862 
14863       start_sequence ();
14864       reg = force_reg (innermode, val);
14865       if (GET_MODE (reg) != innermode)
14866 	reg = gen_lowpart (innermode, reg);
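      /* Patch the SET source of the already-emitted insn to use the
	 register, emit the forcing sequence before it, and then require
	 the insn to be recognizable.  */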
14867       SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
14868       seq = get_insns ();
14869       end_sequence ();
14870       if (seq)
14871 	emit_insn_before (seq, insn);
14872 
14873       ok = recog_memoized (insn) >= 0;
14874       gcc_assert (ok);
14875     }
14876   return true;
14877 }
14878 
14879 /* Get a vector mode of the same size as the original but with elements
14880    twice as wide.  This is only guaranteed to apply to integral vectors.  */
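/* E.g. V16QImode yields V8HImode, and V8HImode yields V4SImode.  */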
14881 
14882 static machine_mode
14883 get_mode_wider_vector (machine_mode o)
14884 {
14885   /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
14886   machine_mode n = GET_MODE_WIDER_MODE (o).require ();
14887   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
14888   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
14889   return n;
14890 }
14891 
14892 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
14893 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
14894 
14895 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
14896    with all elements equal to VAL.  Return true if successful.  */
14897 
14898 bool
14899 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
14900 				   rtx target, rtx val)
14901 {
14902   bool ok;
14903 
14904   switch (mode)
14905     {
14906     case E_V2SImode:
14907     case E_V2SFmode:
14908       if (!mmx_ok)
14909 	return false;
14910       /* FALLTHRU */
14911 
14912     case E_V4DFmode:
14913     case E_V4DImode:
14914     case E_V8SFmode:
14915     case E_V8SImode:
14916     case E_V2DFmode:
14917     case E_V2DImode:
14918     case E_V4SFmode:
14919     case E_V4SImode:
14920     case E_V16SImode:
14921     case E_V8DImode:
14922     case E_V16SFmode:
14923     case E_V8DFmode:
14924       return ix86_vector_duplicate_value (mode, target, val);
14925 
14926     case E_V4HImode:
14927       if (!mmx_ok)
14928 	return false;
14929       if (TARGET_SSE || TARGET_3DNOW_A)
14930 	{
14931 	  rtx x;
14932 
14933 	  val = gen_lowpart (SImode, val);
14934 	  x = gen_rtx_TRUNCATE (HImode, val);
14935 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
14936 	  emit_insn (gen_rtx_SET (target, x));
14937 	  return true;
14938 	}
14939       goto widen;
14940 
14941     case E_V2HImode:
14942       if (TARGET_SSE2)
14943 	{
14944 	  rtx x;
14945 
14946 	  val = gen_lowpart (SImode, val);
14947 	  x = gen_rtx_TRUNCATE (HImode, val);
14948 	  x = gen_rtx_VEC_DUPLICATE (mode, x);
14949 	  emit_insn (gen_rtx_SET (target, x));
14950 	  return true;
14951 	}
14952       return false;
14953 
14954     case E_V8QImode:
14955     case E_V4QImode:
14956       if (!mmx_ok)
14957 	return false;
14958       goto widen;
14959 
14960     case E_V8HImode:
14961     case E_V8HFmode:
14962       if (TARGET_AVX2)
14963 	return ix86_vector_duplicate_value (mode, target, val);
14964 
14965       if (TARGET_SSE2)
14966 	{
14967 	  struct expand_vec_perm_d dperm;
14968 	  rtx tmp1, tmp2;
14969 
14970 	permute:
14971 	  memset (&dperm, 0, sizeof (dperm));
14972 	  dperm.target = target;
14973 	  dperm.vmode = mode;
14974 	  dperm.nelt = GET_MODE_NUNITS (mode);
14975 	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
14976 	  dperm.one_operand_p = true;
14977 
14978 	  if (mode == V8HFmode)
14979 	    {
14980 	      tmp1 = force_reg (HFmode, val);
14981 	      tmp2 = gen_reg_rtx (mode);
14982 	      emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
14983 	      tmp1 = gen_lowpart (mode, tmp2);
14984 	    }
14985 	  else
14986 	    {
14987 	      /* Extend to SImode using a paradoxical SUBREG.  */
14988 	      tmp1 = gen_reg_rtx (SImode);
14989 	      emit_move_insn (tmp1, gen_lowpart (SImode, val));
14990 
14991 	      /* Insert the SImode value as
14992 		 low element of a V4SImode vector.  */
14993 	      tmp2 = gen_reg_rtx (V4SImode);
14994 	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
14995 	      tmp1 = gen_lowpart (mode, tmp2);
14996 	    }
14997 
14998 	  emit_move_insn (dperm.op0, tmp1);
14999 	  ok = (expand_vec_perm_1 (&dperm)
15000 		|| expand_vec_perm_broadcast_1 (&dperm));
15001 	  gcc_assert (ok);
15002 	  return ok;
15003 	}
15004       goto widen;
15005 
15006     case E_V16QImode:
15007       if (TARGET_AVX2)
15008 	return ix86_vector_duplicate_value (mode, target, val);
15009 
15010       if (TARGET_SSE2)
15011 	goto permute;
15012       goto widen;
15013 
15014     widen:
15015       /* Replicate the value once into the next wider mode and recurse.  */
15016       {
15017 	machine_mode smode, wsmode, wvmode;
15018 	rtx x;
15019 
15020 	smode = GET_MODE_INNER (mode);
15021 	wvmode = get_mode_wider_vector (mode);
15022 	wsmode = GET_MODE_INNER (wvmode);
15023 
15024 	val = convert_modes (wsmode, smode, val, true);
15025 
15026 	if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15027 	  emit_insn (gen_insv_1 (wsmode, val, val));
15028 	else
15029 	  {
15030 	    x = expand_simple_binop (wsmode, ASHIFT, val,
15031 				     GEN_INT (GET_MODE_BITSIZE (smode)),
15032 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
15033 	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15034 				       OPTAB_LIB_WIDEN);
15035 	  }
15036 
15037 	x = gen_reg_rtx (wvmode);
15038 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15039 	gcc_assert (ok);
15040 	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15041 	return ok;
15042       }
15043 
15044     case E_V16HImode:
15045     case E_V16HFmode:
15046     case E_V32QImode:
15047       if (TARGET_AVX2)
15048 	return ix86_vector_duplicate_value (mode, target, val);
15049       else
15050 	{
15051 	  machine_mode hvmode = (mode == V16HImode ? V8HImode
15052 				 : mode == V16HFmode ? V8HFmode
15053 				 : V16QImode);
15054 	  rtx x = gen_reg_rtx (hvmode);
15055 
15056 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15057 	  gcc_assert (ok);
15058 
15059 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
15060 	  emit_insn (gen_rtx_SET (target, x));
15061 	}
15062       return true;
15063 
15064     case E_V32HImode:
15065     case E_V32HFmode:
15066     case E_V64QImode:
15067       if (TARGET_AVX512BW)
15068 	return ix86_vector_duplicate_value (mode, target, val);
15069       else
15070 	{
15071 	  machine_mode hvmode = (mode == V32HImode ? V16HImode
15072 				 : mode == V32HFmode ? V16HFmode
15073 				 : V32QImode);
15074 	  rtx x = gen_reg_rtx (hvmode);
15075 
15076 	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15077 	  gcc_assert (ok);
15078 
15079 	  x = gen_rtx_VEC_CONCAT (mode, x, x);
15080 	  emit_insn (gen_rtx_SET (target, x));
15081 	}
15082       return true;
15083 
15084     default:
15085       return false;
15086     }
15087 }
15088 
15089 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
15090    whose ONE_VAR element is VAR, and other elements are zero.  Return true
15091    if successful.  */
15092 
15093 static bool
15094 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15095 				     rtx target, rtx var, int one_var)
15096 {
15097   machine_mode vsimode;
15098   rtx new_target;
15099   rtx x, tmp;
15100   bool use_vector_set = false;
15101   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15102 
15103   switch (mode)
15104     {
15105     case E_V2DImode:
15106       /* For SSE4.1, we normally use vector set.  But if the second
15107 	 element is zero and inter-unit moves are OK, we use movq
15108 	 instead.  */
15109       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15110 			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
15111 			     && one_var == 0));
15112       break;
15113     case E_V16QImode:
15114     case E_V4SImode:
15115     case E_V4SFmode:
15116       use_vector_set = TARGET_SSE4_1;
15117       break;
15118     case E_V8HImode:
15119       use_vector_set = TARGET_SSE2;
15120       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15121 	? gen_vec_setv8hi_0 : NULL;
15122       break;
15123     case E_V8QImode:
15124       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15125       break;
15126     case E_V4HImode:
15127       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15128       break;
15129     case E_V4QImode:
15130       use_vector_set = TARGET_SSE4_1;
15131       break;
15132     case E_V32QImode:
15133       use_vector_set = TARGET_AVX;
15134       break;
15135     case E_V16HImode:
15136       use_vector_set = TARGET_AVX;
15137       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15138 	? gen_vec_setv16hi_0 : NULL;
15139       break;
15140     case E_V8SImode:
15141       use_vector_set = TARGET_AVX;
15142       gen_vec_set_0 = gen_vec_setv8si_0;
15143       break;
15144     case E_V8SFmode:
15145       use_vector_set = TARGET_AVX;
15146       gen_vec_set_0 = gen_vec_setv8sf_0;
15147       break;
15148     case E_V4DFmode:
15149       use_vector_set = TARGET_AVX;
15150       gen_vec_set_0 = gen_vec_setv4df_0;
15151       break;
15152     case E_V4DImode:
15153       /* Use ix86_expand_vector_set in 64bit mode only.  */
15154       use_vector_set = TARGET_AVX && TARGET_64BIT;
15155       gen_vec_set_0 = gen_vec_setv4di_0;
15156       break;
15157     case E_V16SImode:
15158       use_vector_set = TARGET_AVX512F && one_var == 0;
15159       gen_vec_set_0 = gen_vec_setv16si_0;
15160       break;
15161     case E_V16SFmode:
15162       use_vector_set = TARGET_AVX512F && one_var == 0;
15163       gen_vec_set_0 = gen_vec_setv16sf_0;
15164       break;
15165     case E_V8DFmode:
15166       use_vector_set = TARGET_AVX512F && one_var == 0;
15167       gen_vec_set_0 = gen_vec_setv8df_0;
15168       break;
15169     case E_V8DImode:
15170       /* Use ix86_expand_vector_set in 64bit mode only.  */
15171       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15172       gen_vec_set_0 = gen_vec_setv8di_0;
15173       break;
15174     case E_V8HFmode:
15175       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15176       gen_vec_set_0 = gen_vec_setv8hf_0;
15177       break;
15178     case E_V16HFmode:
15179       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15180       gen_vec_set_0 = gen_vec_setv16hf_0;
15181       break;
15182     case E_V32HFmode:
15183       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15184       gen_vec_set_0 = gen_vec_setv32hf_0;
15185       break;
15186     case E_V32HImode:
15187       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15188       gen_vec_set_0 = gen_vec_setv32hi_0;
15189     default:
15190       break;
15191     }
15192 
15193   if (use_vector_set)
15194     {
15195       if (gen_vec_set_0 && one_var == 0)
15196 	{
15197 	  var = force_reg (GET_MODE_INNER (mode), var);
15198 	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15199 	  return true;
15200 	}
15201       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15202       var = force_reg (GET_MODE_INNER (mode), var);
15203       ix86_expand_vector_set (mmx_ok, target, var, one_var);
15204       return true;
15205     }
15206 
15207   switch (mode)
15208     {
15209     case E_V2SFmode:
15210     case E_V2SImode:
15211       if (!mmx_ok)
15212 	return false;
15213       /* FALLTHRU */
15214 
15215     case E_V2DFmode:
15216     case E_V2DImode:
15217       if (one_var != 0)
15218 	return false;
15219       var = force_reg (GET_MODE_INNER (mode), var);
15220       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15221       emit_insn (gen_rtx_SET (target, x));
15222       return true;
15223 
15224     case E_V4SFmode:
15225     case E_V4SImode:
15226       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15227 	new_target = gen_reg_rtx (mode);
15228       else
15229 	new_target = target;
15230       var = force_reg (GET_MODE_INNER (mode), var);
15231       x = gen_rtx_VEC_DUPLICATE (mode, var);
15232       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15233       emit_insn (gen_rtx_SET (new_target, x));
15234       if (one_var != 0)
15235 	{
15236 	  /* We need to shuffle the value to the correct position, so
15237 	     create a new pseudo to store the intermediate result.  */
15238 
15239 	  /* With SSE2, we can use the integer shuffle insns.  */
15240 	  if (mode != V4SFmode && TARGET_SSE2)
15241 	    {
15242 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15243 					    const1_rtx,
15244 					    GEN_INT (one_var == 1 ? 0 : 1),
15245 					    GEN_INT (one_var == 2 ? 0 : 1),
15246 					    GEN_INT (one_var == 3 ? 0 : 1)));
15247 	      if (target != new_target)
15248 		emit_move_insn (target, new_target);
15249 	      return true;
15250 	    }
15251 
15252 	  /* Otherwise convert the intermediate result to V4SFmode and
15253 	     use the SSE1 shuffle instructions.  */
15254 	  if (mode != V4SFmode)
15255 	    {
15256 	      tmp = gen_reg_rtx (V4SFmode);
15257 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15258 	    }
15259 	  else
15260 	    tmp = new_target;
15261 
15262 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15263 				       const1_rtx,
15264 				       GEN_INT (one_var == 1 ? 0 : 1),
15265 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
15266 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15267 
15268 	  if (mode != V4SFmode)
15269 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15270 	  else if (tmp != target)
15271 	    emit_move_insn (target, tmp);
15272 	}
15273       else if (target != new_target)
15274 	emit_move_insn (target, new_target);
15275       return true;
15276 
15277     case E_V8HImode:
15278     case E_V16QImode:
15279       vsimode = V4SImode;
15280       goto widen;
15281     case E_V4HImode:
15282     case E_V8QImode:
15283       if (!mmx_ok)
15284 	return false;
15285       vsimode = V2SImode;
15286       goto widen;
15287     widen:
15288       if (one_var != 0)
15289 	return false;
15290 
15291       /* Zero extend the variable element to SImode and recurse.  */
15292       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15293 
15294       x = gen_reg_rtx (vsimode);
15295       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15296 						var, one_var))
15297 	gcc_unreachable ();
15298 
15299       emit_move_insn (target, gen_lowpart (mode, x));
15300       return true;
15301 
15302     default:
15303       return false;
15304     }
15305 }
15306 
15307 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
15308    consisting of the values in VALS.  It is known that all elements
15309    except ONE_VAR are constants.  Return true if successful.  */
15310 
15311 static bool
15312 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15313 				 rtx target, rtx vals, int one_var)
15314 {
15315   rtx var = XVECEXP (vals, 0, one_var);
15316   machine_mode wmode;
15317   rtx const_vec, x;
15318 
15319   const_vec = copy_rtx (vals);
15320   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15321   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15322 
15323   switch (mode)
15324     {
15325     case E_V2DFmode:
15326     case E_V2DImode:
15327     case E_V2SFmode:
15328     case E_V2SImode:
15329       /* For the two element vectors, it's just as easy to use
15330 	 the general case.  */
15331       return false;
15332 
15333     case E_V4DImode:
15334       /* Use ix86_expand_vector_set in 64bit mode only.  */
15335       if (!TARGET_64BIT)
15336 	return false;
15337       /* FALLTHRU */
15338     case E_V8HFmode:
15339     case E_V16HFmode:
15340     case E_V4DFmode:
15341     case E_V8SFmode:
15342     case E_V8SImode:
15343     case E_V16HImode:
15344     case E_V32QImode:
15345     case E_V4SFmode:
15346     case E_V4SImode:
15347     case E_V8HImode:
15348     case E_V4HImode:
15349       break;
15350 
15351     case E_V16QImode:
15352       if (TARGET_SSE4_1)
15353 	break;
15354       wmode = V8HImode;
15355       goto widen;
15356     case E_V8QImode:
15357       if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15358 	break;
15359       wmode = V4HImode;
15360       goto widen;
15361     case E_V4QImode:
15362       if (TARGET_SSE4_1)
15363 	break;
15364       wmode = V2HImode;
15365     widen:
15366       /* There's no way to set one QImode entry easily.  Combine
15367 	 the variable value with its adjacent constant value, and
15368 	 promote to an HImode set.  */
15369       x = XVECEXP (vals, 0, one_var ^ 1);
15370       if (one_var & 1)
15371 	{
15372 	  var = convert_modes (HImode, QImode, var, true);
15373 	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15374 				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
15375 	  x = GEN_INT (INTVAL (x) & 0xff);
15376 	}
15377       else
15378 	{
15379 	  var = convert_modes (HImode, QImode, var, true);
15380 	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
15381 	}
15382       if (x != const0_rtx)
15383 	var = expand_simple_binop (HImode, IOR, var, x, var,
15384 				   1, OPTAB_LIB_WIDEN);
15385 
15386       x = gen_reg_rtx (wmode);
15387       emit_move_insn (x, gen_lowpart (wmode, const_vec));
15388       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15389 
15390       emit_move_insn (target, gen_lowpart (mode, x));
15391       return true;
15392 
15393     default:
15394       return false;
15395     }
15396 
15397   emit_move_insn (target, const_vec);
15398   ix86_expand_vector_set (mmx_ok, target, var, one_var);
15399   return true;
15400 }
15401 
15402 /* A subroutine of ix86_expand_vector_init_general.  Use vector
15403    concatenate to handle the most general case: all values variable,
15404    and none identical.  */
15405 
15406 static void
15407 ix86_expand_vector_init_concat (machine_mode mode,
15408 				rtx target, rtx *ops, int n)
15409 {
15410   machine_mode half_mode = VOIDmode;
15411   rtx half[2];
15412   rtvec v;
15413   int i, j;
15414 
15415   switch (n)
15416     {
15417     case 2:
15418       switch (mode)
15419 	{
15420 	case E_V32HFmode:
15421 	  half_mode = V16HFmode;
15422 	  break;
15423 	case E_V16SImode:
15424 	  half_mode = V8SImode;
15425 	  break;
15426 	case E_V16SFmode:
15427 	  half_mode = V8SFmode;
15428 	  break;
15429 	case E_V8DImode:
15430 	  half_mode = V4DImode;
15431 	  break;
15432 	case E_V8DFmode:
15433 	  half_mode = V4DFmode;
15434 	  break;
15435 	case E_V16HFmode:
15436 	  half_mode = V8HFmode;
15437 	  break;
15438 	case E_V8SImode:
15439 	  half_mode = V4SImode;
15440 	  break;
15441 	case E_V8SFmode:
15442 	  half_mode = V4SFmode;
15443 	  break;
15444 	case E_V4DImode:
15445 	  half_mode = V2DImode;
15446 	  break;
15447 	case E_V4DFmode:
15448 	  half_mode = V2DFmode;
15449 	  break;
15450 	case E_V4SImode:
15451 	  half_mode = V2SImode;
15452 	  break;
15453 	case E_V4SFmode:
15454 	  half_mode = V2SFmode;
15455 	  break;
15456 	case E_V2DImode:
15457 	  half_mode = DImode;
15458 	  break;
15459 	case E_V2SImode:
15460 	  half_mode = SImode;
15461 	  break;
15462 	case E_V2DFmode:
15463 	  half_mode = DFmode;
15464 	  break;
15465 	case E_V2SFmode:
15466 	  half_mode = SFmode;
15467 	  break;
15468 	default:
15469 	  gcc_unreachable ();
15470 	}
15471 
15472       if (!register_operand (ops[1], half_mode))
15473 	ops[1] = force_reg (half_mode, ops[1]);
15474       if (!register_operand (ops[0], half_mode))
15475 	ops[0] = force_reg (half_mode, ops[0]);
15476       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15477 							  ops[1])));
15478       break;
15479 
15480     case 4:
15481       switch (mode)
15482 	{
15483 	case E_V4DImode:
15484 	  half_mode = V2DImode;
15485 	  break;
15486 	case E_V4DFmode:
15487 	  half_mode = V2DFmode;
15488 	  break;
15489 	case E_V4SImode:
15490 	  half_mode = V2SImode;
15491 	  break;
15492 	case E_V4SFmode:
15493 	  half_mode = V2SFmode;
15494 	  break;
15495 	default:
15496 	  gcc_unreachable ();
15497 	}
15498       goto half;
15499 
15500     case 8:
15501       switch (mode)
15502 	{
15503 	case E_V8DImode:
15504 	  half_mode = V4DImode;
15505 	  break;
15506 	case E_V8DFmode:
15507 	  half_mode = V4DFmode;
15508 	  break;
15509 	case E_V8SImode:
15510 	  half_mode = V4SImode;
15511 	  break;
15512 	case E_V8SFmode:
15513 	  half_mode = V4SFmode;
15514 	  break;
15515 	default:
15516 	  gcc_unreachable ();
15517 	}
15518       goto half;
15519 
15520     case 16:
15521       switch (mode)
15522 	{
15523 	case E_V16SImode:
15524 	  half_mode = V8SImode;
15525 	  break;
15526 	case E_V16SFmode:
15527 	  half_mode = V8SFmode;
15528 	  break;
15529 	default:
15530 	  gcc_unreachable ();
15531 	}
15532       goto half;
15533 
15534 half:
15535       /* FIXME: We process inputs backward to help RA.  PR 36222.  */
15536       i = n - 1;
15537       for (j = 1; j != -1; j--)
15538 	{
15539 	  half[j] = gen_reg_rtx (half_mode);
15540 	  switch (n >> 1)
15541 	    {
15542 	    case 2:
15543 	      v = gen_rtvec (2, ops[i-1], ops[i]);
15544 	      i -= 2;
15545 	      break;
15546 	    case 4:
15547 	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15548 	      i -= 4;
15549 	      break;
15550 	    case 8:
15551 	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15552 			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
15553 	      i -= 8;
15554 	      break;
15555 	    default:
15556 	      gcc_unreachable ();
15557 	    }
15558 	  ix86_expand_vector_init (false, half[j],
15559 				   gen_rtx_PARALLEL (half_mode, v));
15560 	}
15561 
15562       ix86_expand_vector_init_concat (mode, target, half, 2);
15563       break;
15564 
15565     default:
15566       gcc_unreachable ();
15567     }
15568 }
15569 
15570 /* A subroutine of ix86_expand_vector_init_general.  Use vector
15571    interleave to handle the most general case: all values variable,
15572    and none identical.  */
15573 
15574 static void
15575 ix86_expand_vector_init_interleave (machine_mode mode,
15576 				    rtx target, rtx *ops, int n)
15577 {
15578   machine_mode first_imode, second_imode, third_imode, inner_mode;
15579   int i, j;
15580   rtx op, op0, op1;
15581   rtx (*gen_load_even) (rtx, rtx, rtx);
15582   rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15583   rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15584 
15585   switch (mode)
15586     {
15587     case E_V8HFmode:
15588       gen_load_even = gen_vec_interleave_lowv8hf;
15589       gen_interleave_first_low = gen_vec_interleave_lowv4si;
15590       gen_interleave_second_low = gen_vec_interleave_lowv2di;
15591       inner_mode = HFmode;
15592       first_imode = V4SImode;
15593       second_imode = V2DImode;
15594       third_imode = VOIDmode;
15595       break;
15596     case E_V8HImode:
15597       gen_load_even = gen_vec_setv8hi;
15598       gen_interleave_first_low = gen_vec_interleave_lowv4si;
15599       gen_interleave_second_low = gen_vec_interleave_lowv2di;
15600       inner_mode = HImode;
15601       first_imode = V4SImode;
15602       second_imode = V2DImode;
15603       third_imode = VOIDmode;
15604       break;
15605     case E_V16QImode:
15606       gen_load_even = gen_vec_setv16qi;
15607       gen_interleave_first_low = gen_vec_interleave_lowv8hi;
15608       gen_interleave_second_low = gen_vec_interleave_lowv4si;
15609       inner_mode = QImode;
15610       first_imode = V8HImode;
15611       second_imode = V4SImode;
15612       third_imode = V2DImode;
15613       break;
15614     default:
15615       gcc_unreachable ();
15616     }
15617 
15618   for (i = 0; i < n; i++)
15619     {
15620       op = ops [i + i];
15621       if (inner_mode == HFmode)
15622 	{
15623 	  rtx even, odd;
15624 	  /* Use vpunpcklwd to pack two HFmode values.  */
15625 	  op0 = gen_reg_rtx (V8HFmode);
15626 	  even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
15627 	  odd = lowpart_subreg (V8HFmode,
15628 				force_reg (HFmode, ops[i + i + 1]),
15629 				HFmode);
15630 	  emit_insn (gen_load_even (op0, even, odd));
15631 	}
15632       else
15633 	{
15634 	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
15635 	  op0 = gen_reg_rtx (SImode);
15636 	  emit_move_insn (op0, gen_lowpart (SImode, op));
15637 
15638 	  /* Insert the SImode value as low element of V4SImode vector.  */
15639 	  op1 = gen_reg_rtx (V4SImode);
15640 	  op0 = gen_rtx_VEC_MERGE (V4SImode,
15641 				   gen_rtx_VEC_DUPLICATE (V4SImode,
15642 							  op0),
15643 				   CONST0_RTX (V4SImode),
15644 				   const1_rtx);
15645 	  emit_insn (gen_rtx_SET (op1, op0));
15646 
15647 	  /* Cast the V4SImode vector back to a vector in the original mode.  */
15648 	  op0 = gen_reg_rtx (mode);
15649 	  emit_move_insn (op0, gen_lowpart (mode, op1));
15650 
15651 	  /* Load even elements into the second position.  */
15652 	  emit_insn (gen_load_even (op0,
15653 				    force_reg (inner_mode,
15654 					       ops[i + i + 1]),
15655 				    const1_rtx));
15656 	}
15657 
15658       /* Cast vector to FIRST_IMODE vector.  */
15659       ops[i] = gen_reg_rtx (first_imode);
15660       emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
15661     }
15662 
15663   /* Interleave low FIRST_IMODE vectors.  */
15664   for (i = j = 0; i < n; i += 2, j++)
15665     {
15666       op0 = gen_reg_rtx (first_imode);
15667       emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
15668 
15669       /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
15670       ops[j] = gen_reg_rtx (second_imode);
15671       emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
15672     }
15673 
15674   /* Interleave low SECOND_IMODE vectors.  */
15675   switch (second_imode)
15676     {
15677     case E_V4SImode:
15678       for (i = j = 0; i < n / 2; i += 2, j++)
15679 	{
15680 	  op0 = gen_reg_rtx (second_imode);
15681 	  emit_insn (gen_interleave_second_low (op0, ops[i],
15682 						ops[i + 1]));
15683 
15684 	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
15685 	     vector.  */
15686 	  ops[j] = gen_reg_rtx (third_imode);
15687 	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
15688 	}
15689       second_imode = V2DImode;
15690       gen_interleave_second_low = gen_vec_interleave_lowv2di;
15691       /* FALLTHRU */
15692 
15693     case E_V2DImode:
15694       op0 = gen_reg_rtx (second_imode);
15695       emit_insn (gen_interleave_second_low (op0, ops[0],
15696 					    ops[1]));
15697 
15698       /* Cast the SECOND_IMODE vector back to a vector in the original
15699 	 mode.  */
15700       emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
15701       break;
15702 
15703     default:
15704       gcc_unreachable ();
15705     }
15706 }
15707 
15708 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
15709    all values variable, and none identical.  */
15710 
15711 static void
15712 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
15713 				 rtx target, rtx vals)
15714 {
15715   rtx ops[64], op0, op1, op2, op3, op4, op5;
15716   machine_mode half_mode = VOIDmode;
15717   machine_mode quarter_mode = VOIDmode;
15718   int n, i;
15719 
15720   switch (mode)
15721     {
15722     case E_V2SFmode:
15723     case E_V2SImode:
15724       if (!mmx_ok && !TARGET_SSE)
15725 	break;
15726       /* FALLTHRU */
15727 
15728     case E_V16SImode:
15729     case E_V16SFmode:
15730     case E_V8DFmode:
15731     case E_V8DImode:
15732     case E_V8SFmode:
15733     case E_V8SImode:
15734     case E_V4DFmode:
15735     case E_V4DImode:
15736     case E_V4SFmode:
15737     case E_V4SImode:
15738     case E_V2DFmode:
15739     case E_V2DImode:
15740       n = GET_MODE_NUNITS (mode);
15741       for (i = 0; i < n; i++)
15742 	ops[i] = XVECEXP (vals, 0, i);
15743       ix86_expand_vector_init_concat (mode, target, ops, n);
15744       return;
15745 
15746     case E_V2TImode:
15747       for (i = 0; i < 2; i++)
15748 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15749       op0 = gen_reg_rtx (V4DImode);
15750       ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
15751       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15752       return;
15753 
15754     case E_V4TImode:
15755       for (i = 0; i < 4; i++)
15756 	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15757       ops[4] = gen_reg_rtx (V4DImode);
15758       ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
15759       ops[5] = gen_reg_rtx (V4DImode);
15760       ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
15761       op0 = gen_reg_rtx (V8DImode);
15762       ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
15763       emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15764       return;
15765 
15766     case E_V32QImode:
15767       half_mode = V16QImode;
15768       goto half;
15769 
15770     case E_V16HImode:
15771       half_mode = V8HImode;
15772       goto half;
15773 
15774     case E_V16HFmode:
15775       half_mode = V8HFmode;
15776       goto half;
15777 
15778 half:
15779       n = GET_MODE_NUNITS (mode);
15780       for (i = 0; i < n; i++)
15781 	ops[i] = XVECEXP (vals, 0, i);
15782       op0 = gen_reg_rtx (half_mode);
15783       op1 = gen_reg_rtx (half_mode);
15784       ix86_expand_vector_init_interleave (half_mode, op0, ops,
15785 					  n >> 2);
15786       ix86_expand_vector_init_interleave (half_mode, op1,
15787 					  &ops [n >> 1], n >> 2);
15788       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
15789       return;
15790 
15791     case E_V64QImode:
15792       quarter_mode = V16QImode;
15793       half_mode = V32QImode;
15794       goto quarter;
15795 
15796     case E_V32HImode:
15797       quarter_mode = V8HImode;
15798       half_mode = V16HImode;
15799       goto quarter;
15800 
15801     case E_V32HFmode:
15802       quarter_mode = V8HFmode;
15803       half_mode = V16HFmode;
15804       goto quarter;
15805 
15806 quarter:
15807       n = GET_MODE_NUNITS (mode);
15808       for (i = 0; i < n; i++)
15809 	ops[i] = XVECEXP (vals, 0, i);
15810       op0 = gen_reg_rtx (quarter_mode);
15811       op1 = gen_reg_rtx (quarter_mode);
15812       op2 = gen_reg_rtx (quarter_mode);
15813       op3 = gen_reg_rtx (quarter_mode);
15814       op4 = gen_reg_rtx (half_mode);
15815       op5 = gen_reg_rtx (half_mode);
15816       ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
15817 					  n >> 3);
15818       ix86_expand_vector_init_interleave (quarter_mode, op1,
15819 					  &ops [n >> 2], n >> 3);
15820       ix86_expand_vector_init_interleave (quarter_mode, op2,
15821 					  &ops [n >> 1], n >> 3);
15822       ix86_expand_vector_init_interleave (quarter_mode, op3,
15823 					  &ops [(n >> 1) | (n >> 2)], n >> 3);
15824       emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
15825       emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
15826       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
15827       return;
15828 
15829     case E_V16QImode:
15830       if (!TARGET_SSE4_1)
15831 	break;
15832       /* FALLTHRU */
15833 
15834     case E_V8HImode:
15835       if (!TARGET_SSE2)
15836 	break;
15837 
15838       /* Don't use ix86_expand_vector_init_interleave if we can't
15839 	 move from GPR to SSE register directly.  */
15840       if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
15841 	break;
15842       /* FALLTHRU */
15843 
15844     case E_V8HFmode:
15845 
15846       n = GET_MODE_NUNITS (mode);
15847       for (i = 0; i < n; i++)
15848 	ops[i] = XVECEXP (vals, 0, i);
15849       ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
15850       return;
15851 
15852     case E_V4HImode:
15853     case E_V8QImode:
15854 
15855     case E_V2HImode:
15856     case E_V4QImode:
15857       break;
15858 
15859     default:
15860       gcc_unreachable ();
15861     }
15862 
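  /* Otherwise pack N_ELT_PER_WORD elements at a time into integer words,
     lowest-indexed element in the least significant bits, and build the
     vector from those words.  */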
15863     {
15864       int i, j, n_elts, n_words, n_elt_per_word;
15865       machine_mode tmp_mode, inner_mode;
15866       rtx words[4], shift;
15867 
15868       tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
15869 
15870       inner_mode = GET_MODE_INNER (mode);
15871       n_elts = GET_MODE_NUNITS (mode);
15872       n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
15873       n_elt_per_word = n_elts / n_words;
15874       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
15875 
15876       for (i = 0; i < n_words; ++i)
15877 	{
15878 	  rtx word = NULL_RTX;
15879 
15880 	  for (j = 0; j < n_elt_per_word; ++j)
15881 	    {
15882 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
15883 	      elt = convert_modes (tmp_mode, inner_mode, elt, true);
15884 
15885 	      if (j == 0)
15886 		word = elt;
15887 	      else
15888 		{
15889 		  word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
15890 					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
15891 		  word = expand_simple_binop (tmp_mode, IOR, word, elt,
15892 					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
15893 		}
15894 	    }
15895 
15896 	  words[i] = word;
15897 	}
15898 
15899       if (n_words == 1)
15900 	emit_move_insn (target, gen_lowpart (mode, words[0]));
15901       else if (n_words == 2)
15902 	{
15903 	  rtx tmp = gen_reg_rtx (mode);
15904 	  emit_clobber (tmp);
15905 	  emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
15906 	  emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
15907 	  emit_move_insn (target, tmp);
15908 	}
15909       else if (n_words == 4)
15910 	{
15911 	  rtx tmp = gen_reg_rtx (V4SImode);
15912 	  gcc_assert (tmp_mode == SImode);
15913 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
15914 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
15915 	  emit_move_insn (target, gen_lowpart (mode, tmp));
15916 	}
15917       else
15918 	gcc_unreachable ();
15919     }
15920 }
15921 
15922 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
15923    instructions unless MMX_OK is true.  */
15924 
15925 void
15926 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
15927 {
15928   machine_mode mode = GET_MODE (target);
15929   machine_mode inner_mode = GET_MODE_INNER (mode);
15930   int n_elts = GET_MODE_NUNITS (mode);
15931   int n_var = 0, one_var = -1;
15932   bool all_same = true, all_const_zero = true;
15933   int i;
15934   rtx x;
15935 
15936   /* First, handle initialization from vector elements.  */
15937   if (n_elts != XVECLEN (vals, 0))
15938     {
15939       rtx subtarget = target;
15940       x = XVECEXP (vals, 0, 0);
15941       gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
15942       if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
15943 	{
15944 	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
15945 	  if (inner_mode == QImode
15946 	      || inner_mode == HImode
15947 	      || inner_mode == TImode
15948 	      || inner_mode == HFmode)
15949 	    {
15950 	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
15951 	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
15952 	      n_bits /= GET_MODE_SIZE (elt_mode);
15953 	      mode = mode_for_vector (elt_mode, n_bits).require ();
15954 	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
15955 	      ops[0] = gen_lowpart (inner_mode, ops[0]);
15956 	      ops[1] = gen_lowpart (inner_mode, ops[1]);
15957 	      subtarget = gen_reg_rtx (mode);
15958 	    }
15959 	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
15960 	  if (subtarget != target)
15961 	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
15962 	  return;
15963 	}
15964       gcc_unreachable ();
15965     }
15966 
15967   for (i = 0; i < n_elts; ++i)
15968     {
15969       x = XVECEXP (vals, 0, i);
15970       if (!(CONST_SCALAR_INT_P (x)
15971 	    || CONST_DOUBLE_P (x)
15972 	    || CONST_FIXED_P (x)))
15973 	n_var++, one_var = i;
15974       else if (x != CONST0_RTX (inner_mode))
15975 	all_const_zero = false;
15976       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
15977 	all_same = false;
15978     }
15979 
15980   /* Constants are best loaded from the constant pool.  */
15981   if (n_var == 0)
15982     {
15983       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
15984       return;
15985     }
15986 
15987   /* If all values are identical, broadcast the value.  */
15988   if (all_same
15989       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
15990 					    XVECEXP (vals, 0, 0)))
15991     return;
15992 
15993   /* Values where only one field is non-constant are best loaded from
15994      the pool and overwritten via move later.  */
15995   if (n_var == 1)
15996     {
15997       if (all_const_zero
15998 	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
15999 						  XVECEXP (vals, 0, one_var),
16000 						  one_var))
16001 	return;
16002 
16003       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16004 	return;
16005     }
16006 
16007   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16008 }
16009 
16010 /* Implemented as
16011    V setg (V v, int idx, T val)
16012    {
16013      V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16014      V valv = (V){val, val, val, val, val, val, val, val};
16015      V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16016      v = (v & ~mask) | (valv & mask);
16017      return v;
16018    }.  */
16019 void
16020 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16021 {
16022   rtx vec[64];
16023   machine_mode mode = GET_MODE (target);
16024   machine_mode cmp_mode = mode;
16025   int n_elts = GET_MODE_NUNITS (mode);
16026   rtx valv, idxv, constv, idx_tmp;
16027   bool ok = false;
16028 
16029   /* 512-bit vector byte/word broadcast and comparison are only available
16030      under TARGET_AVX512BW; without it, break the 512-bit vector into two
16031      256-bit vectors.  */
16032   if ((mode == V32HImode || mode == V32HFmode || mode == V64QImode)
16033       && !TARGET_AVX512BW)
16034     {
16035       gcc_assert (TARGET_AVX512F);
16036       rtx vhi, vlo, idx_hi;
16037       machine_mode half_mode;
16038       rtx (*extract_hi)(rtx, rtx);
16039       rtx (*extract_lo)(rtx, rtx);
16040 
16041       if (mode == V32HImode)
16042 	{
16043 	  half_mode = V16HImode;
16044 	  extract_hi = gen_vec_extract_hi_v32hi;
16045 	  extract_lo = gen_vec_extract_lo_v32hi;
16046 	}
16047       else if (mode == V32HFmode)
16048 	{
16049 	  half_mode = V16HFmode;
16050 	  extract_hi = gen_vec_extract_hi_v32hf;
16051 	  extract_lo = gen_vec_extract_lo_v32hf;
16052 	}
16053       else
16054 	{
16055 	  half_mode = V32QImode;
16056 	  extract_hi = gen_vec_extract_hi_v64qi;
16057 	  extract_lo = gen_vec_extract_lo_v64qi;
16058 	}
16059 
16060       vhi = gen_reg_rtx (half_mode);
16061       vlo = gen_reg_rtx (half_mode);
16062       idx_hi = gen_reg_rtx (GET_MODE (idx));
16063       emit_insn (extract_hi (vhi, target));
16064       emit_insn (extract_lo (vlo, target));
16065       vec[0] = idx_hi;
16066       vec[1] = idx;
16067       vec[2] = GEN_INT (n_elts/2);
16068       ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
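      /* Set the element in both halves; the equality compare in the
	 recursive calls only touches the lane whose index matches, so the
	 half that does not contain IDX is left unchanged.  */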
16069       ix86_expand_vector_set_var (vhi, val, idx_hi);
16070       ix86_expand_vector_set_var (vlo, val, idx);
16071       emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16072       return;
16073     }
16074 
16075   if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16076     {
16077       switch (mode)
16078 	{
16079 	case E_V2DFmode:
16080 	  cmp_mode = V2DImode;
16081 	  break;
16082 	case E_V4DFmode:
16083 	  cmp_mode = V4DImode;
16084 	  break;
16085 	case E_V8DFmode:
16086 	  cmp_mode = V8DImode;
16087 	  break;
16088 	case E_V2SFmode:
16089 	  cmp_mode = V2SImode;
16090 	  break;
16091 	case E_V4SFmode:
16092 	  cmp_mode = V4SImode;
16093 	  break;
16094 	case E_V8SFmode:
16095 	  cmp_mode = V8SImode;
16096 	  break;
16097 	case E_V16SFmode:
16098 	  cmp_mode = V16SImode;
16099 	  break;
16100 	case E_V8HFmode:
16101 	  cmp_mode = V8HImode;
16102 	  break;
16103 	case E_V16HFmode:
16104 	  cmp_mode = V16HImode;
16105 	  break;
16106 	case E_V32HFmode:
16107 	  cmp_mode = V32HImode;
16108 	  break;
16109 	default:
16110 	  gcc_unreachable ();
16111 	}
16112     }
16113 
16114   for (int i = 0; i != n_elts; i++)
16115     vec[i] = GEN_INT (i);
16116   constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16117   valv = gen_reg_rtx (mode);
16118   idxv = gen_reg_rtx (cmp_mode);
16119   idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16120 
16121   ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16122 					  mode, valv, val);
16123   gcc_assert (ok);
16124   ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16125 					  cmp_mode, idxv, idx_tmp);
16126   gcc_assert (ok);
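  /* Emit TARGET = (IDXV == CONSTV) ? VALV : TARGET; the operands are
     destination, true value, false value, the comparison and its two
     operands.  */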
16127   vec[0] = target;
16128   vec[1] = valv;
16129   vec[2] = target;
16130   vec[3] = gen_rtx_EQ (mode, idxv, constv);
16131   vec[4] = idxv;
16132   vec[5] = constv;
16133   ok = ix86_expand_int_vcond (vec);
16134   gcc_assert (ok);
16135 }
16136 
16137 void
16138 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16139 {
16140   machine_mode mode = GET_MODE (target);
16141   machine_mode inner_mode = GET_MODE_INNER (mode);
16142   machine_mode half_mode;
16143   bool use_vec_merge = false;
16144   bool blendm_const = false;
16145   rtx tmp;
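  /* In the tables below, row J selects the element type (QI, HI, SI, DI,
     SF, DF, HF) and column I selects the low or high 128-bit half of the
     256-bit vector.  */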
16146   static rtx (*gen_extract[7][2]) (rtx, rtx)
16147     = {
16148 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16149 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16150 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16151 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16152 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16153 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16154 	{ gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
16155       };
16156   static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
16157     = {
16158 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16159 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16160 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16161 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16162 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16163 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16164 	{ gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16165       };
16166   int i, j, n;
16167   machine_mode mmode = VOIDmode;
16168   rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16169 
16170   switch (mode)
16171     {
16172     case E_V2SImode:
16173       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16174       if (use_vec_merge)
16175 	break;
16176       /* FALLTHRU */
16177 
16178     case E_V2SFmode:
16179       if (mmx_ok)
16180 	{
16181 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16182 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16183 	  if (elt == 0)
16184 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16185 	  else
16186 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16187 	  emit_insn (gen_rtx_SET (target, tmp));
16188 	  return;
16189 	}
16190       break;
16191 
16192     case E_V2DImode:
16193       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16194       if (use_vec_merge)
16195 	break;
16196 
16197       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16198       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16199       if (elt == 0)
16200 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16201       else
16202 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16203       emit_insn (gen_rtx_SET (target, tmp));
16204       return;
16205 
16206     case E_V2DFmode:
16207       /* NB: For ELT == 0, use standard scalar operation patterns which
16208 	 preserve the rest of the vector for combiner:
16209 
16210 	 (vec_merge:V2DF
16211 	   (vec_duplicate:V2DF (reg:DF))
16212 	   (reg:V2DF)
16213 	   (const_int 1))
16214        */
16215       if (elt == 0)
16216 	goto do_vec_merge;
16217 
16218       {
16219 	rtx op0, op1;
16220 
16221 	/* For the two element vectors, we implement a VEC_CONCAT with
16222 	   the extraction of the other element.  */
16223 
16224 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16225 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16226 
16227 	if (elt == 0)
16228 	  op0 = val, op1 = tmp;
16229 	else
16230 	  op0 = tmp, op1 = val;
16231 
16232 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16233 	emit_insn (gen_rtx_SET (target, tmp));
16234       }
16235       return;
16236 
16237     case E_V4SFmode:
16238       use_vec_merge = TARGET_SSE4_1;
16239       if (use_vec_merge)
16240 	break;
16241 
16242       switch (elt)
16243 	{
16244 	case 0:
16245 	  use_vec_merge = true;
16246 	  break;
16247 
16248 	case 1:
16249 	  /* tmp = target = A B C D */
16250 	  tmp = copy_to_reg (target);
16251 	  /* target = A A B B */
16252 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16253 	  /* target = X A B B */
16254 	  ix86_expand_vector_set (false, target, val, 0);
16255 	  /* target = A X C D  */
16256 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16257 					  const1_rtx, const0_rtx,
16258 					  GEN_INT (2+4), GEN_INT (3+4)));
16259 	  return;
16260 
16261 	case 2:
16262 	  /* tmp = target = A B C D */
16263 	  tmp = copy_to_reg (target);
16264 	  /* tmp = X B C D */
16265 	  ix86_expand_vector_set (false, tmp, val, 0);
16266 	  /* target = A B X D */
16267 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16268 					  const0_rtx, const1_rtx,
16269 					  GEN_INT (0+4), GEN_INT (3+4)));
16270 	  return;
16271 
16272 	case 3:
16273 	  /* tmp = target = A B C D */
16274 	  tmp = copy_to_reg (target);
16275 	  /* tmp = X B C D */
16276 	  ix86_expand_vector_set (false, tmp, val, 0);
16277 	  /* target = A B C X */
16278 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16279 					  const0_rtx, const1_rtx,
16280 					  GEN_INT (2+4), GEN_INT (0+4)));
16281 	  return;
16282 
16283 	default:
16284 	  gcc_unreachable ();
16285 	}
16286       break;
16287 
16288     case E_V4SImode:
16289       use_vec_merge = TARGET_SSE4_1;
16290       if (use_vec_merge)
16291 	break;
16292 
16293       /* Element 0 handled by vec_merge below.  */
16294       if (elt == 0)
16295 	{
16296 	  use_vec_merge = true;
16297 	  break;
16298 	}
16299 
16300       if (TARGET_SSE2)
16301 	{
16302 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
16303 	     store into element 0, then shuffle them back.  */
16304 
16305 	  rtx order[4];
16306 
16307 	  order[0] = GEN_INT (elt);
16308 	  order[1] = const1_rtx;
16309 	  order[2] = const2_rtx;
16310 	  order[3] = GEN_INT (3);
16311 	  order[elt] = const0_rtx;
16312 
16313 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16314 					order[1], order[2], order[3]));
16315 
16316 	  ix86_expand_vector_set (false, target, val, 0);
16317 
16318 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16319 					order[1], order[2], order[3]));
16320 	}
16321       else
16322 	{
16323 	  /* For SSE1, we have to reuse the V4SF code.  */
16324 	  rtx t = gen_reg_rtx (V4SFmode);
16325 	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
16326 	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16327 	  emit_move_insn (target, gen_lowpart (mode, t));
16328 	}
16329       return;
16330 
16331     case E_V8HImode:
16332     case E_V8HFmode:
16333     case E_V2HImode:
16334       use_vec_merge = TARGET_SSE2;
16335       break;
16336     case E_V4HImode:
16337       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16338       break;
16339 
16340     case E_V16QImode:
16341     case E_V4QImode:
16342       use_vec_merge = TARGET_SSE4_1;
16343       break;
16344 
16345     case E_V8QImode:
16346       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16347       break;
16348 
16349     case E_V32QImode:
16350       half_mode = V16QImode;
16351       j = 0;
16352       n = 16;
16353       goto half;
16354 
16355     case E_V16HFmode:
16356       /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
16357       if (TARGET_AVX2 && elt != 0)
16358 	{
16359 	  mmode = SImode;
16360 	  gen_blendm = gen_avx2_pblendph_1;
16361 	  blendm_const = true;
16362 	  break;
16363 	}
16364       else
16365 	{
16366 	  half_mode = V8HFmode;
16367 	  j = 6;
16368 	  n = 8;
16369 	  goto half;
16370 	}
16371 
16372     case E_V16HImode:
16373       half_mode = V8HImode;
16374       j = 1;
16375       n = 8;
16376       goto half;
16377 
16378     case E_V8SImode:
16379       half_mode = V4SImode;
16380       j = 2;
16381       n = 4;
16382       goto half;
16383 
16384     case E_V4DImode:
16385       half_mode = V2DImode;
16386       j = 3;
16387       n = 2;
16388       goto half;
16389 
16390     case E_V8SFmode:
16391       half_mode = V4SFmode;
16392       j = 4;
16393       n = 4;
16394       goto half;
16395 
16396     case E_V4DFmode:
16397       half_mode = V2DFmode;
16398       j = 5;
16399       n = 2;
16400       goto half;
16401 
16402 half:
16403       /* Compute offset.  */
16404       i = elt / n;
16405       elt %= n;
16406 
16407       gcc_assert (i <= 1);
16408 
16409       /* Extract the half.  */
16410       tmp = gen_reg_rtx (half_mode);
16411       emit_insn (gen_extract[j][i] (tmp, target));
16412 
16413       /* Put val in tmp at elt.  */
16414       ix86_expand_vector_set (false, tmp, val, elt);
16415 
16416       /* Put it back.  */
16417       emit_insn (gen_insert[j][i] (target, target, tmp));
16418       return;
16419 
16420     case E_V8DFmode:
16421       if (TARGET_AVX512F)
16422 	{
16423 	  mmode = QImode;
16424 	  gen_blendm = gen_avx512f_blendmv8df;
16425 	}
16426       break;
16427 
16428     case E_V8DImode:
16429       if (TARGET_AVX512F)
16430 	{
16431 	  mmode = QImode;
16432 	  gen_blendm = gen_avx512f_blendmv8di;
16433 	}
16434       break;
16435 
16436     case E_V16SFmode:
16437       if (TARGET_AVX512F)
16438 	{
16439 	  mmode = HImode;
16440 	  gen_blendm = gen_avx512f_blendmv16sf;
16441 	}
16442       break;
16443 
16444     case E_V16SImode:
16445       if (TARGET_AVX512F)
16446 	{
16447 	  mmode = HImode;
16448 	  gen_blendm = gen_avx512f_blendmv16si;
16449 	}
16450       break;
16451 
16452     case E_V32HFmode:
16453       if (TARGET_AVX512BW)
16454 	{
16455 	  mmode = SImode;
16456 	  gen_blendm = gen_avx512bw_blendmv32hf;
16457 	}
16458       break;
16459     case E_V32HImode:
16460       if (TARGET_AVX512BW)
16461 	{
16462 	  mmode = SImode;
16463 	  gen_blendm = gen_avx512bw_blendmv32hi;
16464 	}
16465       else if (TARGET_AVX512F)
16466 	{
16467 	  half_mode = E_V8HImode;
16468 	  n = 8;
16469 	  goto quarter;
16470 	}
16471       break;
16472 
16473     case E_V64QImode:
16474       if (TARGET_AVX512BW)
16475 	{
16476 	  mmode = DImode;
16477 	  gen_blendm = gen_avx512bw_blendmv64qi;
16478 	}
16479       else if (TARGET_AVX512F)
16480 	{
16481 	  half_mode = E_V16QImode;
16482 	  n = 16;
16483 	  goto quarter;
16484 	}
16485       break;
16486 
16487 quarter:
16488       /* Compute offset.  */
16489       i = elt / n;
16490       elt %= n;
16491 
16492       gcc_assert (i <= 3);
16493 
16494       {
16495 	/* Extract the quarter.  */
16496 	tmp = gen_reg_rtx (V4SImode);
16497 	rtx tmp2 = gen_lowpart (V16SImode, target);
16498 	rtx mask = gen_reg_rtx (QImode);
16499 
16500 	emit_move_insn (mask, constm1_rtx);
16501 	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16502 						   tmp, mask));
16503 
16504 	tmp2 = gen_reg_rtx (half_mode);
16505 	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
16506 	tmp = tmp2;
16507 
16508 	/* Put val in tmp at elt.  */
16509 	ix86_expand_vector_set (false, tmp, val, elt);
16510 
16511 	/* Put it back.  */
16512 	tmp2 = gen_reg_rtx (V16SImode);
16513 	rtx tmp3 = gen_lowpart (V16SImode, target);
16514 	mask = gen_reg_rtx (HImode);
16515 	emit_move_insn (mask, constm1_rtx);
16516 	tmp = gen_lowpart (V4SImode, tmp);
16517 	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
16518 						  tmp3, mask));
16519 	emit_move_insn (target, gen_lowpart (mode, tmp2));
16520       }
16521       return;
16522 
16523     default:
16524       break;
16525     }
16526 
16527   if (mmode != VOIDmode)
16528     {
16529       tmp = gen_reg_rtx (mode);
16530       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
16531       rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
16532       /* The avx512*_blendm<mode> expanders have different operand order
16533 	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
16534 	 elements where the mask is set and second input operand otherwise,
16535 	 in {sse,avx}*_*blend* the first input operand is used for elements
16536 	 where the mask is clear and second input operand otherwise.  */
16537       if (!blendm_const)
16538 	merge_mask = force_reg (mmode, merge_mask);
16539       emit_insn (gen_blendm (target, target, tmp, merge_mask));
16540     }
16541   else if (use_vec_merge)
16542     {
16543 do_vec_merge:
16544       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
16545       tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
16546 			       GEN_INT (HOST_WIDE_INT_1U << elt));
16547       emit_insn (gen_rtx_SET (target, tmp));
16548     }
16549   else
16550     {
16551       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16552 
16553       emit_move_insn (mem, target);
16554 
16555       tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
16556       emit_move_insn (tmp, val);
16557 
16558       emit_move_insn (target, mem);
16559     }
16560 }
16561 
16562 void
16563 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
16564 {
16565   machine_mode mode = GET_MODE (vec);
16566   machine_mode inner_mode = GET_MODE_INNER (mode);
16567   bool use_vec_extr = false;
16568   rtx tmp;
16569 
16570   switch (mode)
16571     {
16572     case E_V2SImode:
16573       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16574       if (use_vec_extr)
16575 	break;
16576       /* FALLTHRU */
16577 
16578     case E_V2SFmode:
16579       if (!mmx_ok)
16580 	break;
16581       /* FALLTHRU */
16582 
16583     case E_V2DFmode:
16584     case E_V2DImode:
16585     case E_V2TImode:
16586     case E_V4TImode:
16587       use_vec_extr = true;
16588       break;
16589 
16590     case E_V4SFmode:
16591       use_vec_extr = TARGET_SSE4_1;
16592       if (use_vec_extr)
16593 	break;
16594 
16595       switch (elt)
16596 	{
16597 	case 0:
16598 	  tmp = vec;
16599 	  break;
16600 
16601 	case 1:
16602 	case 3:
16603 	  tmp = gen_reg_rtx (mode);
16604 	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
16605 				       GEN_INT (elt), GEN_INT (elt),
16606 				       GEN_INT (elt+4), GEN_INT (elt+4)));
16607 	  break;
16608 
16609 	case 2:
16610 	  tmp = gen_reg_rtx (mode);
16611 	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
16612 	  break;
16613 
16614 	default:
16615 	  gcc_unreachable ();
16616 	}
16617       vec = tmp;
16618       use_vec_extr = true;
16619       elt = 0;
16620       break;
16621 
16622     case E_V4SImode:
16623       use_vec_extr = TARGET_SSE4_1;
16624       if (use_vec_extr)
16625 	break;
16626 
16627       if (TARGET_SSE2)
16628 	{
16629 	  switch (elt)
16630 	    {
16631 	    case 0:
16632 	      tmp = vec;
16633 	      break;
16634 
16635 	    case 1:
16636 	    case 3:
16637 	      tmp = gen_reg_rtx (mode);
16638 	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
16639 					    GEN_INT (elt), GEN_INT (elt),
16640 					    GEN_INT (elt), GEN_INT (elt)));
16641 	      break;
16642 
16643 	    case 2:
16644 	      tmp = gen_reg_rtx (mode);
16645 	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
16646 	      break;
16647 
16648 	    default:
16649 	      gcc_unreachable ();
16650 	    }
16651 	  vec = tmp;
16652 	  use_vec_extr = true;
16653 	  elt = 0;
16654 	}
16655       else
16656 	{
16657 	  /* For SSE1, we have to reuse the V4SF code.  */
16658 	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
16659 				      gen_lowpart (V4SFmode, vec), elt);
16660 	  return;
16661 	}
16662       break;
16663 
16664     case E_V8HImode:
16665     case E_V8HFmode:
16666     case E_V2HImode:
16667       use_vec_extr = TARGET_SSE2;
16668       break;
16669     case E_V4HImode:
16670       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16671       break;
16672 
16673     case E_V16QImode:
16674       use_vec_extr = TARGET_SSE4_1;
16675       if (!use_vec_extr
16676 	  && TARGET_SSE2
16677 	  && elt == 0
16678 	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
16679 	{
16680 	  tmp = gen_reg_rtx (SImode);
16681 	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
16682 				      0);
16683 	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
16684 	  return;
16685 	}
16686       break;
16687     case E_V4QImode:
16688       use_vec_extr = TARGET_SSE4_1;
16689       break;
16690 
16691     case E_V8SFmode:
16692       if (TARGET_AVX)
16693 	{
16694 	  tmp = gen_reg_rtx (V4SFmode);
16695 	  if (elt < 4)
16696 	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
16697 	  else
16698 	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
16699 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
16700 	  return;
16701 	}
16702       break;
16703 
16704     case E_V4DFmode:
16705       if (TARGET_AVX)
16706 	{
16707 	  tmp = gen_reg_rtx (V2DFmode);
16708 	  if (elt < 2)
16709 	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
16710 	  else
16711 	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
16712 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
16713 	  return;
16714 	}
16715       break;
16716 
16717     case E_V32QImode:
16718       if (TARGET_AVX)
16719 	{
16720 	  tmp = gen_reg_rtx (V16QImode);
16721 	  if (elt < 16)
16722 	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
16723 	  else
16724 	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
16725 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
16726 	  return;
16727 	}
16728       break;
16729 
16730     case E_V16HImode:
16731       if (TARGET_AVX)
16732 	{
16733 	  tmp = gen_reg_rtx (V8HImode);
16734 	  if (elt < 8)
16735 	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
16736 	  else
16737 	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
16738 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
16739 	  return;
16740 	}
16741       break;
16742 
16743     case E_V8SImode:
16744       if (TARGET_AVX)
16745 	{
16746 	  tmp = gen_reg_rtx (V4SImode);
16747 	  if (elt < 4)
16748 	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
16749 	  else
16750 	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
16751 	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
16752 	  return;
16753 	}
16754       break;
16755 
16756     case E_V4DImode:
16757       if (TARGET_AVX)
16758 	{
16759 	  tmp = gen_reg_rtx (V2DImode);
16760 	  if (elt < 2)
16761 	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
16762 	  else
16763 	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
16764 	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
16765 	  return;
16766 	}
16767       break;
16768 
16769     case E_V32HImode:
16770       if (TARGET_AVX512BW)
16771 	{
16772 	  tmp = gen_reg_rtx (V16HImode);
16773 	  if (elt < 16)
16774 	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
16775 	  else
16776 	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
16777 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
16778 	  return;
16779 	}
16780       break;
16781 
16782     case E_V64QImode:
16783       if (TARGET_AVX512BW)
16784 	{
16785 	  tmp = gen_reg_rtx (V32QImode);
16786 	  if (elt < 32)
16787 	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
16788 	  else
16789 	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
16790 	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
16791 	  return;
16792 	}
16793       break;
16794 
16795     case E_V16SFmode:
16796       tmp = gen_reg_rtx (V8SFmode);
16797       if (elt < 8)
16798 	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
16799       else
16800 	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
16801       ix86_expand_vector_extract (false, target, tmp, elt & 7);
16802       return;
16803 
16804     case E_V8DFmode:
16805       tmp = gen_reg_rtx (V4DFmode);
16806       if (elt < 4)
16807 	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
16808       else
16809 	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
16810       ix86_expand_vector_extract (false, target, tmp, elt & 3);
16811       return;
16812 
16813     case E_V16SImode:
16814       tmp = gen_reg_rtx (V8SImode);
16815       if (elt < 8)
16816 	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
16817       else
16818 	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
16819       ix86_expand_vector_extract (false, target, tmp, elt & 7);
16820       return;
16821 
16822     case E_V8DImode:
16823       tmp = gen_reg_rtx (V4DImode);
16824       if (elt < 4)
16825 	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
16826       else
16827 	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
16828       ix86_expand_vector_extract (false, target, tmp, elt & 3);
16829       return;
16830 
16831     case E_V32HFmode:
16832       if (TARGET_AVX512BW)
16833 	{
16834 	  tmp = gen_reg_rtx (V16HFmode);
16835 	  if (elt < 16)
16836 	    emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
16837 	  else
16838 	    emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
16839 	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
16840 	  return;
16841 	}
16842       break;
16843 
16844     case E_V16HFmode:
16845       if (TARGET_AVX)
16846 	{
16847 	  tmp = gen_reg_rtx (V8HFmode);
16848 	  if (elt < 8)
16849 	    emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
16850 	  else
16851 	    emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
16852 	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
16853 	  return;
16854 	}
16855       break;
16856 
16857     case E_V8QImode:
16858       use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16859       /* ??? Could extract the appropriate HImode element and shift.  */
16860       break;
16861 
16862     default:
16863       break;
16864     }
16865 
16866   if (use_vec_extr)
16867     {
16868       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
16869       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
16870 
16871       /* Let the rtl optimizers know about the zero extension performed.  */
16872       if (inner_mode == QImode || inner_mode == HImode)
16873 	{
16874 	  rtx reg = gen_reg_rtx (SImode);
16875 	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
16876 	  emit_move_insn (reg, tmp);
16877 	  tmp = gen_lowpart (inner_mode, reg);
16878 	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
16879 	  SUBREG_PROMOTED_SET (tmp, 1);
16880 	}
16881 
16882       emit_move_insn (target, tmp);
16883     }
16884   else
16885     {
16886       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16887 
16888       emit_move_insn (mem, vec);
16889 
16890       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
16891       emit_move_insn (target, tmp);
16892     }
16893 }
16894 
16895 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
16896    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
16897    The upper bits of DEST are undefined, though they shouldn't cause
16898    exceptions (some bits from src or all zeros are ok).  */
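/* I is the width in bits of the part of SRC still being reduced; the copy
   is implemented as a logical right shift by I / 2 bits, or as a suitable
   cross-lane shuffle/permute when I spans 128-bit lanes.  */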
16899 
16900 static void
16901 emit_reduc_half (rtx dest, rtx src, int i)
16902 {
16903   rtx tem, d = dest;
16904   switch (GET_MODE (src))
16905     {
16906     case E_V4SFmode:
16907       if (i == 128)
16908 	tem = gen_sse_movhlps (dest, src, src);
16909       else
16910 	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
16911 				   GEN_INT (1 + 4), GEN_INT (1 + 4));
16912       break;
16913     case E_V2DFmode:
16914       tem = gen_vec_interleave_highv2df (dest, src, src);
16915       break;
16916     case E_V4QImode:
16917       d = gen_reg_rtx (V1SImode);
16918       tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
16919 			       GEN_INT (i / 2));
16920       break;
16921     case E_V4HImode:
16922       d = gen_reg_rtx (V1DImode);
16923       tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
16924 			       GEN_INT (i / 2));
16925       break;
16926     case E_V16QImode:
16927     case E_V8HImode:
16928     case E_V8HFmode:
16929     case E_V4SImode:
16930     case E_V2DImode:
16931       d = gen_reg_rtx (V1TImode);
16932       tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
16933 				GEN_INT (i / 2));
16934       break;
16935     case E_V8SFmode:
16936       if (i == 256)
16937 	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
16938       else
16939 	tem = gen_avx_shufps256 (dest, src, src,
16940 				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
16941       break;
16942     case E_V4DFmode:
16943       if (i == 256)
16944 	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
16945       else
16946 	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
16947       break;
16948     case E_V32QImode:
16949     case E_V16HImode:
16950     case E_V16HFmode:
16951     case E_V8SImode:
16952     case E_V4DImode:
16953       if (i == 256)
16954 	{
16955 	  if (GET_MODE (dest) != V4DImode)
16956 	    d = gen_reg_rtx (V4DImode);
16957 	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
16958 				   gen_lowpart (V4DImode, src),
16959 				   const1_rtx);
16960 	}
16961       else
16962 	{
16963 	  d = gen_reg_rtx (V2TImode);
16964 	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
16965 				    GEN_INT (i / 2));
16966 	}
16967       break;
16968     case E_V64QImode:
16969     case E_V32HImode:
16970     case E_V32HFmode:
16971       if (i < 64)
16972 	{
16973 	  d = gen_reg_rtx (V4TImode);
16974 	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
16975 					GEN_INT (i / 2));
16976 	  break;
16977 	}
16978       /* FALLTHRU */
16979     case E_V16SImode:
16980     case E_V16SFmode:
16981     case E_V8DImode:
16982     case E_V8DFmode:
16983       if (i > 128)
16984 	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
16985 					gen_lowpart (V16SImode, src),
16986 					gen_lowpart (V16SImode, src),
16987 					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
16988 					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
16989 					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
16990 					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
16991 					GEN_INT (0xC), GEN_INT (0xD),
16992 					GEN_INT (0xE), GEN_INT (0xF),
16993 					GEN_INT (0x10), GEN_INT (0x11),
16994 					GEN_INT (0x12), GEN_INT (0x13),
16995 					GEN_INT (0x14), GEN_INT (0x15),
16996 					GEN_INT (0x16), GEN_INT (0x17));
16997       else
16998 	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
16999 				    gen_lowpart (V16SImode, src),
17000 				    GEN_INT (i == 128 ? 0x2 : 0x1),
17001 				    GEN_INT (0x3),
17002 				    GEN_INT (0x3),
17003 				    GEN_INT (0x3),
17004 				    GEN_INT (i == 128 ? 0x6 : 0x5),
17005 				    GEN_INT (0x7),
17006 				    GEN_INT (0x7),
17007 				    GEN_INT (0x7),
17008 				    GEN_INT (i == 128 ? 0xA : 0x9),
17009 				    GEN_INT (0xB),
17010 				    GEN_INT (0xB),
17011 				    GEN_INT (0xB),
17012 				    GEN_INT (i == 128 ? 0xE : 0xD),
17013 				    GEN_INT (0xF),
17014 				    GEN_INT (0xF),
17015 				    GEN_INT (0xF));
17016       break;
17017     default:
17018       gcc_unreachable ();
17019     }
17020   emit_insn (tem);
17021   if (d != dest)
17022     emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17023 }
17024 
17025 /* Expand a vector reduction.  FN is the binary pattern to reduce;
17026    DEST is the destination; IN is the input vector.  */
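/* The loop below repeatedly folds the upper half of the still-live part of
   the vector onto its lower half with FN, halving the width each step, so
   element 0 of DEST ends up holding the reduction of all elements (the
   remaining lanes are don't-care).  */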
17027 
17028 void
17029 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17030 {
17031   rtx half, dst, vec = in;
17032   machine_mode mode = GET_MODE (in);
17033   int i;
17034 
17035   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
17036   if (TARGET_SSE4_1
17037       && mode == V8HImode
17038       && fn == gen_uminv8hi3)
17039     {
17040       emit_insn (gen_sse4_1_phminposuw (dest, in));
17041       return;
17042     }
17043 
17044   for (i = GET_MODE_BITSIZE (mode);
17045        i > GET_MODE_UNIT_BITSIZE (mode);
17046        i >>= 1)
17047     {
17048       half = gen_reg_rtx (mode);
17049       emit_reduc_half (half, vec, i);
17050       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17051 	dst = dest;
17052       else
17053 	dst = gen_reg_rtx (mode);
17054       emit_insn (fn (dst, half, vec));
17055       vec = dst;
17056     }
17057 }
17058 
17059 /* Output code to perform a conditional jump to LABEL, if C2 flag in
17060    FP status register is set.  */
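/* Implementation note: fnstsw leaves the FPU status word in %ax, so C2
   (bit 10 of the status word) is bit 2 of %ah and can be tested with the
   mask 0x04; when SAHF is used instead, loading %ah into EFLAGS maps C2
   onto PF, which is what the UNORDERED test below checks.  */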
17061 
17062 void
17063 ix86_emit_fp_unordered_jump (rtx label)
17064 {
17065   rtx reg = gen_reg_rtx (HImode);
17066   rtx_insn *insn;
17067   rtx temp;
17068 
17069   emit_insn (gen_x86_fnstsw_1 (reg));
17070 
17071   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17072     {
17073       emit_insn (gen_x86_sahf_1 (reg));
17074 
17075       temp = gen_rtx_REG (CCmode, FLAGS_REG);
17076       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17077     }
17078   else
17079     {
17080       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17081 
17082       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17083       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17084     }
17085 
17086   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17087 			      gen_rtx_LABEL_REF (VOIDmode, label),
17088 			      pc_rtx);
17089   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17090   predict_jump (REG_BR_PROB_BASE * 10 / 100);
17091   JUMP_LABEL (insn) = label;
17092 }
17093 
17094 /* Output code to perform a sinh XFmode calculation.  */
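/* The expansion relies on the identity (with e1 = expm1 (|x|)):
     sinh (x) = sign (x) * 0.5 * (e1 + e1 / (e1 + 1.0))
   which avoids the cancellation of 0.5 * (exp (x) - exp (-x)) for
   small |x|.  */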
17095 
17096 void
17097 ix86_emit_i387_sinh (rtx op0, rtx op1)
17098 {
17099   rtx e1 = gen_reg_rtx (XFmode);
17100   rtx e2 = gen_reg_rtx (XFmode);
17101   rtx scratch = gen_reg_rtx (HImode);
17102   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17103   rtx half = const_double_from_real_value (dconsthalf, XFmode);
17104   rtx cst1, tmp;
17105   rtx_code_label *jump_label = gen_label_rtx ();
17106   rtx_insn *insn;
17107 
17108   /* scratch = fxam (op1) */
17109   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17110 
17111   /* e1 = expm1 (|op1|) */
17112   emit_insn (gen_absxf2 (e2, op1));
17113   emit_insn (gen_expm1xf2 (e1, e2));
17114 
17115   /* e2 = e1 / (e1 + 1.0) + e1 */
17116   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17117   emit_insn (gen_addxf3 (e2, e1, cst1));
17118   emit_insn (gen_divxf3 (e2, e1, e2));
17119   emit_insn (gen_addxf3 (e2, e2, e1));
17120 
17121   /* flags = signbit (op1) */
17122   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17123 
17124   /* if (flags) then e2 = -e2 */
17125   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17126 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17127 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17128 			      pc_rtx);
17129   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17130   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17131   JUMP_LABEL (insn) = jump_label;
17132 
17133   emit_insn (gen_negxf2 (e2, e2));
17134 
17135   emit_label (jump_label);
17136   LABEL_NUSES (jump_label) = 1;
17137 
17138   /* op0 = 0.5 * e2 */
17139   half = force_reg (XFmode, half);
17140   emit_insn (gen_mulxf3 (op0, e2, half));
17141 }
17142 
17143 /* Output code to perform a cosh XFmode calculation.  */
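/* Uses cosh (x) = 0.5 * (e + 1.0 / e) with e = exp (x); no sign handling
   is needed since cosh is an even function.  */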
17144 
17145 void
17146 ix86_emit_i387_cosh (rtx op0, rtx op1)
17147 {
17148   rtx e1 = gen_reg_rtx (XFmode);
17149   rtx e2 = gen_reg_rtx (XFmode);
17150   rtx half = const_double_from_real_value (dconsthalf, XFmode);
17151   rtx cst1;
17152 
17153   /* e1 = exp (op1) */
17154   emit_insn (gen_expxf2 (e1, op1));
17155 
17156   /* e2 = e1 + 1.0 / e1 */
17157   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17158   emit_insn (gen_divxf3 (e2, cst1, e1));
17159   emit_insn (gen_addxf3 (e2, e1, e2));
17160 
17161   /* op0 = 0.5 * e2 */
17162   half = force_reg (XFmode, half);
17163   emit_insn (gen_mulxf3 (op0, e2, half));
17164 }
17165 
17166 /* Output code to perform a tanh XFmode calculation.  */
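/* The expansion relies on the identity (with e1 = expm1 (-|2 * x|)):
     tanh (x) = -sign (x) * e1 / (e1 + 2.0)  */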
17167 
17168 void
17169 ix86_emit_i387_tanh (rtx op0, rtx op1)
17170 {
17171   rtx e1 = gen_reg_rtx (XFmode);
17172   rtx e2 = gen_reg_rtx (XFmode);
17173   rtx scratch = gen_reg_rtx (HImode);
17174   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17175   rtx cst2, tmp;
17176   rtx_code_label *jump_label = gen_label_rtx ();
17177   rtx_insn *insn;
17178 
17179   /* scratch = fxam (op1) */
17180   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17181 
17182   /* e1 = expm1 (-|2 * op1|) */
17183   emit_insn (gen_addxf3 (e2, op1, op1));
17184   emit_insn (gen_absxf2 (e2, e2));
17185   emit_insn (gen_negxf2 (e2, e2));
17186   emit_insn (gen_expm1xf2 (e1, e2));
17187 
17188   /* e2 = e1 / (e1 + 2.0) */
17189   cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17190   emit_insn (gen_addxf3 (e2, e1, cst2));
17191   emit_insn (gen_divxf3 (e2, e1, e2));
17192 
17193   /* flags = signbit (op1) */
17194   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17195 
17196   /* if (!flags) then e2 = -e2 */
17197   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17198 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
17199 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17200 			      pc_rtx);
17201   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17202   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17203   JUMP_LABEL (insn) = jump_label;
17204 
17205   emit_insn (gen_negxf2 (e2, e2));
17206 
17207   emit_label (jump_label);
17208   LABEL_NUSES (jump_label) = 1;
17209 
17210   emit_move_insn (op0, e2);
17211 }
17212 
17213 /* Output code to perform an asinh XFmode calculation.  */
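/* The expansion relies on the identity
     asinh (x) = sign (x) * log1p (|x| + x*x / (sqrt (x*x + 1.0) + 1.0))
   obtained from asinh (x) = log (x + sqrt (x*x + 1.0)) by rewriting
   sqrt (x*x + 1.0) - 1.0 as x*x / (sqrt (x*x + 1.0) + 1.0).  */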
17214 
17215 void
17216 ix86_emit_i387_asinh (rtx op0, rtx op1)
17217 {
17218   rtx e1 = gen_reg_rtx (XFmode);
17219   rtx e2 = gen_reg_rtx (XFmode);
17220   rtx scratch = gen_reg_rtx (HImode);
17221   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17222   rtx cst1, tmp;
17223   rtx_code_label *jump_label = gen_label_rtx ();
17224   rtx_insn *insn;
17225 
17226   /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17227   emit_insn (gen_mulxf3 (e1, op1, op1));
17228   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17229   emit_insn (gen_addxf3 (e2, e1, cst1));
17230   emit_insn (gen_sqrtxf2 (e2, e2));
17231   emit_insn (gen_addxf3 (e2, e2, cst1));
17232 
17233   /* e1 = e1 / e2 */
17234   emit_insn (gen_divxf3 (e1, e1, e2));
17235 
17236   /* scratch = fxam (op1) */
17237   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17238 
17239   /* e1 = e1 + |op1| */
17240   emit_insn (gen_absxf2 (e2, op1));
17241   emit_insn (gen_addxf3 (e1, e1, e2));
17242 
17243   /* e2 = log1p (e1) */
17244   ix86_emit_i387_log1p (e2, e1);
17245 
17246   /* flags = signbit (op1) */
17247   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17248 
17249   /* if (flags) then e2 = -e2 */
17250   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17251 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17252 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17253 			      pc_rtx);
17254   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17255   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17256   JUMP_LABEL (insn) = jump_label;
17257 
17258   emit_insn (gen_negxf2 (e2, e2));
17259 
17260   emit_label (jump_label);
17261   LABEL_NUSES (jump_label) = 1;
17262 
17263   emit_move_insn (op0, e2);
17264 }
17265 
17266 /* Output code to perform an acosh XFmode calculation.  */
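/* Uses acosh (x) = log (x + sqrt (x - 1.0) * sqrt (x + 1.0)), which is
   equivalent to log (x + sqrt (x*x - 1.0)) on the domain x >= 1.0.  */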
17267 
17268 void
17269 ix86_emit_i387_acosh (rtx op0, rtx op1)
17270 {
17271   rtx e1 = gen_reg_rtx (XFmode);
17272   rtx e2 = gen_reg_rtx (XFmode);
17273   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17274 
17275   /* e2 = sqrt (op1 + 1.0) */
17276   emit_insn (gen_addxf3 (e2, op1, cst1));
17277   emit_insn (gen_sqrtxf2 (e2, e2));
17278 
17279   /* e1 = sqrt (op1 - 1.0) */
17280   emit_insn (gen_subxf3 (e1, op1, cst1));
17281   emit_insn (gen_sqrtxf2 (e1, e1));
17282 
17283   /* e1 = e1 * e2 */
17284   emit_insn (gen_mulxf3 (e1, e1, e2));
17285 
17286   /* e1 = e1 + op1 */
17287   emit_insn (gen_addxf3 (e1, e1, op1));
17288 
17289   /* op0 = log (e1) */
17290   emit_insn (gen_logxf2 (op0, e1));
17291 }
17292 
17293 /* Output code to perform an atanh XFmode calculation.  */
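/* The expansion relies on the identity (with e = |x|):
     atanh (x) = sign (x) * 0.5 * log1p (2.0 * e / (1.0 - e))
   computed below as 0.5 * -log1p (-2.0 * e / (e + 1.0)) with the sign
   fixed up afterwards.  */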
17294 
17295 void
17296 ix86_emit_i387_atanh (rtx op0, rtx op1)
17297 {
17298   rtx e1 = gen_reg_rtx (XFmode);
17299   rtx e2 = gen_reg_rtx (XFmode);
17300   rtx scratch = gen_reg_rtx (HImode);
17301   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17302   rtx half = const_double_from_real_value (dconsthalf, XFmode);
17303   rtx cst1, tmp;
17304   rtx_code_label *jump_label = gen_label_rtx ();
17305   rtx_insn *insn;
17306 
17307   /* scratch = fxam (op1) */
17308   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17309 
17310   /* e2 = |op1| */
17311   emit_insn (gen_absxf2 (e2, op1));
17312 
17313   /* e1 = -(e2 + e2) / (e2 + 1.0) */
17314   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17315   emit_insn (gen_addxf3 (e1, e2, cst1));
17316   emit_insn (gen_addxf3 (e2, e2, e2));
17317   emit_insn (gen_negxf2 (e2, e2));
17318   emit_insn (gen_divxf3 (e1, e2, e1));
17319 
17320   /* e2 = log1p (e1) */
17321   ix86_emit_i387_log1p (e2, e1);
17322 
17323   /* flags = signbit (op1) */
17324   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17325 
17326   /* if (!flags) then e2 = -e2 */
17327   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17328 			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
17329 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17330 			      pc_rtx);
17331   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17332   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17333   JUMP_LABEL (insn) = jump_label;
17334 
17335   emit_insn (gen_negxf2 (e2, e2));
17336 
17337   emit_label (jump_label);
17338   LABEL_NUSES (jump_label) = 1;
17339 
17340   /* op0 = 0.5 * e2 */
17341   half = force_reg (XFmode, half);
17342   emit_insn (gen_mulxf3 (op0, e2, half));
17343 }
17344 
17345 /* Output code to perform a log1p XFmode calculation.  */
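/* Strategy: the fyl2xp1 instruction is only accurate (and specified) for
   |x| < 1 - sqrt(2)/2 ~= 0.29289, so below that threshold the result is
   computed directly as ln(2) * log2 (x + 1.0) via fyl2xp1; for larger |x|
   the explicit addition 1.0 + x is harmless and plain fyl2x is used.  */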
17346 
17347 void
17348 ix86_emit_i387_log1p (rtx op0, rtx op1)
17349 {
17350   rtx_code_label *label1 = gen_label_rtx ();
17351   rtx_code_label *label2 = gen_label_rtx ();
17352 
17353   rtx tmp = gen_reg_rtx (XFmode);
17354   rtx res = gen_reg_rtx (XFmode);
17355   rtx cst, cstln2, cst1;
17356   rtx_insn *insn;
17357 
17358   /* The emit_jump call emits any pending stack adjustment; make sure it is
17359      emitted before the conditional jump, otherwise the stack adjustment
17360      would only be performed conditionally.  */
17361   do_pending_stack_adjust ();
17362 
17363   cst = const_double_from_real_value
17364     (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17365   cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17366 
17367   emit_insn (gen_absxf2 (tmp, op1));
17368 
17369   cst = force_reg (XFmode, cst);
17370   ix86_expand_branch (GE, tmp, cst, label1);
17371   predict_jump (REG_BR_PROB_BASE * 10 / 100);
17372   insn = get_last_insn ();
17373   JUMP_LABEL (insn) = label1;
17374 
17375   emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17376   emit_jump (label2);
17377 
17378   emit_label (label1);
17379   LABEL_NUSES (label1) = 1;
17380 
17381   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17382   emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17383   emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17384 
17385   emit_label (label2);
17386   LABEL_NUSES (label2) = 1;
17387 
17388   emit_move_insn (op0, res);
17389 }
17390 
17391 /* Emit code for round calculation.  */
17392 void
17393 ix86_emit_i387_round (rtx op0, rtx op1)
17394 {
17395   machine_mode inmode = GET_MODE (op1);
17396   machine_mode outmode = GET_MODE (op0);
17397   rtx e1 = gen_reg_rtx (XFmode);
17398   rtx e2 = gen_reg_rtx (XFmode);
17399   rtx scratch = gen_reg_rtx (HImode);
17400   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17401   rtx half = const_double_from_real_value (dconsthalf, XFmode);
17402   rtx res = gen_reg_rtx (outmode);
17403   rtx_code_label *jump_label = gen_label_rtx ();
17404   rtx (*floor_insn) (rtx, rtx);
17405   rtx (*neg_insn) (rtx, rtx);
17406   rtx_insn *insn;
17407   rtx tmp;
17408 
17409   switch (inmode)
17410     {
17411     case E_SFmode:
17412     case E_DFmode:
17413       tmp = gen_reg_rtx (XFmode);
17414 
17415       emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17416       op1 = tmp;
17417       break;
17418     case E_XFmode:
17419       break;
17420     default:
17421       gcc_unreachable ();
17422     }
17423 
17424   switch (outmode)
17425     {
17426     case E_SFmode:
17427       floor_insn = gen_frndintxf2_floor;
17428       neg_insn = gen_negsf2;
17429       break;
17430     case E_DFmode:
17431       floor_insn = gen_frndintxf2_floor;
17432       neg_insn = gen_negdf2;
17433       break;
17434     case E_XFmode:
17435       floor_insn = gen_frndintxf2_floor;
17436       neg_insn = gen_negxf2;
17437       break;
17438     case E_HImode:
17439       floor_insn = gen_lfloorxfhi2;
17440       neg_insn = gen_neghi2;
17441       break;
17442     case E_SImode:
17443       floor_insn = gen_lfloorxfsi2;
17444       neg_insn = gen_negsi2;
17445       break;
17446     case E_DImode:
17447       floor_insn = gen_lfloorxfdi2;
17448       neg_insn = gen_negdi2;
17449       break;
17450     default:
17451       gcc_unreachable ();
17452     }
17453 
17454   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
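  /* This rounds halfway cases away from zero, which is the semantics
     required for round (as opposed to rint/nearbyint).  */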
17455 
17456   /* scratch = fxam(op1) */
17457   emit_insn (gen_fxamxf2_i387 (scratch, op1));
17458 
17459   /* e1 = fabs(op1) */
17460   emit_insn (gen_absxf2 (e1, op1));
17461 
17462   /* e2 = e1 + 0.5 */
17463   half = force_reg (XFmode, half);
17464   emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17465 
17466   /* res = floor(e2) */
17467   switch (outmode)
17468     {
17469     case E_SFmode:
17470     case E_DFmode:
17471       {
17472 	tmp = gen_reg_rtx (XFmode);
17473 
17474 	emit_insn (floor_insn (tmp, e2));
17475 	emit_insn (gen_rtx_SET (res,
17476 				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17477 						UNSPEC_TRUNC_NOOP)));
17478       }
17479       break;
17480     default:
17481       emit_insn (floor_insn (res, e2));
17482     }
17483 
17484   /* flags = signbit(a) */
17485   emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17486 
17487   /* if (flags) then res = -res */
17488   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17489 			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17490 			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
17491 			      pc_rtx);
17492   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17493   predict_jump (REG_BR_PROB_BASE * 50 / 100);
17494   JUMP_LABEL (insn) = jump_label;
17495 
17496   emit_insn (neg_insn (res, res));
17497 
17498   emit_label (jump_label);
17499   LABEL_NUSES (jump_label) = 1;
17500 
17501   emit_move_insn (op0, res);
17502 }
17503 
17504 /* Output code to perform a Newton-Raphson approximation of a single precision
17505    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
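/* This performs one Newton-Raphson refinement step for the reciprocal,
   x1 = x0 * (2.0 - b * x0) = (x0 + x0) - b * x0 * x0, applied to the
   hardware rcp estimate x0 and then multiplied by a.  */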
17506 
17507 void
17508 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
17509 {
17510   rtx x0, x1, e0, e1;
17511 
17512   x0 = gen_reg_rtx (mode);
17513   e0 = gen_reg_rtx (mode);
17514   e1 = gen_reg_rtx (mode);
17515   x1 = gen_reg_rtx (mode);
17516 
17517   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17518 
17519   b = force_reg (mode, b);
17520 
17521   /* x0 = rcp(b) estimate */
17522   if (mode == V16SFmode || mode == V8DFmode)
17523     {
17524       if (TARGET_AVX512ER)
17525 	{
17526 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17527 						      UNSPEC_RCP28)));
17528 	  /* res = a * x0 */
17529 	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
17530 	  return;
17531 	}
17532       else
17533 	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17534 						    UNSPEC_RCP14)));
17535     }
17536   else
17537     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17538 						UNSPEC_RCP)));
17539 
17540   /* e0 = x0 * b */
17541   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
17542 
17543   /* e0 = x0 * e0 */
17544   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
17545 
17546   /* e1 = x0 + x0 */
17547   emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
17548 
17549   /* x1 = e1 - e0 */
17550   emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
17551 
17552   /* res = a * x1 */
17553   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
17554 }
17555 
17556 /* Output code to perform a Newton-Raphson approximation of a
17557    single precision floating point [reciprocal] square root.  */
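/* This performs one Newton-Raphson step for rsqrt,
   x1 = 0.5 * x0 * (3.0 - a * x0 * x0), written below as
   -0.5 * x0 * (a * x0 * x0 - 3.0); for the sqrt variant the factor x0 is
   replaced by a * x0, using sqrt (a) = a * rsqrt (a).  */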
17558 
17559 void
17560 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
17561 {
17562   rtx x0, e0, e1, e2, e3, mthree, mhalf;
17563   REAL_VALUE_TYPE r;
17564   int unspec;
17565 
17566   x0 = gen_reg_rtx (mode);
17567   e0 = gen_reg_rtx (mode);
17568   e1 = gen_reg_rtx (mode);
17569   e2 = gen_reg_rtx (mode);
17570   e3 = gen_reg_rtx (mode);
17571 
17572   if (TARGET_AVX512ER && mode == V16SFmode)
17573     {
17574       if (recip)
17575 	/* res = rsqrt28(a) estimate */
17576 	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17577 						     UNSPEC_RSQRT28)));
17578       else
17579 	{
17580 	  /* x0 = rsqrt28(a) estimate */
17581 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17582 						      UNSPEC_RSQRT28)));
17583 	  /* res = rcp28(x0) estimate */
17584 	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
17585 						       UNSPEC_RCP28)));
17586 	}
17587       return;
17588     }
17589 
17590   real_from_integer (&r, VOIDmode, -3, SIGNED);
17591   mthree = const_double_from_real_value (r, SFmode);
17592 
17593   real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
17594   mhalf = const_double_from_real_value (r, SFmode);
17595   unspec = UNSPEC_RSQRT;
17596 
17597   if (VECTOR_MODE_P (mode))
17598     {
17599       mthree = ix86_build_const_vector (mode, true, mthree);
17600       mhalf = ix86_build_const_vector (mode, true, mhalf);
17601       /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
17602       if (GET_MODE_SIZE (mode) == 64)
17603 	unspec = UNSPEC_RSQRT14;
17604     }
17605 
17606   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
17607      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
17608 
17609   a = force_reg (mode, a);
17610 
17611   /* x0 = rsqrt(a) estimate */
17612   emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17613 					      unspec)));
17614 
17615   /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN result for sqrt (0.0).  */
17616   if (!recip)
17617     {
17618       rtx zero = force_reg (mode, CONST0_RTX(mode));
17619       rtx mask;
17620 
17621       /* Handle masked compare.  */
17622       if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
17623 	{
17624 	  mask = gen_reg_rtx (HImode);
17625 	  /* Imm value 0x4 corresponds to not-equal comparison.  */
17626 	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
17627 	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
17628 	}
17629       else
17630 	{
17631 	  mask = gen_reg_rtx (mode);
17632 	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
17633 	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
17634 	}
17635     }
17636 
17637   mthree = force_reg (mode, mthree);
17638 
17639   /* e0 = x0 * a */
17640   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
17641 
17642   unsigned vector_size = GET_MODE_SIZE (mode);
17643   if (TARGET_FMA
17644       || (TARGET_AVX512F && vector_size == 64)
17645       || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
17646     emit_insn (gen_rtx_SET (e2,
17647 			    gen_rtx_FMA (mode, e0, x0, mthree)));
17648   else
17649     {
17650       /* e1 = e0 * x0 */
17651       emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
17652 
17653       /* e2 = e1 - 3. */
17654       emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
17655     }
17656 
17657   mhalf = force_reg (mode, mhalf);
17658   if (recip)
17659     /* e3 = -.5 * x0 */
17660     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
17661   else
17662     /* e3 = -.5 * e0 */
17663     emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
17664   /* ret = e2 * e3 */
17665   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
17666 }
17667 
17668 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
17669    mask for masking out the sign-bit is stored in *SMASK, if that is
17670    non-null.  */
17671 
17672 static rtx
17673 ix86_expand_sse_fabs (rtx op0, rtx *smask)
17674 {
17675   machine_mode vmode, mode = GET_MODE (op0);
17676   rtx xa, mask;
17677 
17678   xa = gen_reg_rtx (mode);
17679   if (mode == SFmode)
17680     vmode = V4SFmode;
17681   else if (mode == DFmode)
17682     vmode = V2DFmode;
17683   else
17684     vmode = mode;
17685   mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
17686   if (!VECTOR_MODE_P (mode))
17687     {
17688       /* We need to generate a scalar mode mask in this case.  */
17689       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17690       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17691       mask = gen_reg_rtx (mode);
17692       emit_insn (gen_rtx_SET (mask, tmp));
17693     }
17694   emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
17695 
17696   if (smask)
17697     *smask = mask;
17698 
17699   return xa;
17700 }
17701 
17702 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
17703    swapping the operands if SWAP_OPERANDS is true.  The expanded
17704    code is a forward jump to a newly created label in case the
17705    comparison is true.  The generated label rtx is returned.  */
17706 static rtx_code_label *
17707 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
17708                                   bool swap_operands)
17709 {
17710   bool unordered_compare = ix86_unordered_fp_compare (code);
17711   rtx_code_label *label;
17712   rtx tmp, reg;
17713 
17714   if (swap_operands)
17715     std::swap (op0, op1);
17716 
17717   label = gen_label_rtx ();
17718   tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
17719   if (unordered_compare)
17720     tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
17721   reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
17722   emit_insn (gen_rtx_SET (reg, tmp));
17723   tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
17724   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
17725 			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
17726   tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17727   JUMP_LABEL (tmp) = label;
17728 
17729   return label;
17730 }
17731 
17732 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
17733    using comparison code CODE.  Operands are swapped for the comparison if
17734    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
17735 static rtx
17736 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
17737 			      bool swap_operands)
17738 {
17739   rtx (*insn)(rtx, rtx, rtx, rtx);
17740   machine_mode mode = GET_MODE (op0);
17741   rtx mask = gen_reg_rtx (mode);
17742 
17743   if (swap_operands)
17744     std::swap (op0, op1);
17745 
17746   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
17747 
17748   emit_insn (insn (mask, op0, op1,
17749 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
17750   return mask;
17751 }
17752 
17753 /* Expand copysign from SIGN to the positive value ABS_VALUE
17754    storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
17755    the sign-bit.  */
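/* I.e. RESULT = ABS_VALUE | (SIGN & M), where M selects only the sign bit;
   when the caller passes the complement mask produced by
   ix86_expand_sse_fabs, it is inverted first to recover M.  */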
17756 
17757 static void
17758 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
17759 {
17760   machine_mode mode = GET_MODE (sign);
17761   rtx sgn = gen_reg_rtx (mode);
17762   if (mask == NULL_RTX)
17763     {
17764       machine_mode vmode;
17765 
17766       if (mode == SFmode)
17767 	vmode = V4SFmode;
17768       else if (mode == DFmode)
17769 	vmode = V2DFmode;
17770       else
17771 	vmode = mode;
17772 
17773       mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
17774       if (!VECTOR_MODE_P (mode))
17775 	{
17776 	  /* We need to generate a scalar mode mask in this case.  */
17777 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17778 	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17779 	  mask = gen_reg_rtx (mode);
17780 	  emit_insn (gen_rtx_SET (mask, tmp));
17781 	}
17782     }
17783   else
17784     mask = gen_rtx_NOT (mode, mask);
17785   emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
17786   emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
17787 }
17788 
17789 /* Expand SSE sequence for computing lround from OP1 storing
17790    into OP0.  */
17791 
17792 void
17793 ix86_expand_lround (rtx op0, rtx op1)
17794 {
17795   /* C code for the stuff we're doing below:
17796 	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
17797 	return (long)tmp;
17798    */
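  /* Presumably nextafter (0.5, 0.0) rather than 0.5 is used so that inputs
     just below a halfway point are not pushed past it by rounding of the
     addition, while exact halfway cases still round away from zero as
     lround requires.  */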
17799   machine_mode mode = GET_MODE (op1);
17800   const struct real_format *fmt;
17801   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
17802   rtx adj;
17803 
17804   /* load nextafter (0.5, 0.0) */
17805   fmt = REAL_MODE_FORMAT (mode);
17806   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
17807   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
17808 
17809   /* adj = copysign (nextafter (0.5, 0.0), op1) */
17810   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
17811   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
17812 
17813   /* adj = op1 + adj */
17814   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
17815 
17816   /* op0 = (imode)adj */
17817   expand_fix (op0, adj, 0);
17818 }
17819 
17820 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
17821    into OPERAND0.  */
17822 
17823 void
17824 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
17825 {
17826   /* C code for the stuff we're doing below (for do_floor):
17827 	xi = (long)op1;
17828 	xi -= (double)xi > op1 ? 1 : 0;
17829 	return xi;
17830    */
17831   machine_mode fmode = GET_MODE (op1);
17832   machine_mode imode = GET_MODE (op0);
17833   rtx ireg, freg, tmp;
17834   rtx_code_label *label;
17835 
17836   /* reg = (long)op1 */
17837   ireg = gen_reg_rtx (imode);
17838   expand_fix (ireg, op1, 0);
17839 
17840   /* freg = (double)reg */
17841   freg = gen_reg_rtx (fmode);
17842   expand_float (freg, ireg, 0);
17843 
17844   /* ireg = (freg > op1) ? ireg - 1 : ireg */
17845   label = ix86_expand_sse_compare_and_jump (UNLE,
17846 					    freg, op1, !do_floor);
17847   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
17848 			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
17849   emit_move_insn (ireg, tmp);
17850 
17851   emit_label (label);
17852   LABEL_NUSES (label) = 1;
17853 
17854   emit_move_insn (op0, ireg);
17855 }
17856 
17857 /* Generate and return a rtx of mode MODE for 2**n where n is the number
17858    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
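/* I.e. 2**52 for DFmode and 2**23 for SFmode: the smallest power of two at
   and above which every representable value of MODE is an integer, so that
   x + TWO52 - TWO52 rounds x to an integer in the current rounding mode
   (for |x| below that threshold).  */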
17859 
17860 static rtx
17861 ix86_gen_TWO52 (machine_mode mode)
17862 {
17863   const struct real_format *fmt;
17864   REAL_VALUE_TYPE TWO52r;
17865   rtx TWO52;
17866 
17867   fmt = REAL_MODE_FORMAT (mode);
17868   real_2expN (&TWO52r, fmt->p - 1, mode);
17869   TWO52 = const_double_from_real_value (TWO52r, mode);
17870   TWO52 = force_reg (mode, TWO52);
17871 
17872   return TWO52;
17873 }
17874 
17875 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
17876 
17877 void
17878 ix86_expand_rint (rtx operand0, rtx operand1)
17879 {
17880   /* C code for the stuff we're doing below:
17881 	xa = fabs (operand1);
17882 	if (!isless (xa, 2**52))
17883 	  return operand1;
17884 	two52 = 2**52;
17885 	if (flag_rounding_math)
17886 	  {
17887 	    two52 = copysign (two52, operand1);
17888 	    xa = operand1;
17889 	  }
17890 	xa = xa + two52 - two52;
17891 	return copysign (xa, operand1);
17892    */
17893   machine_mode mode = GET_MODE (operand0);
17894   rtx res, xa, TWO52, mask;
17895   rtx_code_label *label;
17896 
17897   TWO52 = ix86_gen_TWO52 (mode);
17898 
17899   /* Temporary for holding the result, initialized to the input
17900      operand to ease control flow.  */
17901   res = copy_to_reg (operand1);
17902 
17903   /* xa = abs (operand1) */
17904   xa = ix86_expand_sse_fabs (res, &mask);
17905 
17906   /* if (!isless (xa, TWO52)) goto label; */
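  /* I.e. branch when TWO52 <= xa or when xa is a NaN (UNLE), so large
     arguments and NaNs are left unchanged.  */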
17907   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17908 
17909   if (flag_rounding_math)
17910     {
17911       ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
17912       xa = res;
17913     }
17914 
17915   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17916   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
17917 
17918   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
17919   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
17920     xa = ix86_expand_sse_fabs (xa, NULL);
17921 
17922   ix86_sse_copysign_to_positive (res, xa, res, mask);
17923 
17924   emit_label (label);
17925   LABEL_NUSES (label) = 1;
17926 
17927   emit_move_insn (operand0, res);
17928 }
17929 
17930 /* Expand SSE2 sequence for computing floor or ceil
17931    from OPERAND1 storing into OPERAND0.  */
17932 void
17933 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
17934 {
17935   /* C code for the stuff we expand below.
17936 	double xa = fabs (x), x2;
17937 	if (!isless (xa, TWO52))
17938 	  return x;
17939 	x2 = (double)(long)x;
17940 
17941      Compensate.  Floor:
17942 	if (x2 > x)
17943 	  x2 -= 1;
17944      Compensate.  Ceil:
17945 	if (x2 < x)
17946 	  x2 += 1;
17947 
17948 	if (HONOR_SIGNED_ZEROS (mode))
17949 	  return copysign (x2, x);
17950 	return x2;
17951    */
17952   machine_mode mode = GET_MODE (operand0);
17953   rtx xa, xi, TWO52, tmp, one, res, mask;
17954   rtx_code_label *label;
17955 
17956   TWO52 = ix86_gen_TWO52 (mode);
17957 
17958   /* Temporary for holding the result, initialized to the input
17959      operand to ease control flow.  */
17960   res = copy_to_reg (operand1);
17961 
17962   /* xa = abs (operand1) */
17963   xa = ix86_expand_sse_fabs (res, &mask);
17964 
17965   /* if (!isless (xa, TWO52)) goto label; */
17966   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17967 
17968   /* xa = (double)(long)x */
17969   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
17970   expand_fix (xi, res, 0);
17971   expand_float (xa, xi, 0);
17972 
17973   /* generate 1.0 */
17974   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
17975 
17976   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
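  /* The compare mask is all-ones where compensation is needed, so ANDing
     it with 1.0 yields the 0.0 / 1.0 adjustment without a branch.  */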
17977   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
17978   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
17979   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
17980 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17981   if (HONOR_SIGNED_ZEROS (mode))
17982     {
17983       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
17984       if (do_floor && flag_rounding_math)
17985 	tmp = ix86_expand_sse_fabs (tmp, NULL);
17986 
17987       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
17988     }
17989   emit_move_insn (res, tmp);
17990 
17991   emit_label (label);
17992   LABEL_NUSES (label) = 1;
17993 
17994   emit_move_insn (operand0, res);
17995 }
17996 
17997 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
17998    into OPERAND0 without relying on DImode truncation via cvttsd2siq
17999    that is only available on 64bit targets.  */
18000 void
18001 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
18002 {
18003   /* C code for the stuff we expand below.
18004 	double xa = fabs (x), x2;
18005 	if (!isless (xa, TWO52))
18006 	  return x;
18007 	xa = xa + TWO52 - TWO52;
18008 	x2 = copysign (xa, x);
18009 
18010      Compensate.  Floor:
18011 	if (x2 > x)
18012 	  x2 -= 1;
18013      Compensate.  Ceil:
18014 	if (x2 < x)
18015 	  x2 += 1;
18016 
18017 	if (HONOR_SIGNED_ZEROS (mode))
18018 	  x2 = copysign (x2, x);
18019 	return x2;
18020    */
18021   machine_mode mode = GET_MODE (operand0);
18022   rtx xa, TWO52, tmp, one, res, mask;
18023   rtx_code_label *label;
18024 
18025   TWO52 = ix86_gen_TWO52 (mode);
18026 
18027   /* Temporary for holding the result, initialized to the input
18028      operand to ease control flow.  */
18029   res = copy_to_reg (operand1);
18030 
18031   /* xa = abs (operand1) */
18032   xa = ix86_expand_sse_fabs (res, &mask);
18033 
18034   /* if (!isless (xa, TWO52)) goto label; */
18035   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18036 
18037   /* xa = xa + TWO52 - TWO52; */
18038   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18039   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18040 
18041   /* xa = copysign (xa, operand1) */
18042   ix86_sse_copysign_to_positive (xa, xa, res, mask);
18043 
18044   /* generate 1.0 */
18045   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18046 
18047   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18048   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18049   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18050   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18051 			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18052   if (HONOR_SIGNED_ZEROS (mode))
18053     {
18054       /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
18055       if (do_floor && flag_rounding_math)
18056 	tmp = ix86_expand_sse_fabs (tmp, NULL);
18057 
18058       ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18059     }
18060   emit_move_insn (res, tmp);
18061 
18062   emit_label (label);
18063   LABEL_NUSES (label) = 1;
18064 
18065   emit_move_insn (operand0, res);
18066 }
18067 
18068 /* Expand SSE sequence for computing trunc
18069    from OPERAND1 storing into OPERAND0.  */
18070 void
18071 ix86_expand_trunc (rtx operand0, rtx operand1)
18072 {
18073   /* C code for SSE variant we expand below.
18074 	double xa = fabs (x), x2;
18075 	if (!isless (xa, TWO52))
18076 	  return x;
18077 	x2 = (double)(long)x;
18078 	if (HONOR_SIGNED_ZEROS (mode))
18079 	  return copysign (x2, x);
18080 	return x2;
18081    */
18082   machine_mode mode = GET_MODE (operand0);
18083   rtx xa, xi, TWO52, res, mask;
18084   rtx_code_label *label;
18085 
18086   TWO52 = ix86_gen_TWO52 (mode);
18087 
18088   /* Temporary for holding the result, initialized to the input
18089      operand to ease control flow.  */
18090   res = copy_to_reg (operand1);
18091 
18092   /* xa = abs (operand1) */
18093   xa = ix86_expand_sse_fabs (res, &mask);
18094 
18095   /* if (!isless (xa, TWO52)) goto label; */
18096   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18097 
18098   /* xa = (double)(long)x */
18099   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18100   expand_fix (xi, res, 0);
18101   expand_float (xa, xi, 0);
18102 
18103   if (HONOR_SIGNED_ZEROS (mode))
18104     ix86_sse_copysign_to_positive (xa, xa, res, mask);
18105 
18106   emit_move_insn (res, xa);
18107 
18108   emit_label (label);
18109   LABEL_NUSES (label) = 1;
18110 
18111   emit_move_insn (operand0, res);
18112 }
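
/* Added note (illustrative, not part of the original expander): the
   expand_fix / expand_float pair above becomes a truncating
   float-to-integer conversion followed by an integer-to-float
   conversion (cvttsd2si / cvtsi2sd style), which is exact whenever
   |x| < 2**52; larger magnitudes are already integral and are handled
   by the early branch.  A hypothetical scalar model, assuming DFmode:

     double model_trunc (double x)
     {
       const double TWO52 = 4503599627370496.0;
       double xa = fabs (x), x2;
       if (!(xa < TWO52))
         return x;
       x2 = (double) (long long) x;
       return copysign (x2, x);
     }

   The copysign step is only emitted when signed zeros are honored, so
   that e.g. trunc (-0.5) yields -0.0 rather than +0.0.  */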
18113 
18114 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18115    into OPERAND0 without relying on DImode truncation via cvttsd2siq
18116    that is only available on 64bit targets.  */
18117 void
18118 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18119 {
18120   machine_mode mode = GET_MODE (operand0);
18121   rtx xa, xa2, TWO52, tmp, one, res, mask;
18122   rtx_code_label *label;
18123 
18124   /* C code for SSE variant we expand below.
18125 	double xa = fabs (x), x2;
18126 	if (!isless (xa, TWO52))
18127 	  return x;
18128 	xa2 = xa + TWO52 - TWO52;
18129      Compensate:
18130 	if (xa2 > xa)
18131 	  xa2 -= 1.0;
18132 	x2 = copysign (xa2, x);
18133 	return x2;
18134    */
18135 
18136   TWO52 = ix86_gen_TWO52 (mode);
18137 
18138   /* Temporary for holding the result, initialized to the input
18139      operand to ease control flow.  */
18140   res = copy_to_reg (operand1);
18141 
18142   /* xa = abs (operand1) */
18143   xa = ix86_expand_sse_fabs (res, &mask);
18144 
18145   /* if (!isless (xa, TWO52)) goto label; */
18146   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18147 
18148   /* xa2 = xa + TWO52 - TWO52; */
18149   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18150   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18151 
18152   /* generate 1.0 */
18153   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18154 
18155   /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
18156   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18157   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18158   tmp = expand_simple_binop (mode, MINUS,
18159 			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18160   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
18161   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18162     tmp = ix86_expand_sse_fabs (tmp, NULL);
18163 
18164   /* res = copysign (xa2, operand1) */
18165   ix86_sse_copysign_to_positive (res, tmp, res, mask);
18166 
18167   emit_label (label);
18168   LABEL_NUSES (label) = 1;
18169 
18170   emit_move_insn (operand0, res);
18171 }
18172 
18173 /* Expand SSE sequence for computing round
18174    from OPERAND1 storing into OPERAND0.  */
18175 void
18176 ix86_expand_round (rtx operand0, rtx operand1)
18177 {
18178   /* C code for the stuff we're doing below:
18179 	double xa = fabs (x);
18180 	if (!isless (xa, TWO52))
18181 	  return x;
18182 	xa = (double)(long)(xa + nextafter (0.5, 0.0));
18183 	return copysign (xa, x);
18184    */
18185   machine_mode mode = GET_MODE (operand0);
18186   rtx res, TWO52, xa, xi, half, mask;
18187   rtx_code_label *label;
18188   const struct real_format *fmt;
18189   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18190 
18191   /* Temporary for holding the result, initialized to the input
18192      operand to ease control flow.  */
18193   res = copy_to_reg (operand1);
18194 
18195   TWO52 = ix86_gen_TWO52 (mode);
18196   xa = ix86_expand_sse_fabs (res, &mask);
18197   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18198 
18199   /* load nextafter (0.5, 0.0) */
18200   fmt = REAL_MODE_FORMAT (mode);
18201   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18202   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18203 
18204   /* xa = xa + 0.5 */
18205   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18206   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18207 
18208   /* xa = (double)(int64_t)xa */
18209   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18210   expand_fix (xi, xa, 0);
18211   expand_float (xa, xi, 0);
18212 
18213   /* res = copysign (xa, operand1) */
18214   ix86_sse_copysign_to_positive (res, xa, res, mask);
18215 
18216   emit_label (label);
18217   LABEL_NUSES (label) = 1;
18218 
18219   emit_move_insn (operand0, res);
18220 }
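
/* Added note (illustrative, not part of the original expander): "half"
   above is nextafter (0.5, 0.0) rather than 0.5, so that the addition
   cannot push a value that is strictly below one half up to the next
   integer.  For example, with x = 0.49999999999999994 (the largest
   double below 0.5):

     x + 0.5        rounds to 1.0, so truncation would give 1.0, while
     x + pred_half  stays below 1.0, so truncation correctly gives 0.0.

   This matches the "(double)(long)(xa + nextafter (0.5, 0.0))" line in
   the comment at the top of the function.  */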
18221 
18222 /* Expand SSE sequence for computing round from OPERAND1 storing
18223    into OPERAND0 without relying on DImode truncation via cvttsd2siq
18224    that is only available on 64bit targets.  */
18225 void
18226 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18227 {
18228   /* C code for the stuff we expand below.
18229 	double xa = fabs (x), xa2, x2;
18230 	if (!isless (xa, TWO52))
18231 	  return x;
18232      Using the absolute value and copying back sign makes
18233      -0.0 -> -0.0 correct.
18234 	xa2 = xa + TWO52 - TWO52;
18235      Compensate.
18236 	dxa = xa2 - xa;
18237 	if (dxa <= -0.5)
18238 	  xa2 += 1;
18239 	else if (dxa > 0.5)
18240 	  xa2 -= 1;
18241 	x2 = copysign (xa2, x);
18242 	return x2;
18243    */
18244   machine_mode mode = GET_MODE (operand0);
18245   rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18246   rtx_code_label *label;
18247 
18248   TWO52 = ix86_gen_TWO52 (mode);
18249 
18250   /* Temporary for holding the result, initialized to the input
18251      operand to ease control flow.  */
18252   res = copy_to_reg (operand1);
18253 
18254   /* xa = abs (operand1) */
18255   xa = ix86_expand_sse_fabs (res, &mask);
18256 
18257   /* if (!isless (xa, TWO52)) goto label; */
18258   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18259 
18260   /* xa2 = xa + TWO52 - TWO52; */
18261   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18262   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18263 
18264   /* dxa = xa2 - xa; */
18265   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18266 
18267   /* generate 0.5, 1.0 and -0.5 */
18268   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18269   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18270   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18271 			       0, OPTAB_DIRECT);
18272 
18273   /* Compensate.  */
18274   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18275   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18276   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18277   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18278   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18279   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18280   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18281   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18282 
18283   /* res = copysign (xa2, operand1) */
18284   ix86_sse_copysign_to_positive (res, xa2, res, mask);
18285 
18286   emit_label (label);
18287   LABEL_NUSES (label) = 1;
18288 
18289   emit_move_insn (operand0, res);
18290 }
18291 
18292 /* Expand SSE sequence for computing round
18293    from OP1 storing into OP0 using sse4 round insn.  */
18294 void
18295 ix86_expand_round_sse4 (rtx op0, rtx op1)
18296 {
18297   machine_mode mode = GET_MODE (op0);
18298   rtx e1, e2, res, half;
18299   const struct real_format *fmt;
18300   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18301   rtx (*gen_copysign) (rtx, rtx, rtx);
18302   rtx (*gen_round) (rtx, rtx, rtx);
18303 
18304   switch (mode)
18305     {
18306     case E_SFmode:
18307       gen_copysign = gen_copysignsf3;
18308       gen_round = gen_sse4_1_roundsf2;
18309       break;
18310     case E_DFmode:
18311       gen_copysign = gen_copysigndf3;
18312       gen_round = gen_sse4_1_rounddf2;
18313       break;
18314     default:
18315       gcc_unreachable ();
18316     }
18317 
18318   /* round (a) = trunc (a + copysign (0.5, a)) */
18319 
18320   /* load nextafter (0.5, 0.0) */
18321   fmt = REAL_MODE_FORMAT (mode);
18322   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18323   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18324   half = const_double_from_real_value (pred_half, mode);
18325 
18326   /* e1 = copysign (0.5, op1) */
18327   e1 = gen_reg_rtx (mode);
18328   emit_insn (gen_copysign (e1, half, op1));
18329 
18330   /* e2 = op1 + e1 */
18331   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18332 
18333   /* res = trunc (e2) */
18334   res = gen_reg_rtx (mode);
18335   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18336 
18337   emit_move_insn (op0, res);
18338 }
18339 
18340 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18341    insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18342    insn every time.  */
18343 
18344 static GTY(()) rtx_insn *vselect_insn;
18345 
18346 /* Initialize vselect_insn.  */
18347 
18348 static void
18349 init_vselect_insn (void)
18350 {
18351   unsigned i;
18352   rtx x;
18353 
18354   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18355   for (i = 0; i < MAX_VECT_LEN; ++i)
18356     XVECEXP (x, 0, i) = const0_rtx;
18357   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18358 							const0_rtx), x);
18359   x = gen_rtx_SET (const0_rtx, x);
18360   start_sequence ();
18361   vselect_insn = emit_insn (x);
18362   end_sequence ();
18363 }
18364 
18365 /* Construct (set target (vec_select op0 (parallel perm))) and
18366    return true if that's a valid instruction in the active ISA.  */
18367 
18368 static bool
18369 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18370 		unsigned nelt, bool testing_p)
18371 {
18372   unsigned int i;
18373   rtx x, save_vconcat;
18374   int icode;
18375 
18376   if (vselect_insn == NULL_RTX)
18377     init_vselect_insn ();
18378 
18379   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18380   PUT_NUM_ELEM (XVEC (x, 0), nelt);
18381   for (i = 0; i < nelt; ++i)
18382     XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18383   save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18384   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18385   PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18386   SET_DEST (PATTERN (vselect_insn)) = target;
18387   icode = recog_memoized (vselect_insn);
18388 
18389   if (icode >= 0 && !testing_p)
18390     emit_insn (copy_rtx (PATTERN (vselect_insn)));
18391 
18392   SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18393   XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18394   INSN_CODE (vselect_insn) = -1;
18395 
18396   return icode >= 0;
18397 }
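
/* Added note (illustrative, not part of the original code): the cached
   vselect_insn lets expand_vselect splice the real operands and the
   requested permutation into a single preallocated pattern, ask
   recog_memoized whether some enabled define_insn matches, and then
   restore the placeholders.  A hedged usage sketch (assumed operands):

     unsigned char perm[4] = { 1, 0, 3, 2 };
     if (expand_vselect (target, op0, perm, 4, false))
       {
         ... a single matching vec_select instruction was emitted ...
       }

   Passing testing_p == true performs the same recognition test but
   emits nothing.  */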
18398 
18399 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
18400 
18401 static bool
18402 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18403 			const unsigned char *perm, unsigned nelt,
18404 			bool testing_p)
18405 {
18406   machine_mode v2mode;
18407   rtx x;
18408   bool ok;
18409 
18410   if (vselect_insn == NULL_RTX)
18411     init_vselect_insn ();
18412 
18413   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18414     return false;
18415   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18416   PUT_MODE (x, v2mode);
18417   XEXP (x, 0) = op0;
18418   XEXP (x, 1) = op1;
18419   ok = expand_vselect (target, x, perm, nelt, testing_p);
18420   XEXP (x, 0) = const0_rtx;
18421   XEXP (x, 1) = const0_rtx;
18422   return ok;
18423 }
18424 
18425 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18426    using movss or movsd.  */
18427 static bool
18428 expand_vec_perm_movs (struct expand_vec_perm_d *d)
18429 {
18430   machine_mode vmode = d->vmode;
18431   unsigned i, nelt = d->nelt;
18432   rtx x;
18433 
18434   if (d->one_operand_p)
18435     return false;
18436 
18437   if (!(TARGET_SSE && vmode == V4SFmode)
18438       && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
18439       && !(TARGET_SSE2 && vmode == V2DFmode))
18440     return false;
18441 
18442   /* Only the first element is changed.  */
18443   if (d->perm[0] != nelt && d->perm[0] != 0)
18444     return false;
18445   for (i = 1; i < nelt; ++i)
18446     if (d->perm[i] != i + nelt - d->perm[0])
18447       return false;
18448 
18449   if (d->testing_p)
18450     return true;
18451 
18452   if (d->perm[0] == nelt)
18453     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18454   else
18455     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18456 
18457   emit_insn (gen_rtx_SET (d->target, x));
18458 
18459   return true;
18460 }
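
/* Added example (illustrative): for V4SFmode with two operands, the
   permutation { 4, 1, 2, 3 } keeps elements 1..3 of op0 and takes
   element 0 from op1, which is exactly the movss-style merge
   (vec_merge op1 op0 (const_int 1)) built above; { 0, 5, 6, 7 } is the
   mirrored case handled by the else branch.  */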
18461 
18462 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18463    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
18464 
18465 static bool
18466 expand_vec_perm_blend (struct expand_vec_perm_d *d)
18467 {
18468   machine_mode mmode, vmode = d->vmode;
18469   unsigned i, nelt = d->nelt;
18470   unsigned HOST_WIDE_INT mask;
18471   rtx target, op0, op1, maskop, x;
18472   rtx rperm[32], vperm;
18473 
18474   if (d->one_operand_p)
18475     return false;
18476   if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
18477       && (TARGET_AVX512BW
18478 	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
18479     ;
18480   else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18481     ;
18482   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18483     ;
18484   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
18485 			     || GET_MODE_SIZE (vmode) == 8
18486 			     || GET_MODE_SIZE (vmode) == 4))
18487     ;
18488   else
18489     return false;
18490 
18491   /* This is a blend, not a permute.  Elements must stay in their
18492      respective lanes.  */
18493   for (i = 0; i < nelt; ++i)
18494     {
18495       unsigned e = d->perm[i];
18496       if (!(e == i || e == i + nelt))
18497 	return false;
18498     }
18499 
18500   if (d->testing_p)
18501     return true;
18502 
18503   /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
18504      decision should be extracted elsewhere, so that we only try that
18505      sequence once all budget==3 options have been tried.  */
18506   target = d->target;
18507   op0 = d->op0;
18508   op1 = d->op1;
18509   mask = 0;
18510 
18511   switch (vmode)
18512     {
18513     case E_V8DFmode:
18514     case E_V16SFmode:
18515     case E_V4DFmode:
18516     case E_V8SFmode:
18517     case E_V2DFmode:
18518     case E_V4SFmode:
18519     case E_V4HImode:
18520     case E_V8HImode:
18521     case E_V8SImode:
18522     case E_V32HImode:
18523     case E_V64QImode:
18524     case E_V16SImode:
18525     case E_V8DImode:
18526       for (i = 0; i < nelt; ++i)
18527 	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
18528       break;
18529 
18530     case E_V2DImode:
18531       for (i = 0; i < 2; ++i)
18532 	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
18533       vmode = V8HImode;
18534       goto do_subreg;
18535 
18536     case E_V2SImode:
18537       for (i = 0; i < 2; ++i)
18538 	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
18539       vmode = V4HImode;
18540       goto do_subreg;
18541 
18542     case E_V4SImode:
18543       for (i = 0; i < 4; ++i)
18544 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18545       vmode = V8HImode;
18546       goto do_subreg;
18547 
18548     case E_V16QImode:
18549       /* See if bytes move in pairs so we can use pblendw with
18550 	 an immediate argument, rather than pblendvb with a vector
18551 	 argument.  */
18552       for (i = 0; i < 16; i += 2)
18553 	if (d->perm[i] + 1 != d->perm[i + 1])
18554 	  {
18555 	  use_pblendvb:
18556 	    for (i = 0; i < nelt; ++i)
18557 	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
18558 
18559 	  finish_pblendvb:
18560 	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18561 	    vperm = force_reg (vmode, vperm);
18562 
18563 	    if (GET_MODE_SIZE (vmode) == 4)
18564 	      emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
18565 	    else if (GET_MODE_SIZE (vmode) == 8)
18566 	      emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
18567 	    else if (GET_MODE_SIZE (vmode) == 16)
18568 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
18569 	    else
18570 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
18571 	    if (target != d->target)
18572 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18573 	    return true;
18574 	  }
18575 
18576       for (i = 0; i < 8; ++i)
18577 	mask |= (d->perm[i * 2] >= 16) << i;
18578       vmode = V8HImode;
18579       /* FALLTHRU */
18580 
18581     do_subreg:
18582       target = gen_reg_rtx (vmode);
18583       op0 = gen_lowpart (vmode, op0);
18584       op1 = gen_lowpart (vmode, op1);
18585       break;
18586 
18587     case E_V8QImode:
18588       for (i = 0; i < 8; i += 2)
18589 	if (d->perm[i] + 1 != d->perm[i + 1])
18590 	  goto use_pblendvb;
18591 
18592       for (i = 0; i < 4; ++i)
18593 	mask |= (d->perm[i * 2] >= 8) << i;
18594       vmode = V4HImode;
18595       goto do_subreg;
18596 
18597     case E_V4QImode:
18598       for (i = 0; i < 4; i += 2)
18599 	if (d->perm[i] + 1 != d->perm[i + 1])
18600 	  goto use_pblendvb;
18601 
18602       for (i = 0; i < 2; ++i)
18603 	mask |= (d->perm[i * 2] >= 4) << i;
18604       vmode = V2HImode;
18605       goto do_subreg;
18606 
18607     case E_V32QImode:
18608       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
18609       for (i = 0; i < 32; i += 2)
18610 	if (d->perm[i] + 1 != d->perm[i + 1])
18611 	  goto use_pblendvb;
18612       /* See if bytes move in quadruplets.  If yes, vpblendd
18613 	 with immediate can be used.  */
18614       for (i = 0; i < 32; i += 4)
18615 	if (d->perm[i] + 2 != d->perm[i + 2])
18616 	  break;
18617       if (i < 32)
18618 	{
18619 	  /* See if bytes move the same in both lanes.  If yes,
18620 	     vpblendw with immediate can be used.  */
18621 	  for (i = 0; i < 16; i += 2)
18622 	    if (d->perm[i] + 16 != d->perm[i + 16])
18623 	      goto use_pblendvb;
18624 
18625 	  /* Use vpblendw.  */
18626 	  for (i = 0; i < 16; ++i)
18627 	    mask |= (d->perm[i * 2] >= 32) << i;
18628 	  vmode = V16HImode;
18629 	  goto do_subreg;
18630 	}
18631 
18632       /* Use vpblendd.  */
18633       for (i = 0; i < 8; ++i)
18634 	mask |= (d->perm[i * 4] >= 32) << i;
18635       vmode = V8SImode;
18636       goto do_subreg;
18637 
18638     case E_V16HImode:
18639       /* See if words move in pairs.  If yes, vpblendd can be used.  */
18640       for (i = 0; i < 16; i += 2)
18641 	if (d->perm[i] + 1 != d->perm[i + 1])
18642 	  break;
18643       if (i < 16)
18644 	{
18645 	  /* See if words move the same in both lanes.  If not,
18646 	     vpblendvb must be used.  */
18647 	  for (i = 0; i < 8; i++)
18648 	    if (d->perm[i] + 8 != d->perm[i + 8])
18649 	      {
18650 		/* Use vpblendvb.  */
18651 		for (i = 0; i < 32; ++i)
18652 		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
18653 
18654 		vmode = V32QImode;
18655 		nelt = 32;
18656 		target = gen_reg_rtx (vmode);
18657 		op0 = gen_lowpart (vmode, op0);
18658 		op1 = gen_lowpart (vmode, op1);
18659 		goto finish_pblendvb;
18660 	      }
18661 
18662 	  /* Use vpblendw.  */
18663 	  for (i = 0; i < 16; ++i)
18664 	    mask |= (d->perm[i] >= 16) << i;
18665 	  break;
18666 	}
18667 
18668       /* Use vpblendd.  */
18669       for (i = 0; i < 8; ++i)
18670 	mask |= (d->perm[i * 2] >= 16) << i;
18671       vmode = V8SImode;
18672       goto do_subreg;
18673 
18674     case E_V4DImode:
18675       /* Use vpblendd.  */
18676       for (i = 0; i < 4; ++i)
18677 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18678       vmode = V8SImode;
18679       goto do_subreg;
18680 
18681     default:
18682       gcc_unreachable ();
18683     }
18684 
18685   switch (vmode)
18686     {
18687     case E_V8DFmode:
18688     case E_V8DImode:
18689       mmode = QImode;
18690       break;
18691     case E_V16SFmode:
18692     case E_V16SImode:
18693       mmode = HImode;
18694       break;
18695     case E_V32HImode:
18696       mmode = SImode;
18697       break;
18698     case E_V64QImode:
18699       mmode = DImode;
18700       break;
18701     default:
18702       mmode = VOIDmode;
18703     }
18704 
18705   if (mmode != VOIDmode)
18706     maskop = force_reg (mmode, gen_int_mode (mask, mmode));
18707   else
18708     maskop = GEN_INT (mask);
18709 
18710   /* This matches five different patterns with the different modes.  */
18711   x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
18712   x = gen_rtx_SET (target, x);
18713   emit_insn (x);
18714   if (target != d->target)
18715     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18716 
18717   return true;
18718 }
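
/* Added example (illustrative): for V8HImode and the permutation
   { 0, 9, 2, 11, 4, 13, 6, 15 }, every odd element comes from op1
   (index >= nelt), so the loop above builds mask == 0xaa and the
   resulting vec_merge matches a single pblendw with that immediate.  */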
18719 
18720 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18721    in terms of the variable form of vpermilps.
18722 
18723    Note that we will have already failed the immediate input vpermilps,
18724    which requires that the high and low part shuffle be identical; the
18725    variable form doesn't require that.  */
18726 
18727 static bool
18728 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
18729 {
18730   rtx rperm[8], vperm;
18731   unsigned i;
18732 
18733   if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
18734     return false;
18735 
18736   /* We can only permute within the 128-bit lane.  */
18737   for (i = 0; i < 8; ++i)
18738     {
18739       unsigned e = d->perm[i];
18740       if (i < 4 ? e >= 4 : e < 4)
18741 	return false;
18742     }
18743 
18744   if (d->testing_p)
18745     return true;
18746 
18747   for (i = 0; i < 8; ++i)
18748     {
18749       unsigned e = d->perm[i];
18750 
18751       /* Within each 128-bit lane, the elements of op0 are numbered
18752 	 from 0 and the elements of op1 are numbered from 4.  */
18753       if (e >= 8 + 4)
18754 	e -= 8;
18755       else if (e >= 4)
18756 	e -= 4;
18757 
18758       rperm[i] = GEN_INT (e);
18759     }
18760 
18761   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
18762   vperm = force_reg (V8SImode, vperm);
18763   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
18764 
18765   return true;
18766 }
18767 
18768 /* For V*[QHS]Imode permutations, check whether the same permutation
18769    can be performed in a 2x, 4x or 8x wider inner mode.  */
18770 
18771 static bool
18772 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
18773 			      struct expand_vec_perm_d *nd)
18774 {
18775   int i;
18776   machine_mode mode = VOIDmode;
18777 
18778   switch (d->vmode)
18779     {
18780     case E_V8QImode: mode = V4HImode; break;
18781     case E_V16QImode: mode = V8HImode; break;
18782     case E_V32QImode: mode = V16HImode; break;
18783     case E_V64QImode: mode = V32HImode; break;
18784     case E_V4HImode: mode = V2SImode; break;
18785     case E_V8HImode: mode = V4SImode; break;
18786     case E_V16HImode: mode = V8SImode; break;
18787     case E_V32HImode: mode = V16SImode; break;
18788     case E_V4SImode: mode = V2DImode; break;
18789     case E_V8SImode: mode = V4DImode; break;
18790     case E_V16SImode: mode = V8DImode; break;
18791     default: return false;
18792     }
18793   for (i = 0; i < d->nelt; i += 2)
18794     if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
18795       return false;
18796   nd->vmode = mode;
18797   nd->nelt = d->nelt / 2;
18798   for (i = 0; i < nd->nelt; i++)
18799     nd->perm[i] = d->perm[2 * i] / 2;
18800   if (GET_MODE_INNER (mode) != DImode)
18801     canonicalize_vector_int_perm (nd, nd);
18802   if (nd != d)
18803     {
18804       nd->one_operand_p = d->one_operand_p;
18805       nd->testing_p = d->testing_p;
18806       if (d->op0 == d->op1)
18807 	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
18808       else
18809 	{
18810 	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
18811 	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
18812 	}
18813       if (d->testing_p)
18814 	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
18815       else
18816 	nd->target = gen_reg_rtx (nd->vmode);
18817     }
18818   return true;
18819 }
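
/* Added example (illustrative): a V16QImode permutation whose bytes
   move in aligned pairs, e.g.
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 },
   is rewritten above as the V8HImode permutation
   { 1, 0, 3, 2, 5, 4, 7, 6 }; the recursive call then checks whether
   word pairs also move together (here they do not, so V8HImode is the
   final mode).  */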
18820 
18821 /* Return true if permutation D can be performed as VMODE permutation
18822    instead.  */
18823 
18824 static bool
18825 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
18826 {
18827   unsigned int i, j, chunk;
18828 
18829   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
18830       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
18831       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
18832     return false;
18833 
18834   if (GET_MODE_NUNITS (vmode) >= d->nelt)
18835     return true;
18836 
18837   chunk = d->nelt / GET_MODE_NUNITS (vmode);
18838   for (i = 0; i < d->nelt; i += chunk)
18839     if (d->perm[i] & (chunk - 1))
18840       return false;
18841     else
18842       for (j = 1; j < chunk; ++j)
18843 	if (d->perm[i] + j != d->perm[i + j])
18844 	  return false;
18845 
18846   return true;
18847 }
18848 
18849 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
18850    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
18851 
18852 static bool
18853 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
18854 {
18855   unsigned i, nelt, eltsz, mask;
18856   unsigned char perm[64];
18857   machine_mode vmode;
18858   struct expand_vec_perm_d nd;
18859   rtx rperm[64], vperm, target, op0, op1;
18860 
18861   nelt = d->nelt;
18862 
18863   if (!d->one_operand_p)
18864     switch (GET_MODE_SIZE (d->vmode))
18865       {
18866       case 4:
18867 	if (!TARGET_XOP)
18868 	  return false;
18869 	vmode = V4QImode;
18870 	break;
18871 
18872       case 8:
18873 	if (!TARGET_XOP)
18874 	  return false;
18875 	vmode = V8QImode;
18876 	break;
18877 
18878       case 16:
18879 	if (!TARGET_XOP)
18880 	  return false;
18881 	vmode = V16QImode;
18882 	break;
18883 
18884       case 32:
18885 	if (!TARGET_AVX2)
18886 	  return false;
18887 
18888 	if (valid_perm_using_mode_p (V2TImode, d))
18889 	  {
18890 	    if (d->testing_p)
18891 	      return true;
18892 
18893 	    /* Use vperm2i128 insn.  The pattern uses
18894 	       V4DImode instead of V2TImode.  */
18895 	    target = d->target;
18896 	    if (d->vmode != V4DImode)
18897 	      target = gen_reg_rtx (V4DImode);
18898 	    op0 = gen_lowpart (V4DImode, d->op0);
18899 	    op1 = gen_lowpart (V4DImode, d->op1);
18900 	    rperm[0]
18901 	      = GEN_INT ((d->perm[0] / (nelt / 2))
18902 			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
18903 	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
18904 	    if (target != d->target)
18905 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18906 	    return true;
18907 	  }
18908 	/* FALLTHRU */
18909 
18910       default:
18911 	return false;
18912       }
18913   else
18914     switch (GET_MODE_SIZE (d->vmode))
18915       {
18916       case 4:
18917 	if (!TARGET_SSSE3)
18918 	  return false;
18919 	vmode = V4QImode;
18920 	break;
18921 
18922       case 8:
18923 	if (!TARGET_SSSE3)
18924 	  return false;
18925 	vmode = V8QImode;
18926 	break;
18927 
18928       case 16:
18929 	if (!TARGET_SSSE3)
18930 	  return false;
18931 	vmode = V16QImode;
18932 	break;
18933 
18934       case 32:
18935 	if (!TARGET_AVX2)
18936 	  return false;
18937 
18938 	/* V4DImode should be already handled through
18939 	   expand_vselect by vpermq instruction.  */
18940 	gcc_assert (d->vmode != V4DImode);
18941 
18942 	vmode = V32QImode;
18943 	if (d->vmode == V8SImode
18944 	    || d->vmode == V16HImode
18945 	    || d->vmode == V32QImode)
18946 	  {
18947 	    /* First see if vpermq can be used for
18948 	       V8SImode/V16HImode/V32QImode.  */
18949 	    if (valid_perm_using_mode_p (V4DImode, d))
18950 	      {
18951 		for (i = 0; i < 4; i++)
18952 		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
18953 		if (d->testing_p)
18954 		  return true;
18955 		target = gen_reg_rtx (V4DImode);
18956 		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
18957 				    perm, 4, false))
18958 		  {
18959 		    emit_move_insn (d->target,
18960 				    gen_lowpart (d->vmode, target));
18961 		    return true;
18962 		  }
18963 		return false;
18964 	      }
18965 
18966 	    /* Next see if vpermd can be used.  */
18967 	    if (valid_perm_using_mode_p (V8SImode, d))
18968 	      vmode = V8SImode;
18969 	  }
18970 	/* Or if vpermps can be used.  */
18971 	else if (d->vmode == V8SFmode)
18972 	  vmode = V8SImode;
18973 
18974 	if (vmode == V32QImode)
18975 	  {
18976 	    /* vpshufb only works intra lanes, it is not
18977 	       possible to shuffle bytes in between the lanes.  */
18978 	    for (i = 0; i < nelt; ++i)
18979 	      if ((d->perm[i] ^ i) & (nelt / 2))
18980 		return false;
18981 	  }
18982 	break;
18983 
18984       case 64:
18985 	if (!TARGET_AVX512BW)
18986 	  return false;
18987 
18988 	/* If vpermq didn't work, vpshufb won't work either.  */
18989 	if (d->vmode == V8DFmode || d->vmode == V8DImode)
18990 	  return false;
18991 
18992 	vmode = V64QImode;
18993 	if (d->vmode == V16SImode
18994 	    || d->vmode == V32HImode
18995 	    || d->vmode == V64QImode)
18996 	  {
18997 	    /* First see if vpermq can be used for
18998 	       V16SImode/V32HImode/V64QImode.  */
18999 	    if (valid_perm_using_mode_p (V8DImode, d))
19000 	      {
19001 		for (i = 0; i < 8; i++)
19002 		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19003 		if (d->testing_p)
19004 		  return true;
19005 		target = gen_reg_rtx (V8DImode);
19006 		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19007 				    perm, 8, false))
19008 		  {
19009 		    emit_move_insn (d->target,
19010 				    gen_lowpart (d->vmode, target));
19011 		    return true;
19012 		  }
19013 		return false;
19014 	      }
19015 
19016 	    /* Next see if vpermd can be used.  */
19017 	    if (valid_perm_using_mode_p (V16SImode, d))
19018 	      vmode = V16SImode;
19019 	  }
19020 	/* Or if vpermps can be used.  */
19021 	else if (d->vmode == V16SFmode)
19022 	  vmode = V16SImode;
19023 
19024 	if (vmode == V64QImode)
19025 	  {
19026 	    /* vpshufb only works intra lanes, it is not
19027 	       possible to shuffle bytes in between the lanes.  */
19028 	    for (i = 0; i < nelt; ++i)
19029 	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
19030 		return false;
19031 	  }
19032 	break;
19033 
19034       default:
19035 	return false;
19036       }
19037 
19038   if (d->testing_p)
19039     return true;
19040 
19041   /* Try to avoid variable permutation instruction.  */
19042   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19043     {
19044       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19045       return true;
19046     }
19047 
19048   if (vmode == V8SImode)
19049     for (i = 0; i < 8; ++i)
19050       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19051   else if (vmode == V16SImode)
19052     for (i = 0; i < 16; ++i)
19053       rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19054   else
19055     {
19056       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19057       if (!d->one_operand_p)
19058 	mask = 2 * nelt - 1;
19059       else if (vmode == V64QImode)
19060 	mask = nelt / 4 - 1;
19061       else if (vmode == V32QImode)
19062 	mask = nelt / 2 - 1;
19063       else
19064 	mask = nelt - 1;
19065 
19066       for (i = 0; i < nelt; ++i)
19067 	{
19068 	  unsigned j, e = d->perm[i] & mask;
19069 	  for (j = 0; j < eltsz; ++j)
19070 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19071 	}
19072     }
19073 
19074   machine_mode vpmode = vmode;
19075 
19076   nelt = GET_MODE_SIZE (vmode);
19077 
19078   /* Emulate narrow modes with V16QI instructions.  */
19079   if (nelt < 16)
19080     {
19081       rtx m128 = GEN_INT (-128);
19082 
19083       /* Remap elements from the second operand, as we have to
19084 	 account for inactive top elements from the first operand.  */
19085       if (!d->one_operand_p)
19086 	{
19087 	  for (i = 0; i < nelt; ++i)
19088 	    {
19089 	      unsigned ival = UINTVAL (rperm[i]);
19090 	      if (ival >= nelt)
19091 		rperm[i] = GEN_INT (ival + 16 - nelt);
19092 	    }
19093 	}
19094 
19095       /* Fill inactive elements in the top positions with zeros.  */
19096       for (i = nelt; i < 16; ++i)
19097 	rperm[i] = m128;
19098 
19099       vpmode = V16QImode;
19100     }
19101 
19102   vperm = gen_rtx_CONST_VECTOR (vpmode,
19103 				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19104   vperm = force_reg (vpmode, vperm);
19105 
19106   if (vmode == d->vmode)
19107     target = d->target;
19108   else
19109     target = gen_reg_rtx (vmode);
19110 
19111   op0 = gen_lowpart (vmode, d->op0);
19112 
19113   if (d->one_operand_p)
19114     {
19115       rtx (*gen) (rtx, rtx, rtx);
19116 
19117       if (vmode == V4QImode)
19118 	gen = gen_mmx_pshufbv4qi3;
19119       else if (vmode == V8QImode)
19120 	gen = gen_mmx_pshufbv8qi3;
19121       else if (vmode == V16QImode)
19122 	gen = gen_ssse3_pshufbv16qi3;
19123       else if (vmode == V32QImode)
19124 	gen = gen_avx2_pshufbv32qi3;
19125       else if (vmode == V64QImode)
19126 	gen = gen_avx512bw_pshufbv64qi3;
19127       else if (vmode == V8SFmode)
19128 	gen = gen_avx2_permvarv8sf;
19129       else if (vmode == V8SImode)
19130 	gen = gen_avx2_permvarv8si;
19131       else if (vmode == V16SFmode)
19132 	gen = gen_avx512f_permvarv16sf;
19133       else if (vmode == V16SImode)
19134 	gen = gen_avx512f_permvarv16si;
19135       else
19136 	gcc_unreachable ();
19137 
19138       emit_insn (gen (target, op0, vperm));
19139     }
19140   else
19141     {
19142       rtx (*gen) (rtx, rtx, rtx, rtx);
19143 
19144       op1 = gen_lowpart (vmode, d->op1);
19145 
19146       if (vmode == V4QImode)
19147 	gen = gen_mmx_ppermv32;
19148       else if (vmode == V8QImode)
19149 	gen = gen_mmx_ppermv64;
19150       else if (vmode == V16QImode)
19151 	gen = gen_xop_pperm;
19152       else
19153 	gcc_unreachable ();
19154 
19155       emit_insn (gen (target, op0, op1, vperm));
19156     }
19157 
19158   if (target != d->target)
19159     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19160 
19161   return true;
19162 }
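
/* Added example (illustrative): when a byte-level control vector is
   built above, each element index is expanded into its component
   bytes.  For a one-operand V8HImode permutation
   { 3, 2, 1, 0, 7, 6, 5, 4 } (eltsz == 2, mask == nelt - 1), element i
   contributes bytes { 2*e, 2*e + 1 }, giving the V16QImode selector
   { 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 }.  */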
19163 
19164 /* Try to expand one-operand permutation with constant mask.  */
19165 
19166 static bool
19167 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19168 {
19169   machine_mode mode = GET_MODE (d->op0);
19170   machine_mode maskmode = mode;
19171   unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19172   rtx (*gen) (rtx, rtx, rtx) = NULL;
19173   rtx target, op0, mask;
19174   rtx vec[64];
19175 
19176   if (!rtx_equal_p (d->op0, d->op1))
19177     return false;
19178 
19179   if (!TARGET_AVX512F)
19180     return false;
19181 
19182   /* Accept VNxHImode and VNxQImode now.  */
19183   if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19184     return false;
19185 
19186   /* vpermw.  */
19187   if (!TARGET_AVX512BW && inner_size == 2)
19188     return false;
19189 
19190   /* vpermb.  */
19191   if (!TARGET_AVX512VBMI && inner_size == 1)
19192     return false;
19193 
19194   switch (mode)
19195     {
19196     case E_V16SImode:
19197       gen = gen_avx512f_permvarv16si;
19198       break;
19199     case E_V16SFmode:
19200       gen = gen_avx512f_permvarv16sf;
19201       maskmode = V16SImode;
19202       break;
19203     case E_V8DImode:
19204       gen = gen_avx512f_permvarv8di;
19205       break;
19206     case E_V8DFmode:
19207       gen = gen_avx512f_permvarv8df;
19208       maskmode = V8DImode;
19209       break;
19210     case E_V32HImode:
19211       gen = gen_avx512bw_permvarv32hi;
19212       break;
19213     case E_V16HImode:
19214       gen = gen_avx512vl_permvarv16hi;
19215       break;
19216     case E_V8HImode:
19217       gen = gen_avx512vl_permvarv8hi;
19218       break;
19219     case E_V64QImode:
19220       gen = gen_avx512bw_permvarv64qi;
19221       break;
19222     case E_V32QImode:
19223       gen = gen_avx512vl_permvarv32qi;
19224       break;
19225     case E_V16QImode:
19226       gen = gen_avx512vl_permvarv16qi;
19227       break;
19228 
19229     default:
19230       return false;
19231     }
19232 
19233   if (d->testing_p)
19234     return true;
19235 
19236   target = d->target;
19237   op0 = d->op0;
19238   for (int i = 0; i < d->nelt; ++i)
19239     vec[i] = GEN_INT (d->perm[i]);
19240   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19241   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19242   return true;
19243 }
19244 
19245 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19246 
19247 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
19248    in a single instruction.  */
19249 
19250 static bool
19251 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19252 {
19253   unsigned i, nelt = d->nelt;
19254   struct expand_vec_perm_d nd;
19255 
19256   /* Check plain VEC_SELECT first, because AVX has instructions that could
19257      match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19258      input where SEL+CONCAT may not.  */
19259   if (d->one_operand_p)
19260     {
19261       int mask = nelt - 1;
19262       bool identity_perm = true;
19263       bool broadcast_perm = true;
19264 
19265       for (i = 0; i < nelt; i++)
19266 	{
19267 	  nd.perm[i] = d->perm[i] & mask;
19268 	  if (nd.perm[i] != i)
19269 	    identity_perm = false;
19270 	  if (nd.perm[i])
19271 	    broadcast_perm = false;
19272 	}
19273 
19274       if (identity_perm)
19275 	{
19276 	  if (!d->testing_p)
19277 	    emit_move_insn (d->target, d->op0);
19278 	  return true;
19279 	}
19280       else if (broadcast_perm && TARGET_AVX2)
19281 	{
19282 	  /* Use vpbroadcast{b,w,d}.  */
19283 	  rtx (*gen) (rtx, rtx) = NULL;
19284 	  switch (d->vmode)
19285 	    {
19286 	    case E_V64QImode:
19287 	      if (TARGET_AVX512BW)
19288 		gen = gen_avx512bw_vec_dupv64qi_1;
19289 	      break;
19290 	    case E_V32QImode:
19291 	      gen = gen_avx2_pbroadcastv32qi_1;
19292 	      break;
19293 	    case E_V32HImode:
19294 	      if (TARGET_AVX512BW)
19295 		gen = gen_avx512bw_vec_dupv32hi_1;
19296 	      break;
19297 	    case E_V16HImode:
19298 	      gen = gen_avx2_pbroadcastv16hi_1;
19299 	      break;
19300 	    case E_V16SImode:
19301 	      if (TARGET_AVX512F)
19302 		gen = gen_avx512f_vec_dupv16si_1;
19303 	      break;
19304 	    case E_V8SImode:
19305 	      gen = gen_avx2_pbroadcastv8si_1;
19306 	      break;
19307 	    case E_V16QImode:
19308 	      gen = gen_avx2_pbroadcastv16qi;
19309 	      break;
19310 	    case E_V8HImode:
19311 	      gen = gen_avx2_pbroadcastv8hi;
19312 	      break;
19313 	    case E_V16SFmode:
19314 	      if (TARGET_AVX512F)
19315 		gen = gen_avx512f_vec_dupv16sf_1;
19316 	      break;
19317 	    case E_V8SFmode:
19318 	      gen = gen_avx2_vec_dupv8sf_1;
19319 	      break;
19320 	    case E_V8DFmode:
19321 	      if (TARGET_AVX512F)
19322 		gen = gen_avx512f_vec_dupv8df_1;
19323 	      break;
19324 	    case E_V8DImode:
19325 	      if (TARGET_AVX512F)
19326 		gen = gen_avx512f_vec_dupv8di_1;
19327 	      break;
19328 	    /* For other modes prefer other shuffles this function creates.  */
19329 	    default: break;
19330 	    }
19331 	  if (gen != NULL)
19332 	    {
19333 	      if (!d->testing_p)
19334 		emit_insn (gen (d->target, d->op0));
19335 	      return true;
19336 	    }
19337 	}
19338 
19339       if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19340 	return true;
19341 
19342       /* There are plenty of patterns in sse.md that are written for
19343 	 SEL+CONCAT and are not replicated for a single op.  Perhaps
19344 	 that should be changed, to avoid the nastiness here.  */
19345 
19346       /* Recognize interleave style patterns, which means incrementing
19347 	 every other permutation operand.  */
19348       for (i = 0; i < nelt; i += 2)
19349 	{
19350 	  nd.perm[i] = d->perm[i] & mask;
19351 	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19352 	}
19353       if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19354 				  d->testing_p))
19355 	return true;
19356 
19357       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
19358       if (nelt >= 4)
19359 	{
19360 	  for (i = 0; i < nelt; i += 4)
19361 	    {
19362 	      nd.perm[i + 0] = d->perm[i + 0] & mask;
19363 	      nd.perm[i + 1] = d->perm[i + 1] & mask;
19364 	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19365 	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19366 	    }
19367 
19368 	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19369 				      d->testing_p))
19370 	    return true;
19371 	}
19372     }
19373 
19374   /* Try movss/movsd instructions.  */
19375   if (expand_vec_perm_movs (d))
19376     return true;
19377 
19378   /* Finally, try the fully general two operand permute.  */
19379   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19380 			      d->testing_p))
19381     return true;
19382 
19383   /* Recognize interleave style patterns with reversed operands.  */
19384   if (!d->one_operand_p)
19385     {
19386       for (i = 0; i < nelt; ++i)
19387 	{
19388 	  unsigned e = d->perm[i];
19389 	  if (e >= nelt)
19390 	    e -= nelt;
19391 	  else
19392 	    e += nelt;
19393 	  nd.perm[i] = e;
19394 	}
19395 
19396       if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19397 				  d->testing_p))
19398 	return true;
19399     }
19400 
19401   /* Try the SSE4.1 blend variable merge instructions.  */
19402   if (expand_vec_perm_blend (d))
19403     return true;
19404 
19405   /* Try one of the AVX vpermil variable permutations.  */
19406   if (expand_vec_perm_vpermil (d))
19407     return true;
19408 
19409   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19410      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
19411   if (expand_vec_perm_pshufb (d))
19412     return true;
19413 
19414   /* Try the AVX2 vpalignr instruction.  */
19415   if (expand_vec_perm_palignr (d, true))
19416     return true;
19417 
19418   /* Try the AVX512F vperm{w,b,s,d} instructions.  */
19419   if (ix86_expand_vec_one_operand_perm_avx512 (d))
19420     return true;
19421 
19422   /* Try the AVX512F vpermt2/vpermi2 instructions.  */
19423   if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
19424     return true;
19425 
19426   /* See if we can get the same permutation in different vector integer
19427      mode.  */
19428   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19429     {
19430       if (!d->testing_p)
19431 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19432       return true;
19433     }
19434   return false;
19435 }
19436 
19437 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
19438    in terms of a pair of pshuflw + pshufhw instructions.  */
19439 
19440 static bool
19441 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
19442 {
19443   unsigned char perm2[MAX_VECT_LEN];
19444   unsigned i;
19445   bool ok;
19446 
19447   if (d->vmode != V8HImode || !d->one_operand_p)
19448     return false;
19449 
19450   /* The two permutations only operate in 64-bit lanes.  */
19451   for (i = 0; i < 4; ++i)
19452     if (d->perm[i] >= 4)
19453       return false;
19454   for (i = 4; i < 8; ++i)
19455     if (d->perm[i] < 4)
19456       return false;
19457 
19458   if (d->testing_p)
19459     return true;
19460 
19461   /* Emit the pshuflw.  */
19462   memcpy (perm2, d->perm, 4);
19463   for (i = 4; i < 8; ++i)
19464     perm2[i] = i;
19465   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
19466   gcc_assert (ok);
19467 
19468   /* Emit the pshufhw.  */
19469   memcpy (perm2 + 4, d->perm + 4, 4);
19470   for (i = 0; i < 4; ++i)
19471     perm2[i] = i;
19472   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
19473   gcc_assert (ok);
19474 
19475   return true;
19476 }
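
/* Added example (illustrative): the V8HImode permutation
   { 3, 2, 1, 0, 7, 6, 5, 4 } passes the lane checks above and is
   expanded as pshuflw selecting { 3, 2, 1, 0 } within the low quadword
   followed by pshufhw selecting { 3, 2, 1, 0 } within the high
   quadword (immediate 0x1b in both cases).  */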
19477 
19478 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
19479    the permutation using the SSSE3 palignr instruction.  This succeeds
19480    when all of the elements in PERM fit within one vector and we merely
19481    need to shift them down so that a single vector permutation has a
19482    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
19483    the vpalignr instruction itself can perform the requested permutation.  */
19484 
19485 static bool
19486 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
19487 {
19488   unsigned i, nelt = d->nelt;
19489   unsigned min, max, minswap, maxswap;
19490   bool in_order, ok, swap = false;
19491   rtx shift, target;
19492   struct expand_vec_perm_d dcopy;
19493 
19494   /* Even with AVX, palignr only operates on 128-bit vectors,
19495      in AVX2 palignr operates on both 128-bit lanes.  */
19496   if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
19497       && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
19498     return false;
19499 
19500   min = 2 * nelt;
19501   max = 0;
19502   minswap = 2 * nelt;
19503   maxswap = 0;
19504   for (i = 0; i < nelt; ++i)
19505     {
19506       unsigned e = d->perm[i];
19507       unsigned eswap = d->perm[i] ^ nelt;
19508       if (GET_MODE_SIZE (d->vmode) == 32)
19509 	{
19510 	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
19511 	  eswap = e ^ (nelt / 2);
19512 	}
19513       if (e < min)
19514 	min = e;
19515       if (e > max)
19516 	max = e;
19517       if (eswap < minswap)
19518 	minswap = eswap;
19519       if (eswap > maxswap)
19520 	maxswap = eswap;
19521     }
19522   if (min == 0
19523       || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
19524     {
19525       if (d->one_operand_p
19526 	  || minswap == 0
19527 	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
19528 				   ? nelt / 2 : nelt))
19529 	return false;
19530       swap = true;
19531       min = minswap;
19532       max = maxswap;
19533     }
19534 
19535   /* Given that we have SSSE3, we know we'll be able to implement the
19536      single operand permutation after the palignr with pshufb for
19537      128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
19538      first.  */
19539   if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
19540     return true;
19541 
19542   dcopy = *d;
19543   if (swap)
19544     {
19545       dcopy.op0 = d->op1;
19546       dcopy.op1 = d->op0;
19547       for (i = 0; i < nelt; ++i)
19548 	dcopy.perm[i] ^= nelt;
19549     }
19550 
19551   in_order = true;
19552   for (i = 0; i < nelt; ++i)
19553     {
19554       unsigned e = dcopy.perm[i];
19555       if (GET_MODE_SIZE (d->vmode) == 32
19556 	  && e >= nelt
19557 	  && (e & (nelt / 2 - 1)) < min)
19558 	e = e - min - (nelt / 2);
19559       else
19560 	e = e - min;
19561       if (e != i)
19562 	in_order = false;
19563       dcopy.perm[i] = e;
19564     }
19565   dcopy.one_operand_p = true;
19566 
19567   if (single_insn_only_p && !in_order)
19568     return false;
19569 
19570   /* For AVX2, test whether we can permute the result in one instruction.  */
19571   if (d->testing_p)
19572     {
19573       if (in_order)
19574 	return true;
19575       dcopy.op1 = dcopy.op0;
19576       return expand_vec_perm_1 (&dcopy);
19577     }
19578 
19579   shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
19580   if (GET_MODE_SIZE (d->vmode) == 16)
19581     {
19582       target = gen_reg_rtx (TImode);
19583       emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
19584 				      gen_lowpart (TImode, dcopy.op0), shift));
19585     }
19586   else
19587     {
19588       target = gen_reg_rtx (V2TImode);
19589       emit_insn (gen_avx2_palignrv2ti (target,
19590 				       gen_lowpart (V2TImode, dcopy.op1),
19591 				       gen_lowpart (V2TImode, dcopy.op0),
19592 				       shift));
19593     }
19594 
19595   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
19596 
19597   /* Test for the degenerate case where the alignment by itself
19598      produces the desired permutation.  */
19599   if (in_order)
19600     {
19601       emit_move_insn (d->target, dcopy.op0);
19602       return true;
19603     }
19604 
19605   ok = expand_vec_perm_1 (&dcopy);
19606   gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
19607 
19608   return ok;
19609 }
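
/* Added example (illustrative): for a two-operand V16QImode
   permutation selecting bytes 3 through 18 in order, every selected
   byte falls in a 16-byte window starting at min == 3, so the code
   above emits palignr with a 3-byte shift of the (op1:op0)
   concatenation; the shifted result is already in order, so no
   follow-up single-operand shuffle is needed.  */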
19610 
19611 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
19612    the permutation using the SSE4_1 pblendv instruction.  Potentially
19613    reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */
19614 
19615 static bool
19616 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
19617 {
19618   unsigned i, which, nelt = d->nelt;
19619   struct expand_vec_perm_d dcopy, dcopy1;
19620   machine_mode vmode = d->vmode;
19621   bool ok;
19622 
19623   /* Use the same checks as in expand_vec_perm_blend.  */
19624   if (d->one_operand_p)
19625     return false;
19626   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19627     ;
19628   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19629     ;
19630   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
19631 			     || GET_MODE_SIZE (vmode) == 8
19632 			     || GET_MODE_SIZE (vmode) == 16))
19633     ;
19634   else
19635     return false;
19636 
19637   /* Figure out which permutation elements do not stay in their
19638      respective lanes.  */
19639   for (i = 0, which = 0; i < nelt; ++i)
19640     {
19641       unsigned e = d->perm[i];
19642       if (e != i)
19643 	which |= (e < nelt ? 1 : 2);
19644     }
19645   /* We can pblend the part where elements do not stay in their
19646      respective lanes only when these elements all come from one
19647      half of the permutation.
19648      {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
19649      lanes but both are >= 8;
19650      {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
19651      respective lanes and 8 >= 8 while 2 is not.  */
19652   if (which != 1 && which != 2)
19653     return false;
19654   if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
19655     return true;
19656 
19657   /* First apply a one-operand permutation to the elements that do
19658      not stay in their respective lanes.  */
19659   dcopy = *d;
19660   if (which == 2)
19661     dcopy.op0 = dcopy.op1 = d->op1;
19662   else
19663     dcopy.op0 = dcopy.op1 = d->op0;
19664   if (!d->testing_p)
19665     dcopy.target = gen_reg_rtx (vmode);
19666   dcopy.one_operand_p = true;
19667 
19668   for (i = 0; i < nelt; ++i)
19669     dcopy.perm[i] = d->perm[i] & (nelt - 1);
19670 
19671   ok = expand_vec_perm_1 (&dcopy);
19672   if (GET_MODE_SIZE (vmode) != 16 && !ok)
19673     return false;
19674   else
19675     gcc_assert (ok);
19676   if (d->testing_p)
19677     return true;
19678 
19679   /* Next we put permuted elements into their positions.  */
19680   dcopy1 = *d;
19681   if (which == 2)
19682     dcopy1.op1 = dcopy.target;
19683   else
19684     dcopy1.op0 = dcopy.target;
19685 
19686   for (i = 0; i < nelt; ++i)
19687     dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
19688 
19689   ok = expand_vec_perm_blend (&dcopy1);
19690   gcc_assert (ok);
19691 
19692   return true;
19693 }
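
/* Added example (illustrative): for the permutation
   { 0, 1, 8, 3, 4, 5, 9, 7 } discussed above (which == 2), the first
   step permutes op1 so that its elements 0 and 1 land in positions 2
   and 6, and the second step blends that result with op0 using the
   lane-preserving selection { 0, 0, 1, 0, 0, 0, 1, 0 }.  */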
19694 
19695 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
19696 
19697 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
19698    a two vector permutation into a single vector permutation by using
19699    an interleave operation to merge the vectors.  */
19700 
19701 static bool
19702 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
19703 {
19704   struct expand_vec_perm_d dremap, dfinal;
19705   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
19706   unsigned HOST_WIDE_INT contents;
19707   unsigned char remap[2 * MAX_VECT_LEN];
19708   rtx_insn *seq;
19709   bool ok, same_halves = false;
19710 
19711   if (GET_MODE_SIZE (d->vmode) == 4
19712       || GET_MODE_SIZE (d->vmode) == 8
19713       || GET_MODE_SIZE (d->vmode) == 16)
19714     {
19715       if (d->one_operand_p)
19716 	return false;
19717     }
19718   else if (GET_MODE_SIZE (d->vmode) == 32)
19719     {
19720       if (!TARGET_AVX)
19721 	return false;
19722       /* For 32-byte modes allow even d->one_operand_p.
19723 	 The lack of cross-lane shuffling in some instructions
19724 	 might prevent a single insn shuffle.  */
19725       dfinal = *d;
19726       dfinal.testing_p = true;
19727       /* If expand_vec_perm_interleave3 can expand this into
19728 	 a 3 insn sequence, give up and let it be expanded as
19729 	 3 insn sequence.  While that is one insn longer,
19730 	 it doesn't need a memory operand and in the common
19731 	 case that both interleave low and high permutations
19732 	 with the same operands are adjacent needs 4 insns
19733 	 for both after CSE.  */
19734       if (expand_vec_perm_interleave3 (&dfinal))
19735 	return false;
19736     }
19737   else
19738     return false;
19739 
19740   /* Examine from whence the elements come.  */
19741   contents = 0;
19742   for (i = 0; i < nelt; ++i)
19743     contents |= HOST_WIDE_INT_1U << d->perm[i];
19744 
19745   memset (remap, 0xff, sizeof (remap));
19746   dremap = *d;
19747 
19748   if (GET_MODE_SIZE (d->vmode) == 4
19749       || GET_MODE_SIZE (d->vmode) == 8)
19750     {
19751       unsigned HOST_WIDE_INT h1, h2, h3, h4;
19752 
19753       /* Split the two input vectors into 4 halves.  */
19754       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19755       h2 = h1 << nelt2;
19756       h3 = h2 << nelt2;
19757       h4 = h3 << nelt2;
19758 
19759       /* If the elements from the low halves use interleave low,
19760 	 and similarly for interleave high.  */
19761       if ((contents & (h1 | h3)) == contents)
19762 	{
19763 	  /* punpckl* */
19764 	  for (i = 0; i < nelt2; ++i)
19765 	    {
19766 	      remap[i] = i * 2;
19767 	      remap[i + nelt] = i * 2 + 1;
19768 	      dremap.perm[i * 2] = i;
19769 	      dremap.perm[i * 2 + 1] = i + nelt;
19770 	    }
19771 	}
19772       else if ((contents & (h2 | h4)) == contents)
19773 	{
19774 	  /* punpckh* */
19775 	  for (i = 0; i < nelt2; ++i)
19776 	    {
19777 	      remap[i + nelt2] = i * 2;
19778 	      remap[i + nelt + nelt2] = i * 2 + 1;
19779 	      dremap.perm[i * 2] = i + nelt2;
19780 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19781 	    }
19782 	}
19783       else
19784 	return false;
19785     }
19786   else if (GET_MODE_SIZE (d->vmode) == 16)
19787     {
19788       unsigned HOST_WIDE_INT h1, h2, h3, h4;
19789 
19790       /* Split the two input vectors into 4 halves.  */
19791       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
19792       h2 = h1 << nelt2;
19793       h3 = h2 << nelt2;
19794       h4 = h3 << nelt2;
19795 
19796       /* If the elements all come from the low halves, use interleave low;
19797 	 similarly for interleave high.  If the elements are from mis-matched
19798 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
19799       if ((contents & (h1 | h3)) == contents)
19800 	{
19801 	  /* punpckl* */
19802 	  for (i = 0; i < nelt2; ++i)
19803 	    {
19804 	      remap[i] = i * 2;
19805 	      remap[i + nelt] = i * 2 + 1;
19806 	      dremap.perm[i * 2] = i;
19807 	      dremap.perm[i * 2 + 1] = i + nelt;
19808 	    }
19809 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
19810 	    dremap.vmode = V4SFmode;
19811 	}
19812       else if ((contents & (h2 | h4)) == contents)
19813 	{
19814 	  /* punpckh* */
19815 	  for (i = 0; i < nelt2; ++i)
19816 	    {
19817 	      remap[i + nelt2] = i * 2;
19818 	      remap[i + nelt + nelt2] = i * 2 + 1;
19819 	      dremap.perm[i * 2] = i + nelt2;
19820 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
19821 	    }
19822 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
19823 	    dremap.vmode = V4SFmode;
19824 	}
19825       else if ((contents & (h1 | h4)) == contents)
19826 	{
19827 	  /* shufps */
19828 	  for (i = 0; i < nelt2; ++i)
19829 	    {
19830 	      remap[i] = i;
19831 	      remap[i + nelt + nelt2] = i + nelt2;
19832 	      dremap.perm[i] = i;
19833 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
19834 	    }
19835 	  if (nelt != 4)
19836 	    {
19837 	      /* shufpd */
19838 	      dremap.vmode = V2DImode;
19839 	      dremap.nelt = 2;
19840 	      dremap.perm[0] = 0;
19841 	      dremap.perm[1] = 3;
19842 	    }
19843 	}
19844       else if ((contents & (h2 | h3)) == contents)
19845 	{
19846 	  /* shufps */
19847 	  for (i = 0; i < nelt2; ++i)
19848 	    {
19849 	      remap[i + nelt2] = i;
19850 	      remap[i + nelt] = i + nelt2;
19851 	      dremap.perm[i] = i + nelt2;
19852 	      dremap.perm[i + nelt2] = i + nelt;
19853 	    }
19854 	  if (nelt != 4)
19855 	    {
19856 	      /* shufpd */
19857 	      dremap.vmode = V2DImode;
19858 	      dremap.nelt = 2;
19859 	      dremap.perm[0] = 1;
19860 	      dremap.perm[1] = 2;
19861 	    }
19862 	}
19863       else
19864 	return false;
19865     }
19866   else
19867     {
19868       unsigned int nelt4 = nelt / 4, nzcnt = 0;
19869       unsigned HOST_WIDE_INT q[8];
19870       unsigned int nonzero_halves[4];
19871 
19872       /* Split the two input vectors into 8 quarters.  */
19873       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
19874       for (i = 1; i < 8; ++i)
19875 	q[i] = q[0] << (nelt4 * i);
19876       for (i = 0; i < 4; ++i)
19877 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
19878 	  {
19879 	    nonzero_halves[nzcnt] = i;
19880 	    ++nzcnt;
19881 	  }
19882 
19883       if (nzcnt == 1)
19884 	{
19885 	  gcc_assert (d->one_operand_p);
19886 	  nonzero_halves[1] = nonzero_halves[0];
19887 	  same_halves = true;
19888 	}
19889       else if (d->one_operand_p)
19890 	{
19891 	  gcc_assert (nonzero_halves[0] == 0);
19892 	  gcc_assert (nonzero_halves[1] == 1);
19893 	}
19894 
19895       if (nzcnt <= 2)
19896 	{
19897 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
19898 	    {
19899 	      /* Attempt to increase the likelihood that dfinal
19900 		 shuffle will be intra-lane.  */
19901 	      std::swap (nonzero_halves[0], nonzero_halves[1]);
19902 	    }
19903 
19904 	  /* vperm2f128 or vperm2i128.  */
19905 	  for (i = 0; i < nelt2; ++i)
19906 	    {
19907 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
19908 	      remap[i + nonzero_halves[0] * nelt2] = i;
19909 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
19910 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
19911 	    }
19912 
19913 	  if (d->vmode != V8SFmode
19914 	      && d->vmode != V4DFmode
19915 	      && d->vmode != V8SImode)
19916 	    {
19917 	      dremap.vmode = V8SImode;
19918 	      dremap.nelt = 8;
19919 	      for (i = 0; i < 4; ++i)
19920 		{
19921 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
19922 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
19923 		}
19924 	    }
19925 	}
19926       else if (d->one_operand_p)
19927 	return false;
19928       else if (TARGET_AVX2
19929 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
19930 	{
19931 	  /* vpunpckl* */
19932 	  for (i = 0; i < nelt4; ++i)
19933 	    {
19934 	      remap[i] = i * 2;
19935 	      remap[i + nelt] = i * 2 + 1;
19936 	      remap[i + nelt2] = i * 2 + nelt2;
19937 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
19938 	      dremap.perm[i * 2] = i;
19939 	      dremap.perm[i * 2 + 1] = i + nelt;
19940 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
19941 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
19942 	    }
19943 	}
19944       else if (TARGET_AVX2
19945 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
19946 	{
19947 	  /* vpunpckh* */
19948 	  for (i = 0; i < nelt4; ++i)
19949 	    {
19950 	      remap[i + nelt4] = i * 2;
19951 	      remap[i + nelt + nelt4] = i * 2 + 1;
19952 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
19953 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
19954 	      dremap.perm[i * 2] = i + nelt4;
19955 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
19956 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
19957 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
19958 	    }
19959 	}
19960       else
19961 	return false;
19962     }
19963 
19964   /* Use the remapping array set up above to move the elements from their
19965      swizzled locations into their final destinations.  */
19966   dfinal = *d;
19967   for (i = 0; i < nelt; ++i)
19968     {
19969       unsigned e = remap[d->perm[i]];
19970       gcc_assert (e < nelt);
19971       /* If same_halves is true, both halves of the remapped vector are the
19972 	 same.  Avoid cross-lane accesses if possible.  */
19973       if (same_halves && i >= nelt2)
19974 	{
19975 	  gcc_assert (e < nelt2);
19976 	  dfinal.perm[i] = e + nelt2;
19977 	}
19978       else
19979 	dfinal.perm[i] = e;
19980     }
19981   if (!d->testing_p)
19982     {
19983       dremap.target = gen_reg_rtx (dremap.vmode);
19984       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
19985     }
19986   dfinal.op1 = dfinal.op0;
19987   dfinal.one_operand_p = true;
19988 
19989   /* Test if the final remap can be done with a single insn.  For V4SFmode or
19990      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
19991   start_sequence ();
19992   ok = expand_vec_perm_1 (&dfinal);
19993   seq = get_insns ();
19994   end_sequence ();
19995 
19996   if (!ok)
19997     return false;
19998 
19999   if (d->testing_p)
20000     return true;
20001 
20002   if (dremap.vmode != dfinal.vmode)
20003     {
20004       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20005       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20006     }
20007 
20008   ok = expand_vec_perm_1 (&dremap);
20009   gcc_assert (ok);
20010 
20011   emit_insn (seq);
20012   return true;
20013 }
20014 
20015 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
20016    a single vector cross-lane permutation into vpermq followed
20017    by any of the single insn permutations.  */
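/* E.g. for V16HImode (nelt4 == 4) each d->perm[i] / 4 names one of the
   four 64-bit quarters of the input; contents[0] collects the quarters
   feeding the low 128-bit half of the result and contents[1] those
   feeding the high half.  Each 128-bit lane of the vpermq result can
   hold only two quarters, hence the "at most two bits set per half"
   test below; dremap is that vpermq and dfinal is the remaining
   in-lane one-operand shuffle.  */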
20018 
20019 static bool
20020 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20021 {
20022   struct expand_vec_perm_d dremap, dfinal;
20023   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20024   unsigned contents[2];
20025   bool ok;
20026 
20027   if (!(TARGET_AVX2
20028 	&& (d->vmode == V32QImode || d->vmode == V16HImode)
20029 	&& d->one_operand_p))
20030     return false;
20031 
20032   contents[0] = 0;
20033   contents[1] = 0;
20034   for (i = 0; i < nelt2; ++i)
20035     {
20036       contents[0] |= 1u << (d->perm[i] / nelt4);
20037       contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20038     }
20039 
20040   for (i = 0; i < 2; ++i)
20041     {
20042       unsigned int cnt = 0;
20043       for (j = 0; j < 4; ++j)
20044 	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20045 	  return false;
20046     }
20047 
20048   if (d->testing_p)
20049     return true;
20050 
20051   dremap = *d;
20052   dremap.vmode = V4DImode;
20053   dremap.nelt = 4;
20054   dremap.target = gen_reg_rtx (V4DImode);
20055   dremap.op0 = gen_lowpart (V4DImode, d->op0);
20056   dremap.op1 = dremap.op0;
20057   dremap.one_operand_p = true;
20058   for (i = 0; i < 2; ++i)
20059     {
20060       unsigned int cnt = 0;
20061       for (j = 0; j < 4; ++j)
20062 	if ((contents[i] & (1u << j)) != 0)
20063 	  dremap.perm[2 * i + cnt++] = j;
20064       for (; cnt < 2; ++cnt)
20065 	dremap.perm[2 * i + cnt] = 0;
20066     }
20067 
20068   dfinal = *d;
20069   dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20070   dfinal.op1 = dfinal.op0;
20071   dfinal.one_operand_p = true;
20072   for (i = 0, j = 0; i < nelt; ++i)
20073     {
20074       if (i == nelt2)
20075 	j = 2;
20076       dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20077       if ((d->perm[i] / nelt4) == dremap.perm[j])
20078 	;
20079       else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20080 	dfinal.perm[i] |= nelt4;
20081       else
20082 	gcc_unreachable ();
20083     }
20084 
20085   ok = expand_vec_perm_1 (&dremap);
20086   gcc_assert (ok);
20087 
20088   ok = expand_vec_perm_1 (&dfinal);
20089   gcc_assert (ok);
20090 
20091   return true;
20092 }
20093 
20094 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20095 
20096 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
20097    a vector permutation using two instructions, vperm2f128 resp.
20098    vperm2i128 followed by any single in-lane permutation.  */
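/* E.g. for V4DFmode with the elements of d->op0 labelled 0123 and those
   of d->op1 labelled 4567, perm == 6 (lane selectors 2 and 1) gives the
   immediate ((6 << 2) | 6) & 0x33 == 0x12, i.e. a vperm2f128 producing
   { 4 5 2 3 }; the loop below then checks whether the remaining shuffle
   of d->op0 (perm < 16) or d->op1 (perm >= 16) against that result can
   be done by a single in-lane insn.  */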
20099 
20100 static bool
20101 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20102 {
20103   struct expand_vec_perm_d dfirst, dsecond;
20104   unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20105   bool ok;
20106 
20107   if (!TARGET_AVX
20108       || GET_MODE_SIZE (d->vmode) != 32
20109       || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20110     return false;
20111 
20112   dsecond = *d;
20113   dsecond.one_operand_p = false;
20114   dsecond.testing_p = true;
20115 
20116   /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20117      immediate.  For perm < 16 the second permutation uses
20118      d->op0 as first operand, for perm >= 16 it uses d->op1
20119      as first operand.  The second operand is the result of
20120      vperm2[fi]128.  */
20121   for (perm = 0; perm < 32; perm++)
20122     {
20123       /* Ignore permutations which do not move anything cross-lane.  */
20124       if (perm < 16)
20125 	{
20126 	  /* The second shuffle for e.g. V4DFmode has
20127 	     0123 and ABCD operands.
20128 	     Ignore AB23, as 23 is already in the second lane
20129 	     of the first operand.  */
20130 	  if ((perm & 0xc) == (1 << 2)) continue;
20131 	  /* And 01CD, as 01 is in the first lane of the first
20132 	     operand.  */
20133 	  if ((perm & 3) == 0) continue;
20134 	  /* And 4567, as then the vperm2[fi]128 doesn't change
20135 	     anything on the original 4567 second operand.  */
20136 	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20137 	}
20138       else
20139 	{
20140 	  /* The second shuffle for e.g. V4DFmode has
20141 	     4567 and ABCD operands.
20142 	     Ignore AB67, as 67 is already in the second lane
20143 	     of the first operand.  */
20144 	  if ((perm & 0xc) == (3 << 2)) continue;
20145 	  /* And 45CD, as 45 is in the first lane of the first
20146 	     operand.  */
20147 	  if ((perm & 3) == 2) continue;
20148 	  /* And 0123, as then the vperm2[fi]128 doesn't change
20149 	     anything on the original 0123 first operand.  */
20150 	  if ((perm & 0xf) == (1 << 2)) continue;
20151 	}
20152 
20153       for (i = 0; i < nelt; i++)
20154 	{
20155 	  j = d->perm[i] / nelt2;
20156 	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20157 	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20158 	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20159 	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
20160 	  else
20161 	    break;
20162 	}
20163 
20164       if (i == nelt)
20165 	{
20166 	  start_sequence ();
20167 	  ok = expand_vec_perm_1 (&dsecond);
20168 	  end_sequence ();
20169 	}
20170       else
20171 	ok = false;
20172 
20173       if (ok)
20174 	{
20175 	  if (d->testing_p)
20176 	    return true;
20177 
20178 	  /* Found a usable second shuffle.  dfirst will be
20179 	     vperm2f128 on d->op0 and d->op1.  */
20180 	  dsecond.testing_p = false;
20181 	  dfirst = *d;
20182 	  dfirst.target = gen_reg_rtx (d->vmode);
20183 	  for (i = 0; i < nelt; i++)
20184 	    dfirst.perm[i] = (i & (nelt2 - 1))
20185 			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20186 
20187 	  canonicalize_perm (&dfirst);
20188 	  ok = expand_vec_perm_1 (&dfirst);
20189 	  gcc_assert (ok);
20190 
20191 	  /* And dsecond is some single insn shuffle, taking
20192 	     d->op0 and result of vperm2f128 (if perm < 16) or
20193 	     d->op1 and result of vperm2f128 (otherwise).  */
20194 	  if (perm >= 16)
20195 	    dsecond.op0 = dsecond.op1;
20196 	  dsecond.op1 = dfirst.target;
20197 
20198 	  ok = expand_vec_perm_1 (&dsecond);
20199 	  gcc_assert (ok);
20200 
20201 	  return true;
20202 	}
20203 
20204       /* For one operand, the only useful vperm2f128 permutation is 0x01
20205 	 aka lanes swap.  */
20206       if (d->one_operand_p)
20207 	return false;
20208     }
20209 
20210   return false;
20211 }
20212 
20213 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
20214    a two vector permutation using 2 intra-lane interleave insns
20215    and cross-lane shuffle for 32-byte vectors.  */
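/* E.g. for V8SImode this matches exactly the full interleaves
   { 0, 8, 1, 9, 2, 10, 3, 11 } (perm[0] == 0, interleave low) and
   { 4, 12, 5, 13, 6, 14, 7, 15 } (perm[0] == nelt / 2, interleave
   high); the vec_interleave_{low,high}* patterns then expand into the
   two intra-lane unpacks plus the cross-lane shuffle.  */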
20216 
20217 static bool
20218 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20219 {
20220   unsigned i, nelt;
20221   rtx (*gen) (rtx, rtx, rtx);
20222 
20223   if (d->one_operand_p)
20224     return false;
20225   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20226     ;
20227   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20228     ;
20229   else
20230     return false;
20231 
20232   nelt = d->nelt;
20233   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20234     return false;
20235   for (i = 0; i < nelt; i += 2)
20236     if (d->perm[i] != d->perm[0] + i / 2
20237 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20238       return false;
20239 
20240   if (d->testing_p)
20241     return true;
20242 
20243   switch (d->vmode)
20244     {
20245     case E_V32QImode:
20246       if (d->perm[0])
20247 	gen = gen_vec_interleave_highv32qi;
20248       else
20249 	gen = gen_vec_interleave_lowv32qi;
20250       break;
20251     case E_V16HImode:
20252       if (d->perm[0])
20253 	gen = gen_vec_interleave_highv16hi;
20254       else
20255 	gen = gen_vec_interleave_lowv16hi;
20256       break;
20257     case E_V8SImode:
20258       if (d->perm[0])
20259 	gen = gen_vec_interleave_highv8si;
20260       else
20261 	gen = gen_vec_interleave_lowv8si;
20262       break;
20263     case E_V4DImode:
20264       if (d->perm[0])
20265 	gen = gen_vec_interleave_highv4di;
20266       else
20267 	gen = gen_vec_interleave_lowv4di;
20268       break;
20269     case E_V8SFmode:
20270       if (d->perm[0])
20271 	gen = gen_vec_interleave_highv8sf;
20272       else
20273 	gen = gen_vec_interleave_lowv8sf;
20274       break;
20275     case E_V4DFmode:
20276       if (d->perm[0])
20277 	gen = gen_vec_interleave_highv4df;
20278       else
20279 	gen = gen_vec_interleave_lowv4df;
20280       break;
20281     default:
20282       gcc_unreachable ();
20283     }
20284 
20285   emit_insn (gen (d->target, d->op0, d->op1));
20286   return true;
20287 }
20288 
20289 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
20290    a single vector permutation using a single intra-lane vector
20291    permutation, vperm2f128 swapping the lanes and vblend* insn blending
20292    the non-swapped and swapped vectors together.  */
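/* E.g. for V4DFmode with d->perm == { 2, 0, 1, 3 }: dfirst becomes the
   in-lane shuffle { 1, 0, 2, 3 } (each requested element stays in its
   source lane), dsecond swaps the two lanes of that result, and the
   final vblendpd with msk == 0x5 picks the swapped copy for positions 0
   and 2, which are exactly the positions that had to cross lanes.  */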
20293 
20294 static bool
20295 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
20296 {
20297   struct expand_vec_perm_d dfirst, dsecond;
20298   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
20299   rtx_insn *seq;
20300   bool ok;
20301   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20302 
20303   if (!TARGET_AVX
20304       || TARGET_AVX2
20305       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20306       || !d->one_operand_p)
20307     return false;
20308 
20309   dfirst = *d;
20310   for (i = 0; i < nelt; i++)
20311     dfirst.perm[i] = 0xff;
20312   for (i = 0, msk = 0; i < nelt; i++)
20313     {
20314       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20315       if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
20316 	return false;
20317       dfirst.perm[j] = d->perm[i];
20318       if (j != i)
20319 	msk |= (1 << i);
20320     }
20321   for (i = 0; i < nelt; i++)
20322     if (dfirst.perm[i] == 0xff)
20323       dfirst.perm[i] = i;
20324 
20325   if (!d->testing_p)
20326     dfirst.target = gen_reg_rtx (dfirst.vmode);
20327 
20328   start_sequence ();
20329   ok = expand_vec_perm_1 (&dfirst);
20330   seq = get_insns ();
20331   end_sequence ();
20332 
20333   if (!ok)
20334     return false;
20335 
20336   if (d->testing_p)
20337     return true;
20338 
20339   emit_insn (seq);
20340 
20341   dsecond = *d;
20342   dsecond.op0 = dfirst.target;
20343   dsecond.op1 = dfirst.target;
20344   dsecond.one_operand_p = true;
20345   dsecond.target = gen_reg_rtx (dsecond.vmode);
20346   for (i = 0; i < nelt; i++)
20347     dsecond.perm[i] = i ^ nelt2;
20348 
20349   ok = expand_vec_perm_1 (&dsecond);
20350   gcc_assert (ok);
20351 
20352   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20353   emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
20354   return true;
20355 }
20356 
20357 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
20358    a two vector permutation using two single vector permutations and
20359    {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
20360    of dfirst or dsecond is identity permutation.  */
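/* E.g. for V4SFmode with d->perm == { 1, 6, 3, 4 }: the even result
   elements come from op0 and the odd ones from op1, so dfirst shuffles
   op0 so that elements 1 and 3 occupy its low half, dsecond shuffles
   op1 so that elements 2 and 0 occupy its low half, and the final
   unpcklps-style vselect/vconcat with dfinal.perm == { 0, 4, 1, 5 }
   interleaves the two intermediate results into the requested order.  */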
20361 
20362 static bool
20363 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
20364 {
20365   unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
20366   struct expand_vec_perm_d dfirst, dsecond, dfinal;
20367   bool ident1 = true, ident2 = true;
20368 
20369   if (d->one_operand_p)
20370     return false;
20371 
20372   if (GET_MODE_SIZE (d->vmode) == 16)
20373     {
20374       if (!TARGET_SSE)
20375 	return false;
20376       if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
20377 	return false;
20378     }
20379   else if (GET_MODE_SIZE (d->vmode) == 32)
20380     {
20381       if (!TARGET_AVX)
20382 	return false;
20383       if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
20384 	return false;
20385       lane = nelt2;
20386     }
20387   else
20388     return false;
20389 
20390   for (i = 1; i < nelt; i++)
20391     if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
20392       return false;
20393 
20394   dfirst = *d;
20395   dsecond = *d;
20396   dfinal = *d;
20397   dfirst.op1 = dfirst.op0;
20398   dfirst.one_operand_p = true;
20399   dsecond.op0 = dsecond.op1;
20400   dsecond.one_operand_p = true;
20401 
20402   for (i = 0; i < nelt; i++)
20403     if (d->perm[i] >= nelt)
20404       {
20405 	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
20406 	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
20407 	  ident2 = false;
20408 	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
20409 	  = d->perm[i] - nelt;
20410       }
20411     else
20412       {
20413 	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
20414 	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
20415 	  ident1 = false;
20416 	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
20417       }
20418 
20419   if (two_insn && !ident1 && !ident2)
20420     return false;
20421 
20422   if (!d->testing_p)
20423     {
20424       if (!ident1)
20425 	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20426       if (!ident2)
20427 	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20428       if (d->perm[0] >= nelt)
20429 	std::swap (dfinal.op0, dfinal.op1);
20430     }
20431 
20432   bool ok;
20433   rtx_insn *seq1 = NULL, *seq2 = NULL;
20434 
20435   if (!ident1)
20436     {
20437       start_sequence ();
20438       ok = expand_vec_perm_1 (&dfirst);
20439       seq1 = get_insns ();
20440       end_sequence ();
20441 
20442       if (!ok)
20443 	return false;
20444     }
20445 
20446   if (!ident2)
20447     {
20448       start_sequence ();
20449       ok = expand_vec_perm_1 (&dsecond);
20450       seq2 = get_insns ();
20451       end_sequence ();
20452 
20453       if (!ok)
20454 	return false;
20455     }
20456 
20457   if (d->testing_p)
20458     return true;
20459 
20460   for (i = 0; i < nelt; i++)
20461     {
20462       dfinal.perm[i] = i / 2;
20463       if (i >= lane)
20464 	dfinal.perm[i] += lane / 2;
20465       if ((i & 1) != 0)
20466 	dfinal.perm[i] += nelt;
20467     }
20468   emit_insn (seq1);
20469   emit_insn (seq2);
20470   ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
20471 			       dfinal.perm, dfinal.nelt, false);
20472   gcc_assert (ok);
20473   return true;
20474 }
20475 
20476 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
20477    the permutation using two single vector permutations and the SSE4_1 pblendv
20478    instruction.  If two_insn, succeed only if one of dfirst or dsecond is
20479    identity permutation.  */
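/* E.g. for V4SImode with d->perm == { 5, 1, 6, 2 }: dfirst permutes op0
   so that element 2 sits in position 3 (position 1 already holds
   element 1), dsecond permutes op1 so that its elements 1 and 2 sit in
   positions 0 and 2, and the final blend with dfinal.perm
   == { 4, 1, 6, 3 } selects dsecond for positions 0 and 2 and dfirst
   for positions 1 and 3.  Since neither half is an identity here, this
   shape is only accepted when two_insn is false.  */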
20480 
20481 static bool
20482 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
20483 {
20484   unsigned i, nelt = d->nelt;
20485   struct expand_vec_perm_d dfirst, dsecond, dfinal;
20486   machine_mode vmode = d->vmode;
20487   bool ident1 = true, ident2 = true;
20488 
20489   /* Use the same checks as in expand_vec_perm_blend.  */
20490   if (d->one_operand_p)
20491     return false;
20492   if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20493     ;
20494   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20495     ;
20496   else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
20497 			     || GET_MODE_SIZE (vmode) == 8
20498 			     || GET_MODE_SIZE (vmode) == 4))
20499     ;
20500   else
20501     return false;
20502 
20503   dfirst = *d;
20504   dsecond = *d;
20505   dfinal = *d;
20506   dfirst.op1 = dfirst.op0;
20507   dfirst.one_operand_p = true;
20508   dsecond.op0 = dsecond.op1;
20509   dsecond.one_operand_p = true;
20510 
20511   for (i = 0; i < nelt; ++i)
20512     if (d->perm[i] >= nelt)
20513       {
20514 	dfirst.perm[i] = 0xff;
20515 	dsecond.perm[i] = d->perm[i] - nelt;
20516 	if (d->perm[i] != i + nelt)
20517 	  ident2 = false;
20518       }
20519     else
20520       {
20521 	dsecond.perm[i] = 0xff;
20522 	dfirst.perm[i] = d->perm[i];
20523 	if (d->perm[i] != i)
20524 	  ident1 = false;
20525       }
20526 
20527   if (two_insn && !ident1 && !ident2)
20528     return false;
20529 
20530   /* For now.  Ideally treat 0xff as a wildcard.  */
20531   for (i = 0; i < nelt; ++i)
20532     if (dfirst.perm[i] == 0xff)
20533       {
20534 	if (GET_MODE_SIZE (vmode) == 32
20535 	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
20536 	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20537 	else
20538 	  dfirst.perm[i] = i;
20539       }
20540     else
20541       {
20542 	if (GET_MODE_SIZE (vmode) == 32
20543 	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
20544 	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20545 	else
20546 	  dsecond.perm[i] = i;
20547       }
20548 
20549   if (!d->testing_p)
20550     {
20551       if (!ident1)
20552 	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20553       if (!ident2)
20554 	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20555     }
20556 
20557   bool ok;
20558   rtx_insn *seq1 = NULL, *seq2 = NULL;
20559 
20560   if (!ident1)
20561     {
20562       start_sequence ();
20563       ok = expand_vec_perm_1 (&dfirst);
20564       seq1 = get_insns ();
20565       end_sequence ();
20566 
20567       if (!ok)
20568 	return false;
20569     }
20570 
20571   if (!ident2)
20572     {
20573       start_sequence ();
20574       ok = expand_vec_perm_1 (&dsecond);
20575       seq2 = get_insns ();
20576       end_sequence ();
20577 
20578       if (!ok)
20579 	return false;
20580     }
20581 
20582   if (d->testing_p)
20583     return true;
20584 
20585   for (i = 0; i < nelt; ++i)
20586     dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
20587 
20588   emit_insn (seq1);
20589   emit_insn (seq2);
20590   ok = expand_vec_perm_blend (&dfinal);
20591   gcc_assert (ok);
20592   return true;
20593 }
20594 
20595 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
20596    permutation using two vperm2f128, followed by a vshufpd insn blending
20597    the two vectors together.  */
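/* E.g. for d->perm == { 2, 4, 1, 7 }: dfirst gathers the 128-bit pairs
   containing elements 2 and 1 into { 2, 3, 0, 1 }, dsecond gathers the
   pairs containing elements 4 and 7 into { 4, 5, 6, 7 }, and dthird
   == { 0, 4, 3, 7 } is a vshufpd picking one double from each
   intermediate per lane, reconstructing { 2, 4, 1, 7 }.  */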
20598 
20599 static bool
20600 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
20601 {
20602   struct expand_vec_perm_d dfirst, dsecond, dthird;
20603   bool ok;
20604 
20605   if (!TARGET_AVX || (d->vmode != V4DFmode))
20606     return false;
20607 
20608   if (d->testing_p)
20609     return true;
20610 
20611   dfirst = *d;
20612   dsecond = *d;
20613   dthird = *d;
20614 
20615   dfirst.perm[0] = (d->perm[0] & ~1);
20616   dfirst.perm[1] = (d->perm[0] & ~1) + 1;
20617   dfirst.perm[2] = (d->perm[2] & ~1);
20618   dfirst.perm[3] = (d->perm[2] & ~1) + 1;
20619   dsecond.perm[0] = (d->perm[1] & ~1);
20620   dsecond.perm[1] = (d->perm[1] & ~1) + 1;
20621   dsecond.perm[2] = (d->perm[3] & ~1);
20622   dsecond.perm[3] = (d->perm[3] & ~1) + 1;
20623   dthird.perm[0] = (d->perm[0] % 2);
20624   dthird.perm[1] = (d->perm[1] % 2) + 4;
20625   dthird.perm[2] = (d->perm[2] % 2) + 2;
20626   dthird.perm[3] = (d->perm[3] % 2) + 6;
20627 
20628   dfirst.target = gen_reg_rtx (dfirst.vmode);
20629   dsecond.target = gen_reg_rtx (dsecond.vmode);
20630   dthird.op0 = dfirst.target;
20631   dthird.op1 = dsecond.target;
20632   dthird.one_operand_p = false;
20633 
20634   canonicalize_perm (&dfirst);
20635   canonicalize_perm (&dsecond);
20636 
20637   ok = expand_vec_perm_1 (&dfirst)
20638        && expand_vec_perm_1 (&dsecond)
20639        && expand_vec_perm_1 (&dthird);
20640 
20641   gcc_assert (ok);
20642 
20643   return true;
20644 }
20645 
20646 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
20647 
20648 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
20649    a two vector permutation using two intra-lane vector
20650    permutations, vperm2f128 swapping the lanes and vblend* insn blending
20651    the non-swapped and swapped vectors together.  */
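/* This is the same lane-swap plus blend trick as
   expand_vec_perm_vperm2f128_vblend above, but with two source
   operands: result positions whose requested element already lies in
   the matching 128-bit lane are collected into dfirst, the remaining
   positions into dsecond; dsecond's result is lane-swapped by dthird
   and the vblendp[sd] mask msk has a bit set for exactly the positions
   that needed the swap.  */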
20652 
20653 static bool
20654 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
20655 {
20656   struct expand_vec_perm_d dfirst, dsecond, dthird;
20657   unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
20658   rtx_insn *seq1, *seq2;
20659   bool ok;
20660   rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20661 
20662   if (!TARGET_AVX
20663       || TARGET_AVX2
20664       || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20665       || d->one_operand_p)
20666     return false;
20667 
20668   dfirst = *d;
20669   dsecond = *d;
20670   for (i = 0; i < nelt; i++)
20671     {
20672       dfirst.perm[i] = 0xff;
20673       dsecond.perm[i] = 0xff;
20674     }
20675   for (i = 0, msk = 0; i < nelt; i++)
20676     {
20677       j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20678       if (j == i)
20679 	{
20680 	  dfirst.perm[j] = d->perm[i];
20681 	  which1 |= (d->perm[i] < nelt ? 1 : 2);
20682 	}
20683       else
20684 	{
20685 	  dsecond.perm[j] = d->perm[i];
20686 	  which2 |= (d->perm[i] < nelt ? 1 : 2);
20687 	  msk |= (1U << i);
20688 	}
20689     }
20690   if (msk == 0 || msk == (1U << nelt) - 1)
20691     return false;
20692 
20693   if (!d->testing_p)
20694     {
20695       dfirst.target = gen_reg_rtx (dfirst.vmode);
20696       dsecond.target = gen_reg_rtx (dsecond.vmode);
20697     }
20698 
20699   for (i = 0; i < nelt; i++)
20700     {
20701       if (dfirst.perm[i] == 0xff)
20702 	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
20703       if (dsecond.perm[i] == 0xff)
20704 	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
20705     }
20706   canonicalize_perm (&dfirst);
20707   start_sequence ();
20708   ok = ix86_expand_vec_perm_const_1 (&dfirst);
20709   seq1 = get_insns ();
20710   end_sequence ();
20711 
20712   if (!ok)
20713     return false;
20714 
20715   canonicalize_perm (&dsecond);
20716   start_sequence ();
20717   ok = ix86_expand_vec_perm_const_1 (&dsecond);
20718   seq2 = get_insns ();
20719   end_sequence ();
20720 
20721   if (!ok)
20722     return false;
20723 
20724   if (d->testing_p)
20725     return true;
20726 
20727   emit_insn (seq1);
20728   emit_insn (seq2);
20729 
20730   dthird = *d;
20731   dthird.op0 = dsecond.target;
20732   dthird.op1 = dsecond.target;
20733   dthird.one_operand_p = true;
20734   dthird.target = gen_reg_rtx (dthird.vmode);
20735   for (i = 0; i < nelt; i++)
20736     dthird.perm[i] = i ^ nelt2;
20737 
20738   ok = expand_vec_perm_1 (&dthird);
20739   gcc_assert (ok);
20740 
20741   blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20742   emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
20743   return true;
20744 }
20745 
20746 /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
20747    permutation with two pshufb insns and an ior.  We should have already
20748    failed all two instruction sequences.  */
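/* E.g. for V16QImode with d->perm[0] == 18: element 18 is element 2 of
   d->op1, so byte 0 of the mask applied to op0 is set to -128 (force a
   zero) while byte 0 of the mask applied to op1 is set to 2; the two
   pshufb results therefore have disjoint nonzero lanes and the final
   ior combines them.  */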
20749 
20750 static bool
20751 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
20752 {
20753   rtx rperm[2][16], vperm, l, h, op, m128;
20754   unsigned int i, nelt, eltsz;
20755   machine_mode mode;
20756   rtx (*gen) (rtx, rtx, rtx);
20757 
20758   if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
20759 			&& GET_MODE_SIZE (d->vmode) != 8
20760 			&& GET_MODE_SIZE (d->vmode) != 4))
20761     return false;
20762   gcc_assert (!d->one_operand_p);
20763 
20764   if (d->testing_p)
20765     return true;
20766 
20767   switch (GET_MODE_SIZE (d->vmode))
20768     {
20769     case 4:
20770       mode = V4QImode;
20771       gen = gen_mmx_pshufbv4qi3;
20772       break;
20773     case 8:
20774       mode = V8QImode;
20775       gen = gen_mmx_pshufbv8qi3;
20776       break;
20777     case 16:
20778       mode = V16QImode;
20779       gen = gen_ssse3_pshufbv16qi3;
20780       break;
20781     default:
20782       gcc_unreachable ();
20783     }
20784 
20785   nelt = d->nelt;
20786   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20787 
20788   /* Generate two permutation masks.  If the required element is within
20789      the given vector it is shuffled into the proper lane.  If the required
20790      element is in the other vector, force a zero into the lane by setting
20791      bit 7 in the permutation mask.  */
20792   m128 = GEN_INT (-128);
20793   for (i = 0; i < nelt; ++i)
20794     {
20795       unsigned j, k, e = d->perm[i];
20796       unsigned which = (e >= nelt);
20797       if (e >= nelt)
20798 	e -= nelt;
20799 
20800       for (j = 0; j < eltsz; ++j)
20801 	{
20802 	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
20803 	  rperm[1-which][i*eltsz + j] = m128;
20804 	}
20805 
20806       for (k = i*eltsz + j; k < 16; ++k)
20807 	rperm[0][k] = rperm[1][k] = m128;
20808     }
20809 
20810   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
20811   vperm = force_reg (V16QImode, vperm);
20812 
20813   l = gen_reg_rtx (mode);
20814   op = gen_lowpart (mode, d->op0);
20815   emit_insn (gen (l, op, vperm));
20816 
20817   vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
20818   vperm = force_reg (V16QImode, vperm);
20819 
20820   h = gen_reg_rtx (mode);
20821   op = gen_lowpart (mode, d->op1);
20822   emit_insn (gen (h, op, vperm));
20823 
20824   op = d->target;
20825   if (d->vmode != mode)
20826     op = gen_reg_rtx (mode);
20827   ix86_emit_vec_binop (IOR, mode, op, l, h);
20828   if (op != d->target)
20829     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20830 
20831   return true;
20832 }
20833 
20834 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
20835    with two vpshufb insns, vpermq and vpor.  We should have already failed
20836    all two or three instruction sequences.  */
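/* E.g. for V32QImode, a byte requested from the other 128-bit lane
   cannot be fetched by vpshufb directly (it only shuffles within each
   lane), so the "h" mask places it at the mirrored position within its
   own lane and the "l" mask zeroes that position; the vpermq then swaps
   the two 128-bit halves of h, after which the byte sits in the correct
   lane and position, and the final vpor merges l with the swapped
   copy.  */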
20837 
20838 static bool
20839 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
20840 {
20841   rtx rperm[2][32], vperm, l, h, hp, op, m128;
20842   unsigned int i, nelt, eltsz;
20843 
20844   if (!TARGET_AVX2
20845       || !d->one_operand_p
20846       || (d->vmode != V32QImode && d->vmode != V16HImode))
20847     return false;
20848 
20849   if (d->testing_p)
20850     return true;
20851 
20852   nelt = d->nelt;
20853   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20854 
20855   /* Generate two permutation masks.  If the required element is within
20856      the same lane, it is shuffled in.  If the required element is from
20857      the other lane, force a zero by setting bit 7 in the permutation
20858      mask.  The other mask has non-negative elements where an element is
20859      requested from the other lane, but it is also moved to the other
20860      lane, so that the result of vpshufb can have the two V2TImode halves
20861      swapped.  */
20862   m128 = GEN_INT (-128);
20863   for (i = 0; i < nelt; ++i)
20864     {
20865       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20866       unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
20867 
20868       for (j = 0; j < eltsz; ++j)
20869 	{
20870 	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
20871 	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
20872 	}
20873     }
20874 
20875   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
20876   vperm = force_reg (V32QImode, vperm);
20877 
20878   h = gen_reg_rtx (V32QImode);
20879   op = gen_lowpart (V32QImode, d->op0);
20880   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
20881 
20882   /* Swap the 128-bit lanes of h into hp.  */
20883   hp = gen_reg_rtx (V4DImode);
20884   op = gen_lowpart (V4DImode, h);
20885   emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
20886 				  const1_rtx));
20887 
20888   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
20889   vperm = force_reg (V32QImode, vperm);
20890 
20891   l = gen_reg_rtx (V32QImode);
20892   op = gen_lowpart (V32QImode, d->op0);
20893   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
20894 
20895   op = d->target;
20896   if (d->vmode != V32QImode)
20897     op = gen_reg_rtx (V32QImode);
20898   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
20899   if (op != d->target)
20900     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20901 
20902   return true;
20903 }
20904 
20905 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
20906    and extract-odd permutations of two V32QImode or V16HImode operands
20907    with two vpshufb insns, vpor and vpermq.  We should have already
20908    failed all two or three instruction sequences.  */
20909 
20910 static bool
20911 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
20912 {
20913   rtx rperm[2][32], vperm, l, h, ior, op, m128;
20914   unsigned int i, nelt, eltsz;
20915 
20916   if (!TARGET_AVX2
20917       || d->one_operand_p
20918       || (d->vmode != V32QImode && d->vmode != V16HImode))
20919     return false;
20920 
20921   for (i = 0; i < d->nelt; ++i)
20922     if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
20923       return false;
20924 
20925   if (d->testing_p)
20926     return true;
20927 
20928   nelt = d->nelt;
20929   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20930 
20931   /* Generate two permutation masks.  In the first permutation mask
20932      the first quarter will contain indexes for the first half
20933      of the op0, the second quarter will contain bit 7 set, third quarter
20934      will contain indexes for the second half of the op0 and the
20935      last quarter bit 7 set.  In the second permutation mask
20936      the first quarter will contain bit 7 set, the second quarter
20937      indexes for the first half of the op1, the third quarter bit 7 set
20938      and last quarter indexes for the second half of the op1.
20939      I.e. the first mask e.g. for V32QImode extract even will be:
20940      0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
20941      (all values masked with 0xf except for -128) and second mask
20942      for extract even will be
20943      -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
20944   m128 = GEN_INT (-128);
20945   for (i = 0; i < nelt; ++i)
20946     {
20947       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20948       unsigned which = d->perm[i] >= nelt;
20949       unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
20950 
20951       for (j = 0; j < eltsz; ++j)
20952 	{
20953 	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
20954 	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
20955 	}
20956     }
20957 
20958   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
20959   vperm = force_reg (V32QImode, vperm);
20960 
20961   l = gen_reg_rtx (V32QImode);
20962   op = gen_lowpart (V32QImode, d->op0);
20963   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
20964 
20965   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
20966   vperm = force_reg (V32QImode, vperm);
20967 
20968   h = gen_reg_rtx (V32QImode);
20969   op = gen_lowpart (V32QImode, d->op1);
20970   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
20971 
20972   ior = gen_reg_rtx (V32QImode);
20973   emit_insn (gen_iorv32qi3 (ior, l, h));
20974 
20975   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
20976   op = gen_reg_rtx (V4DImode);
20977   ior = gen_lowpart (V4DImode, ior);
20978   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
20979 				  const1_rtx, GEN_INT (3)));
20980   emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20981 
20982   return true;
20983 }
20984 
20985 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
20986    and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
20987    operands with two "and" and "pack" or two "shift" and "pack" insns.
20988    We should have already failed all two instruction sequences.  */
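/* E.g. extract-even on V16QImode: the even bytes are the low byte of
   each word, so masking both operands with 0x00ff per word and doing a
   packuswb produces them directly; extract-odd instead shifts each word
   right by 8 before the pack.  For the 256-bit AVX2 modes the pack
   works per 128-bit lane, hence the trailing vpermq fixing the quarter
   order.  */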
20989 
20990 static bool
20991 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
20992 {
20993   rtx op, dop0, dop1, t;
20994   unsigned i, odd, c, s, nelt = d->nelt;
20995   bool end_perm = false;
20996   machine_mode half_mode;
20997   rtx (*gen_and) (rtx, rtx, rtx);
20998   rtx (*gen_pack) (rtx, rtx, rtx);
20999   rtx (*gen_shift) (rtx, rtx, rtx);
21000 
21001   if (d->one_operand_p)
21002     return false;
21003 
21004   switch (d->vmode)
21005     {
21006     case E_V4HImode:
21007       /* Required for "pack".  */
21008       if (!TARGET_SSE4_1)
21009 	return false;
21010       c = 0xffff;
21011       s = 16;
21012       half_mode = V2SImode;
21013       gen_and = gen_andv2si3;
21014       gen_pack = gen_mmx_packusdw;
21015       gen_shift = gen_lshrv2si3;
21016       break;
21017     case E_V8HImode:
21018       /* Required for "pack".  */
21019       if (!TARGET_SSE4_1)
21020         return false;
21021       c = 0xffff;
21022       s = 16;
21023       half_mode = V4SImode;
21024       gen_and = gen_andv4si3;
21025       gen_pack = gen_sse4_1_packusdw;
21026       gen_shift = gen_lshrv4si3;
21027       break;
21028     case E_V8QImode:
21029       /* No check as all instructions are SSE2.  */
21030       c = 0xff;
21031       s = 8;
21032       half_mode = V4HImode;
21033       gen_and = gen_andv4hi3;
21034       gen_pack = gen_mmx_packuswb;
21035       gen_shift = gen_lshrv4hi3;
21036       break;
21037     case E_V16QImode:
21038       /* No check as all instructions are SSE2.  */
21039       c = 0xff;
21040       s = 8;
21041       half_mode = V8HImode;
21042       gen_and = gen_andv8hi3;
21043       gen_pack = gen_sse2_packuswb;
21044       gen_shift = gen_lshrv8hi3;
21045       break;
21046     case E_V16HImode:
21047       if (!TARGET_AVX2)
21048         return false;
21049       c = 0xffff;
21050       s = 16;
21051       half_mode = V8SImode;
21052       gen_and = gen_andv8si3;
21053       gen_pack = gen_avx2_packusdw;
21054       gen_shift = gen_lshrv8si3;
21055       end_perm = true;
21056       break;
21057     case E_V32QImode:
21058       if (!TARGET_AVX2)
21059         return false;
21060       c = 0xff;
21061       s = 8;
21062       half_mode = V16HImode;
21063       gen_and = gen_andv16hi3;
21064       gen_pack = gen_avx2_packuswb;
21065       gen_shift = gen_lshrv16hi3;
21066       end_perm = true;
21067       break;
21068     default:
21069       /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
21070 	 are more profitable than general shuffles.  */
21071       return false;
21072     }
21073 
21074   /* Check that permutation is even or odd.  */
21075   odd = d->perm[0];
21076   if (odd > 1)
21077     return false;
21078 
21079   for (i = 1; i < nelt; ++i)
21080     if (d->perm[i] != 2 * i + odd)
21081       return false;
21082 
21083   if (d->testing_p)
21084     return true;
21085 
21086   dop0 = gen_reg_rtx (half_mode);
21087   dop1 = gen_reg_rtx (half_mode);
21088   if (odd == 0)
21089     {
21090       t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21091       t = force_reg (half_mode, t);
21092       emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21093       emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21094     }
21095   else
21096     {
21097       emit_insn (gen_shift (dop0,
21098 			    gen_lowpart (half_mode, d->op0),
21099 			    GEN_INT (s)));
21100       emit_insn (gen_shift (dop1,
21101 			    gen_lowpart (half_mode, d->op1),
21102 			    GEN_INT (s)));
21103     }
21104   /* In the AVX2 256-bit case we need to permute the pack result.  */
21105   if (TARGET_AVX2 && end_perm)
21106     {
21107       op = gen_reg_rtx (d->vmode);
21108       t = gen_reg_rtx (V4DImode);
21109       emit_insn (gen_pack (op, dop0, dop1));
21110       emit_insn (gen_avx2_permv4di_1 (t,
21111 				      gen_lowpart (V4DImode, op),
21112 				      const0_rtx,
21113 				      const2_rtx,
21114 				      const1_rtx,
21115 				      GEN_INT (3)));
21116       emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21117     }
21118   else
21119     emit_insn (gen_pack (d->target, dop0, dop1));
21120 
21121   return true;
21122 }
21123 
21124 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
21125    and extract-odd permutations of two V64QI operands
21126    with two "shifts", two "truncs" and one "concat" insn for "odd",
21127    and two "truncs" and one "concat" insn for "even".
21128    We should have already failed all two instruction sequences.  */
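/* E.g. extract-even of two V64QImode vectors: viewing each operand as
   V32HImode, vpmovwb (truncatev32hiv32qi) keeps exactly the low byte of
   every word, i.e. the even bytes; concatenating the two truncated
   halves yields the result.  For extract-odd the words are first
   shifted right by 8 so that the odd bytes become the low bytes.  */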
21129 
21130 static bool
21131 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21132 {
21133   rtx t1, t2, t3, t4;
21134   unsigned i, odd, nelt = d->nelt;
21135 
21136   if (!TARGET_AVX512BW
21137       || d->one_operand_p
21138       || d->vmode != V64QImode)
21139     return false;
21140 
21141   /* Check that permutation is even or odd.  */
21142   odd = d->perm[0];
21143   if (odd > 1)
21144     return false;
21145 
21146   for (i = 1; i < nelt; ++i)
21147     if (d->perm[i] != 2 * i + odd)
21148       return false;
21149 
21150   if (d->testing_p)
21151     return true;
21152 
21153 
21154   if (odd)
21155     {
21156       t1 = gen_reg_rtx (V32HImode);
21157       t2 = gen_reg_rtx (V32HImode);
21158       emit_insn (gen_lshrv32hi3 (t1,
21159 				 gen_lowpart (V32HImode, d->op0),
21160 				 GEN_INT (8)));
21161       emit_insn (gen_lshrv32hi3 (t2,
21162 				 gen_lowpart (V32HImode, d->op1),
21163 				 GEN_INT (8)));
21164     }
21165   else
21166     {
21167       t1 = gen_lowpart (V32HImode, d->op0);
21168       t2 = gen_lowpart (V32HImode, d->op1);
21169     }
21170 
21171   t3 = gen_reg_rtx (V32QImode);
21172   t4 = gen_reg_rtx (V32QImode);
21173   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21174   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21175   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21176 
21177   return true;
21178 }
21179 
21180 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
21181    and extract-odd permutations.  */
21182 
21183 static bool
21184 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
21185 {
21186   rtx t1, t2, t3, t4, t5;
21187 
21188   switch (d->vmode)
21189     {
21190     case E_V4DFmode:
21191       if (d->testing_p)
21192 	break;
21193       t1 = gen_reg_rtx (V4DFmode);
21194       t2 = gen_reg_rtx (V4DFmode);
21195 
21196       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
21197       emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
21198       emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
21199 
21200       /* Now an unpck[lh]pd will produce the result required.  */
21201       if (odd)
21202 	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
21203       else
21204 	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
21205       emit_insn (t3);
21206       break;
21207 
21208     case E_V8SFmode:
21209       {
21210 	int mask = odd ? 0xdd : 0x88;
21211 
21212 	if (d->testing_p)
21213 	  break;
21214 	t1 = gen_reg_rtx (V8SFmode);
21215 	t2 = gen_reg_rtx (V8SFmode);
21216 	t3 = gen_reg_rtx (V8SFmode);
21217 
21218 	/* Shuffle within the 128-bit lanes to produce:
21219 	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
21220 	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
21221 				      GEN_INT (mask)));
21222 
21223 	/* Shuffle the lanes around to produce:
21224 	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
21225 	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
21226 					    GEN_INT (0x3)));
21227 
21228 	/* Shuffle within the 128-bit lanes to produce:
21229 	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
21230 	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
21231 
21232 	/* Shuffle within the 128-bit lanes to produce:
21233 	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
21234 	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
21235 
21236 	/* Shuffle the lanes around to produce:
21237 	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
21238 	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
21239 					    GEN_INT (0x20)));
21240       }
21241       break;
21242 
21243     case E_V2DFmode:
21244     case E_V4SFmode:
21245     case E_V2DImode:
21246     case E_V2SImode:
21247     case E_V4SImode:
21248     case E_V2HImode:
21249       /* These are always directly implementable by expand_vec_perm_1.  */
21250       gcc_unreachable ();
21251 
21252     case E_V2SFmode:
21253       gcc_assert (TARGET_MMX_WITH_SSE);
21254       /* We have no suitable instructions.  */
21255       if (d->testing_p)
21256 	return false;
21257       break;
21258 
21259     case E_V4QImode:
21260       if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21261 	return expand_vec_perm_pshufb2 (d);
21262       else
21263 	{
21264 	  if (d->testing_p)
21265 	    break;
21266 	  /* We need 2*log2(N)-1 operations to achieve odd/even
21267 	     with interleave. */
21268 	  t1 = gen_reg_rtx (V4QImode);
21269 	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
21270 	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
21271 	  if (odd)
21272 	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
21273 	  else
21274 	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
21275 	  emit_insn (t2);
21276 	}
21277       break;
21278 
21279     case E_V4HImode:
21280       if (TARGET_SSE4_1)
21281 	return expand_vec_perm_even_odd_pack (d);
21282       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21283 	return expand_vec_perm_pshufb2 (d);
21284       else
21285 	{
21286 	  if (d->testing_p)
21287 	    break;
21288 	  /* We need 2*log2(N)-1 operations to achieve odd/even
21289 	     with interleave. */
21290 	  t1 = gen_reg_rtx (V4HImode);
21291 	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
21292 	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
21293 	  if (odd)
21294 	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
21295 	  else
21296 	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
21297 	  emit_insn (t2);
21298 	}
21299       break;
21300 
21301     case E_V8HImode:
21302       if (TARGET_SSE4_1)
21303 	return expand_vec_perm_even_odd_pack (d);
21304       else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21305 	return expand_vec_perm_pshufb2 (d);
21306       else
21307 	{
21308 	  if (d->testing_p)
21309 	    break;
21310 	  /* We need 2*log2(N)-1 operations to achieve odd/even
21311 	     with interleave. */
21312 	  t1 = gen_reg_rtx (V8HImode);
21313 	  t2 = gen_reg_rtx (V8HImode);
21314 	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
21315 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
21316 	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
21317 	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
21318 	  if (odd)
21319 	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
21320 	  else
21321 	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
21322 	  emit_insn (t3);
21323 	}
21324       break;
21325 
21326     case E_V8QImode:
21327     case E_V16QImode:
21328       return expand_vec_perm_even_odd_pack (d);
21329 
21330     case E_V16HImode:
21331     case E_V32QImode:
21332       return expand_vec_perm_even_odd_pack (d);
21333 
21334     case E_V64QImode:
21335       return expand_vec_perm_even_odd_trunc (d);
21336 
21337     case E_V4DImode:
21338       if (!TARGET_AVX2)
21339 	{
21340 	  struct expand_vec_perm_d d_copy = *d;
21341 	  d_copy.vmode = V4DFmode;
21342 	  if (d->testing_p)
21343 	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
21344 	  else
21345 	    d_copy.target = gen_reg_rtx (V4DFmode);
21346 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
21347 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
21348 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21349 	    {
21350 	      if (!d->testing_p)
21351 		emit_move_insn (d->target,
21352 				gen_lowpart (V4DImode, d_copy.target));
21353 	      return true;
21354 	    }
21355 	  return false;
21356 	}
21357 
21358       if (d->testing_p)
21359 	break;
21360 
21361       t1 = gen_reg_rtx (V4DImode);
21362       t2 = gen_reg_rtx (V4DImode);
21363 
21364       /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
21365       emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
21366       emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
21367 
21368       /* Now a vpunpck[lh]qdq will produce the result required.  */
21369       if (odd)
21370 	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
21371       else
21372 	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
21373       emit_insn (t3);
21374       break;
21375 
21376     case E_V8SImode:
21377       if (!TARGET_AVX2)
21378 	{
21379 	  struct expand_vec_perm_d d_copy = *d;
21380 	  d_copy.vmode = V8SFmode;
21381 	  if (d->testing_p)
21382 	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
21383 	  else
21384 	    d_copy.target = gen_reg_rtx (V8SFmode);
21385 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
21386 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
21387 	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21388 	    {
21389 	      if (!d->testing_p)
21390 		emit_move_insn (d->target,
21391 				gen_lowpart (V8SImode, d_copy.target));
21392 	      return true;
21393 	    }
21394 	  return false;
21395 	}
21396 
21397       if (d->testing_p)
21398 	break;
21399 
21400       t1 = gen_reg_rtx (V8SImode);
21401       t2 = gen_reg_rtx (V8SImode);
21402       t3 = gen_reg_rtx (V4DImode);
21403       t4 = gen_reg_rtx (V4DImode);
21404       t5 = gen_reg_rtx (V4DImode);
21405 
21406       /* Shuffle the lanes around into
21407 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
21408       emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
21409 				    gen_lowpart (V4DImode, d->op1),
21410 				    GEN_INT (0x20)));
21411       emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
21412 				    gen_lowpart (V4DImode, d->op1),
21413 				    GEN_INT (0x31)));
21414 
21415       /* Swap the 2nd and 3rd position in each lane into
21416 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
21417       emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
21418 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21419       emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
21420 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21421 
21422       /* Now a vpunpck[lh]qdq will produce
21423 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
21424       if (odd)
21425 	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
21426 					   gen_lowpart (V4DImode, t2));
21427       else
21428 	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
21429 					  gen_lowpart (V4DImode, t2));
21430       emit_insn (t3);
21431       emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
21432       break;
21433 
21434     default:
21435       gcc_unreachable ();
21436     }
21437 
21438   return true;
21439 }
21440 
21441 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
21442    extract-even and extract-odd permutations.  */
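/* E.g. for V4DFmode the only permutations matched here are
   { 0, 2, 4, 6 } (extract even) and { 1, 3, 5, 7 } (extract odd).  */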
21443 
21444 static bool
21445 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
21446 {
21447   unsigned i, odd, nelt = d->nelt;
21448 
21449   odd = d->perm[0];
21450   if (odd != 0 && odd != 1)
21451     return false;
21452 
21453   for (i = 1; i < nelt; ++i)
21454     if (d->perm[i] != 2 * i + odd)
21455       return false;
21456 
21457   if (d->vmode == E_V32HImode
21458       && d->testing_p
21459       && !TARGET_AVX512BW)
21460     return false;
21461 
21462   return expand_vec_perm_even_odd_1 (d, odd);
21463 }
21464 
21465 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
21466    permutations.  We assume that expand_vec_perm_1 has already failed.  */
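/* E.g. broadcasting element 5 of a V16QImode vector: interleaving the
   vector with itself duplicates each byte into a word, doing it again
   duplicates each word into a dword, and once the value is viewed as
   V4SImode a single pshufd replicates that dword across the whole
   vector.  */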
21467 
21468 static bool
21469 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
21470 {
21471   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
21472   machine_mode vmode = d->vmode;
21473   rtx (*gen) (rtx, rtx, rtx);
21474   unsigned char perm2[4];
21475   rtx op0 = d->op0, dest;
21476   bool ok;
21477 
21478   switch (vmode)
21479     {
21480     case E_V4DFmode:
21481     case E_V8SFmode:
21482       /* These are special-cased in sse.md so that we can optionally
21483 	 use the vbroadcast instruction.  They expand to two insns
21484 	 if the input happens to be in a register.  */
21485       gcc_unreachable ();
21486 
21487     case E_V2DFmode:
21488     case E_V2SFmode:
21489     case E_V4SFmode:
21490     case E_V2DImode:
21491     case E_V2SImode:
21492     case E_V4SImode:
21493     case E_V2HImode:
21494     case E_V4HImode:
21495       /* These are always implementable using standard shuffle patterns.  */
21496       gcc_unreachable ();
21497 
21498     case E_V4QImode:
21499       /* This can be implemented via interleave and pshuflw.  */
21500       if (d->testing_p)
21501 	return true;
21502 
21503       if (elt >= nelt2)
21504 	{
21505 	  gen = gen_mmx_punpckhbw_low;
21506 	  elt -= nelt2;
21507 	}
21508       else
21509 	gen = gen_mmx_punpcklbw_low;
21510 
21511       dest = gen_reg_rtx (vmode);
21512       emit_insn (gen (dest, op0, op0));
21513       vmode = get_mode_wider_vector (vmode);
21514       op0 = gen_lowpart (vmode, dest);
21515 
21516       memset (perm2, elt, 2);
21517       dest = gen_reg_rtx (vmode);
21518       ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21519       gcc_assert (ok);
21520 
21521       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21522       return true;
21523 
21524     case E_V8QImode:
21525       /* This can be implemented via interleave.  We save one insn by
21526 	 stopping once we have promoted to V2SImode and then use pshufd.  */
21527       if (d->testing_p)
21528 	return true;
21529       do
21530 	{
21531 	  if (elt >= nelt2)
21532 	    {
21533 	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
21534 				      : gen_mmx_punpckhwd;
21535 	      elt -= nelt2;
21536 	    }
21537 	  else
21538 	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
21539 				    : gen_mmx_punpcklwd;
21540 	  nelt2 /= 2;
21541 
21542 	  dest = gen_reg_rtx (vmode);
21543 	  emit_insn (gen (dest, op0, op0));
21544 	  vmode = get_mode_wider_vector (vmode);
21545 	  op0 = gen_lowpart (vmode, dest);
21546 	}
21547       while (vmode != V2SImode);
21548 
21549       memset (perm2, elt, 2);
21550       dest = gen_reg_rtx (vmode);
21551       ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
21552       gcc_assert (ok);
21553 
21554       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21555       return true;
21556 
21557     case E_V8HImode:
21558     case E_V16QImode:
21559       /* These can be implemented via interleave.  We save one insn by
21560 	 stopping once we have promoted to V4SImode and then use pshufd.  */
21561       if (d->testing_p)
21562 	return true;
21563       do
21564 	{
21565 	  if (elt >= nelt2)
21566 	    {
21567 	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
21568 				       : gen_vec_interleave_highv8hi;
21569 	      elt -= nelt2;
21570 	    }
21571 	  else
21572 	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
21573 				     : gen_vec_interleave_lowv8hi;
21574 	  nelt2 /= 2;
21575 
21576 	  dest = gen_reg_rtx (vmode);
21577 	  emit_insn (gen (dest, op0, op0));
21578 	  vmode = get_mode_wider_vector (vmode);
21579 	  op0 = gen_lowpart (vmode, dest);
21580 	}
21581       while (vmode != V4SImode);
21582 
21583       memset (perm2, elt, 4);
21584       dest = gen_reg_rtx (vmode);
21585       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
21586       gcc_assert (ok);
21587 
21588       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21589       return true;
21590 
21591     case E_V8HFmode:
21592       /* This can be implemented via interleave and pshufd.  */
21593       if (d->testing_p)
21594 	return true;
21595 
21596       if (elt >= nelt2)
21597 	{
21598 	  gen = gen_vec_interleave_highv8hf;
21599 	  elt -= nelt2;
21600 	}
21601       else
21602 	gen = gen_vec_interleave_lowv8hf;
21603       nelt2 /= 2;
21604 
21605       dest = gen_reg_rtx (vmode);
21606       emit_insn (gen (dest, op0, op0));
21607 
21608       vmode = V4SImode;
21609       op0 = gen_lowpart (vmode, dest);
21610 
21611       memset (perm2, elt, 4);
21612       dest = gen_reg_rtx (vmode);
21613       ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
21614       gcc_assert (ok);
21615 
21616       emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
21617       return true;
21618 
21619     case E_V32QImode:
21620     case E_V16HImode:
21621     case E_V8SImode:
21622     case E_V4DImode:
21623       /* For AVX2 broadcasts of the first element vpbroadcast* or
21624 	 vpermq should be used by expand_vec_perm_1.  */
21625       gcc_assert (!TARGET_AVX2 || d->perm[0]);
21626       return false;
21627 
21628     case E_V64QImode:
21629       gcc_assert (!TARGET_AVX512BW || d->perm[0]);
21630       return false;
21631 
21632     case E_V32HImode:
21633       gcc_assert (!TARGET_AVX512BW);
21634       return false;
21635 
21636     default:
21637       gcc_unreachable ();
21638     }
21639 }
21640 
21641 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
21642    broadcast permutations.  */
21643 
21644 static bool
21645 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
21646 {
21647   unsigned i, elt, nelt = d->nelt;
21648 
21649   if (!d->one_operand_p)
21650     return false;
21651 
21652   elt = d->perm[0];
21653   for (i = 1; i < nelt; ++i)
21654     if (d->perm[i] != elt)
21655       return false;
21656 
21657   return expand_vec_perm_broadcast_1 (d);
21658 }
21659 
21660 /* Implement arbitrary permutations of two V64QImode operands
21661    with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */
21662 static bool
21663 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
21664 {
21665   if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
21666     return false;
21667 
21668   if (d->testing_p)
21669     return true;
21670 
21671   struct expand_vec_perm_d ds[2];
21672   rtx rperm[128], vperm, target0, target1;
21673   unsigned int i, nelt;
21674   machine_mode vmode;
21675 
21676   nelt = d->nelt;
21677   vmode = V64QImode;
21678 
21679   for (i = 0; i < 2; i++)
21680     {
21681       ds[i] = *d;
21682       ds[i].vmode = V32HImode;
21683       ds[i].nelt = 32;
21684       ds[i].target = gen_reg_rtx (V32HImode);
21685       ds[i].op0 = gen_lowpart (V32HImode, d->op0);
21686       ds[i].op1 = gen_lowpart (V32HImode, d->op1);
21687     }
21688 
21689   /* Prepare permutations such that the first one takes care of
21690      putting the even bytes into the right positions or one position
21691      higher (ds[0]) and the second one takes care of
21692      putting the odd bytes into the right positions or one position
21693      below (ds[1]).  */
21694 
21695   for (i = 0; i < nelt; i++)
21696     {
21697       ds[i & 1].perm[i / 2] = d->perm[i] / 2;
21698       if (i & 1)
21699 	{
21700 	  rperm[i] = constm1_rtx;
21701 	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21702 	}
21703       else
21704 	{
21705 	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
21706 	  rperm[i + 64] = constm1_rtx;
21707 	}
21708     }
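  /* Worked example (added for exposition, values are hypothetical): if
     d->perm[5] == 17, then ds[1].perm[2] = 17 / 2 = 8, and the second
     vpshufb mask gets rperm[5 + 64] = (5 & 14) + (17 & 1) = 5, while
     rperm[5] stays -1 so the first vpshufb produces zero there.  */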
21709 
21710   bool ok = expand_vec_perm_1 (&ds[0]);
21711   gcc_assert (ok);
21712   ds[0].target = gen_lowpart (V64QImode, ds[0].target);
21713 
21714   ok = expand_vec_perm_1 (&ds[1]);
21715   gcc_assert (ok);
21716   ds[1].target = gen_lowpart (V64QImode, ds[1].target);
21717 
21718   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
21719   vperm = force_reg (vmode, vperm);
21720   target0 = gen_reg_rtx (V64QImode);
21721   emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
21722 
21723   vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
21724   vperm = force_reg (vmode, vperm);
21725   target1 = gen_reg_rtx (V64QImode);
21726   emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
21727 
21728   emit_insn (gen_iorv64qi3 (d->target, target0, target1));
21729   return true;
21730 }
21731 
21732 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
21733    with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
21734    all the shorter instruction sequences.  */
21735 
21736 static bool
21737 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
21738 {
21739   rtx rperm[4][32], vperm, l[2], h[2], op, m128;
21740   unsigned int i, nelt, eltsz;
21741   bool used[4];
21742 
21743   if (!TARGET_AVX2
21744       || d->one_operand_p
21745       || (d->vmode != V32QImode && d->vmode != V16HImode))
21746     return false;
21747 
21748   if (d->testing_p)
21749     return true;
21750 
21751   nelt = d->nelt;
21752   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21753 
21754   /* Generate 4 permutation masks.  If the required element is within
21755      the same lane, it is shuffled in.  If the required element is from the
21756      other lane, force a zero by setting bit 7 in the permutation mask.
21757      In the other mask, the mask has non-negative elements if the element
21758      is requested from the other lane, but also moved to the other lane,
21759      so that the result of vpshufb can have the two V2TImode halves
21760      swapped.  */
21761   m128 = GEN_INT (-128);
21762   for (i = 0; i < 32; ++i)
21763     {
21764       rperm[0][i] = m128;
21765       rperm[1][i] = m128;
21766       rperm[2][i] = m128;
21767       rperm[3][i] = m128;
21768     }
21769   used[0] = false;
21770   used[1] = false;
21771   used[2] = false;
21772   used[3] = false;
21773   for (i = 0; i < nelt; ++i)
21774     {
21775       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21776       unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21777       unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
21778 
21779       for (j = 0; j < eltsz; ++j)
21780 	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
21781       used[which] = true;
21782     }
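  /* Worked example (added, hypothetical values): for V32QImode, if
     d->perm[3] == 53, the byte comes from op1 (index >= nelt) and from
     the other 128-bit lane, so which == 3, e == 53 & 15 == 5, and the
     index is stored at position (3 ^ 16) == 19; the later lane swap of
     h[1] then moves it back to position 3.  */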
21783 
21784   for (i = 0; i < 2; ++i)
21785     {
21786       if (!used[2 * i + 1])
21787 	{
21788 	  h[i] = NULL_RTX;
21789 	  continue;
21790 	}
21791       vperm = gen_rtx_CONST_VECTOR (V32QImode,
21792 				    gen_rtvec_v (32, rperm[2 * i + 1]));
21793       vperm = force_reg (V32QImode, vperm);
21794       h[i] = gen_reg_rtx (V32QImode);
21795       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
21796       emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
21797     }
21798 
21799   /* Swap the 128-bit lanes of h[X].  */
21800   for (i = 0; i < 2; ++i)
21801    {
21802      if (h[i] == NULL_RTX)
21803        continue;
21804      op = gen_reg_rtx (V4DImode);
21805      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
21806 				     const2_rtx, GEN_INT (3), const0_rtx,
21807 				     const1_rtx));
21808      h[i] = gen_lowpart (V32QImode, op);
21809    }
21810 
21811   for (i = 0; i < 2; ++i)
21812     {
21813       if (!used[2 * i])
21814 	{
21815 	  l[i] = NULL_RTX;
21816 	  continue;
21817 	}
21818       vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
21819       vperm = force_reg (V32QImode, vperm);
21820       l[i] = gen_reg_rtx (V32QImode);
21821       op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
21822       emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
21823     }
21824 
21825   for (i = 0; i < 2; ++i)
21826     {
21827       if (h[i] && l[i])
21828 	{
21829 	  op = gen_reg_rtx (V32QImode);
21830 	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
21831 	  l[i] = op;
21832 	}
21833       else if (h[i])
21834 	l[i] = h[i];
21835     }
21836 
21837   gcc_assert (l[0] && l[1]);
21838   op = d->target;
21839   if (d->vmode != V32QImode)
21840     op = gen_reg_rtx (V32QImode);
21841   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
21842   if (op != d->target)
21843     emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21844   return true;
21845 }
21846 
21847 /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
21848    taken care of, perform the expansion in D and return true on success.  */
21849 
21850 static bool
21851 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
21852 {
21853   /* Try a single instruction expansion.  */
21854   if (expand_vec_perm_1 (d))
21855     return true;
21856 
21857   /* Try sequences of two instructions.  */
21858 
21859   if (expand_vec_perm_pshuflw_pshufhw (d))
21860     return true;
21861 
21862   if (expand_vec_perm_palignr (d, false))
21863     return true;
21864 
21865   if (expand_vec_perm_interleave2 (d))
21866     return true;
21867 
21868   if (expand_vec_perm_broadcast (d))
21869     return true;
21870 
21871   if (expand_vec_perm_vpermq_perm_1 (d))
21872     return true;
21873 
21874   if (expand_vec_perm_vperm2f128 (d))
21875     return true;
21876 
21877   if (expand_vec_perm_pblendv (d))
21878     return true;
21879 
21880   if (expand_vec_perm_2perm_interleave (d, true))
21881     return true;
21882 
21883   if (expand_vec_perm_2perm_pblendv (d, true))
21884     return true;
21885 
21886   /* Try sequences of three instructions.  */
21887 
21888   if (expand_vec_perm_even_odd_pack (d))
21889     return true;
21890 
21891   if (expand_vec_perm_2vperm2f128_vshuf (d))
21892     return true;
21893 
21894   if (expand_vec_perm_pshufb2 (d))
21895     return true;
21896 
21897   if (expand_vec_perm_interleave3 (d))
21898     return true;
21899 
21900   if (expand_vec_perm_vperm2f128_vblend (d))
21901     return true;
21902 
21903   if (expand_vec_perm_2perm_interleave (d, false))
21904     return true;
21905 
21906   if (expand_vec_perm_2perm_pblendv (d, false))
21907     return true;
21908 
21909   /* Try sequences of four instructions.  */
21910 
21911   if (expand_vec_perm_even_odd_trunc (d))
21912     return true;
21913   if (expand_vec_perm_vpshufb2_vpermq (d))
21914     return true;
21915 
21916   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
21917     return true;
21918 
21919   if (expand_vec_perm_vpermt2_vpshub2 (d))
21920     return true;
21921 
21922   /* ??? Look for narrow permutations whose element orderings would
21923      allow the promotion to a wider mode.  */
21924 
21925   /* ??? Look for sequences of interleave or a wider permute that place
21926      the data into the correct lanes for a half-vector shuffle like
21927      pshuf[lh]w or vpermilps.  */
21928 
21929   /* ??? Look for sequences of interleave that produce the desired results.
21930      The combinatorics of punpck[lh] get pretty ugly... */
21931 
21932   if (expand_vec_perm_even_odd (d))
21933     return true;
21934 
21935   /* Even longer sequences.  */
21936   if (expand_vec_perm_vpshufb4_vpermq2 (d))
21937     return true;
21938 
21939   /* See if we can get the same permutation in different vector integer
21940      mode.  */
21941   struct expand_vec_perm_d nd;
21942   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
21943     {
21944       if (!d->testing_p)
21945 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
21946       return true;
21947     }
21948 
21949   /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
21950   if (expand_vec_perm2_vperm2f128_vblend (d))
21951     return true;
21952 
21953   return false;
21954 }
21955 
21956 /* If a permutation only uses one operand, make that explicit.  Returns true
21957    if the permutation references both operands.  */
21958 
21959 static bool
21960 canonicalize_perm (struct expand_vec_perm_d *d)
21961 {
21962   int i, which, nelt = d->nelt;
21963 
21964   for (i = which = 0; i < nelt; ++i)
21965     which |= (d->perm[i] < nelt ? 1 : 2);
21966 
21967   d->one_operand_p = true;
21968   switch (which)
21969     {
21970     default:
21971       gcc_unreachable();
21972 
21973     case 3:
21974       if (!rtx_equal_p (d->op0, d->op1))
21975         {
21976 	  d->one_operand_p = false;
21977 	  break;
21978         }
21979       /* The elements of PERM do not suggest that only the first operand
21980 	 is used, but both operands are identical.  Allow easier matching
21981 	 of the permutation by folding the permutation into the single
21982 	 input vector.  */
21983       /* FALLTHRU */
21984 
21985     case 2:
21986       for (i = 0; i < nelt; ++i)
21987         d->perm[i] &= nelt - 1;
21988       d->op0 = d->op1;
21989       break;
21990 
21991     case 1:
21992       d->op1 = d->op0;
21993       break;
21994     }
21995 
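  /* Illustrative example (added): with nelt == 4 and perm { 4, 5, 6, 7 }
     only the second operand is referenced (which == 2), so the indices
     are folded to { 0, 1, 2, 3 }, op0 is replaced by op1, and the
     function returns false.  */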
21996   return (which == 3);
21997 }
21998 
21999 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
22000 
22001 bool
22002 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
22003 			       rtx op1, const vec_perm_indices &sel)
22004 {
22005   struct expand_vec_perm_d d;
22006   unsigned char perm[MAX_VECT_LEN];
22007   unsigned int i, nelt, which;
22008   bool two_args;
22009 
22010   /* For an HFmode vector, convert it to HImode using a subreg.  */
22011   if (GET_MODE_INNER (vmode) == HFmode)
22012     {
22013       machine_mode orig_mode = vmode;
22014       vmode = mode_for_vector (HImode,
22015 			       GET_MODE_NUNITS (vmode)).require ();
22016       if (target)
22017 	target = lowpart_subreg (vmode, target, orig_mode);
22018       if (op0)
22019 	op0 = lowpart_subreg (vmode, op0, orig_mode);
22020       if (op1)
22021 	op1 = lowpart_subreg (vmode, op1, orig_mode);
22022     }
22023 
22024   d.target = target;
22025   d.op0 = op0;
22026   d.op1 = op1;
22027 
22028   d.vmode = vmode;
22029   gcc_assert (VECTOR_MODE_P (d.vmode));
22030   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22031   d.testing_p = !target;
22032 
22033   gcc_assert (sel.length () == nelt);
22034   gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
22035 
22036   /* Given sufficient ISA support we can just return true here
22037      for selected vector modes.  */
22038   switch (d.vmode)
22039     {
22040     case E_V16SFmode:
22041     case E_V16SImode:
22042     case E_V8DImode:
22043     case E_V8DFmode:
22044       if (!TARGET_AVX512F)
22045 	return false;
22046       /* All implementable with a single vperm[it]2 insn.  */
22047       if (d.testing_p)
22048 	return true;
22049       break;
22050     case E_V32HImode:
22051       if (!TARGET_AVX512F)
22052 	return false;
22053       if (d.testing_p && TARGET_AVX512BW)
22054 	/* All implementable with a single vperm[it]2 insn.  */
22055 	return true;
22056       break;
22057     case E_V64QImode:
22058       if (!TARGET_AVX512F)
22059 	return false;
22060       if (d.testing_p && TARGET_AVX512BW)
22061 	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
22062 	return true;
22063       break;
22064     case E_V8SImode:
22065     case E_V8SFmode:
22066     case E_V4DFmode:
22067     case E_V4DImode:
22068       if (!TARGET_AVX)
22069 	return false;
22070       if (d.testing_p && TARGET_AVX512VL)
22071 	/* All implementable with a single vperm[it]2 insn.  */
22072 	return true;
22073       break;
22074     case E_V16HImode:
22075       if (!TARGET_SSE2)
22076 	return false;
22077       if (d.testing_p && TARGET_AVX2)
22078 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
22079 	return true;
22080       break;
22081     case E_V32QImode:
22082       if (!TARGET_SSE2)
22083 	return false;
22084       if (d.testing_p && TARGET_AVX2)
22085 	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
22086 	return true;
22087       break;
22088     case E_V8HImode:
22089     case E_V16QImode:
22090       if (!TARGET_SSE2)
22091 	return false;
22092       /* Fall through.  */
22093     case E_V4SImode:
22094     case E_V4SFmode:
22095       if (!TARGET_SSE)
22096 	return false;
22097       /* All implementable with a single vpperm insn.  */
22098       if (d.testing_p && TARGET_XOP)
22099 	return true;
22100       /* All implementable with 2 pshufb + 1 ior.  */
22101       if (d.testing_p && TARGET_SSSE3)
22102 	return true;
22103       break;
22104     case E_V2SFmode:
22105     case E_V2SImode:
22106     case E_V4HImode:
22107     case E_V8QImode:
22108       if (!TARGET_MMX_WITH_SSE)
22109 	return false;
22110       break;
22111     case E_V2HImode:
22112       if (!TARGET_SSE2)
22113 	return false;
22114       /* All implementable with *punpckwd.  */
22115       if (d.testing_p)
22116 	return true;
22117       break;
22118     case E_V4QImode:
22119       if (!TARGET_SSE2)
22120 	return false;
22121       break;
22122     case E_V2DImode:
22123     case E_V2DFmode:
22124       if (!TARGET_SSE)
22125 	return false;
22126       /* All implementable with shufpd or unpck[lh]pd.  */
22127       if (d.testing_p)
22128 	return true;
22129       break;
22130     default:
22131       return false;
22132     }
22133 
22134   for (i = which = 0; i < nelt; ++i)
22135     {
22136       unsigned char e = sel[i];
22137       gcc_assert (e < 2 * nelt);
22138       d.perm[i] = e;
22139       perm[i] = e;
22140       which |= (e < nelt ? 1 : 2);
22141     }
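  /* Illustrative example (added): for V4SImode a selector of
     { 0, 4, 1, 5 } references elements of both operands, so the loop
     above ends with which == 3; { 5, 7, 5, 7 } would give which == 2.  */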
22142 
22143   if (d.testing_p)
22144     {
22145       /* If all elements are from the second vector, fold them onto the first.  */
22146       if (which == 2)
22147 	for (i = 0; i < nelt; ++i)
22148 	  d.perm[i] -= nelt;
22149 
22150       /* Check whether the mask can be applied to the vector type.  */
22151       d.one_operand_p = (which != 3);
22152 
22153       /* Implementable with shufps, pshufd or pshuflw.  */
22154       if (d.one_operand_p
22155 	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
22156 	      || d.vmode == V4SImode || d.vmode == V2SImode
22157 	      || d.vmode == V4HImode || d.vmode == V2HImode))
22158 	return true;
22159 
22160       /* Otherwise we have to go through the motions and see if we can
22161 	 figure out how to generate the requested permutation.  */
22162       d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
22163       d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
22164       if (!d.one_operand_p)
22165 	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
22166 
22167       start_sequence ();
22168       bool ret = ix86_expand_vec_perm_const_1 (&d);
22169       end_sequence ();
22170 
22171       return ret;
22172     }
22173 
22174   two_args = canonicalize_perm (&d);
22175 
22176   /* If one of the operands is a zero vector, try to match pmovzx.  */
22177   if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
22178     {
22179       struct expand_vec_perm_d dzero = d;
22180       if (d.op0 == CONST0_RTX (vmode))
22181 	{
22182 	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
22183 	  std::swap (dzero.op0, dzero.op1);
22184 	  for (i = 0; i < nelt; ++i)
22185 	    dzero.perm[i] ^= nelt;
22186 	}
22187       else
22188 	d.op0 = dzero.op0 = force_reg (vmode, d.op0);
22189 
22190       if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
22191 				  dzero.perm, nelt, dzero.testing_p))
22192 	return true;
22193     }
22194 
22195   /* Force operands into registers.  */
22196   rtx nop0 = force_reg (vmode, d.op0);
22197   if (d.op0 == d.op1)
22198     d.op1 = nop0;
22199   d.op0 = nop0;
22200   d.op1 = force_reg (vmode, d.op1);
22201 
22202   if (ix86_expand_vec_perm_const_1 (&d))
22203     return true;
22204 
22205   /* If the selector says both arguments are needed, but the operands are the
22206      same, the above tried to expand with one_operand_p and a flattened selector.
22207      If that didn't work, retry without one_operand_p; we succeeded with that
22208      during testing.  */
22209   if (two_args && d.one_operand_p)
22210     {
22211       d.one_operand_p = false;
22212       memcpy (d.perm, perm, sizeof (perm));
22213       return ix86_expand_vec_perm_const_1 (&d);
22214     }
22215 
22216   return false;
22217 }
22218 
22219 void
22220 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22221 {
22222   struct expand_vec_perm_d d;
22223   unsigned i, nelt;
22224 
22225   d.target = targ;
22226   d.op0 = op0;
22227   d.op1 = op1;
22228   d.vmode = GET_MODE (targ);
22229   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22230   d.one_operand_p = false;
22231   d.testing_p = false;
22232 
22233   for (i = 0; i < nelt; ++i)
22234     d.perm[i] = i * 2 + odd;
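  /* Added note: e.g. for a V8SImode target with odd == 1 this builds the
     selector { 1, 3, 5, 7, 9, 11, 13, 15 }.  */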
22235 
22236   /* We'll either be able to implement the permutation directly...  */
22237   if (expand_vec_perm_1 (&d))
22238     return;
22239 
22240   /* ... or we use the special-case patterns.  */
22241   expand_vec_perm_even_odd_1 (&d, odd);
22242 }
22243 
22244 static void
22245 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22246 {
22247   struct expand_vec_perm_d d;
22248   unsigned i, nelt, base;
22249   bool ok;
22250 
22251   d.target = targ;
22252   d.op0 = op0;
22253   d.op1 = op1;
22254   d.vmode = GET_MODE (targ);
22255   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22256   d.one_operand_p = false;
22257   d.testing_p = false;
22258 
22259   base = high_p ? nelt / 2 : 0;
22260   for (i = 0; i < nelt / 2; ++i)
22261     {
22262       d.perm[i * 2] = i + base;
22263       d.perm[i * 2 + 1] = i + base + nelt;
22264     }
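  /* Added note: e.g. for V4SImode this builds { 0, 4, 1, 5 } for the low
     half (high_p == false) and { 2, 6, 3, 7 } for the high half.  */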
22265 
22266   /* Note that for AVX this isn't one instruction.  */
22267   ok = ix86_expand_vec_perm_const_1 (&d);
22268   gcc_assert (ok);
22269 }
22270 
22271 /* This function is similar to ix86_expand_vecop_qihi,
22272    but optimized under AVX512BW by using vpmovwb.
22273    For example, optimize vector MUL generation like
22274 
22275    vpmovzxbw ymm2, xmm0
22276    vpmovzxbw ymm3, xmm1
22277    vpmullw   ymm4, ymm2, ymm3
22278    vpmovwb   xmm0, ymm4
22279 
22280    which takes fewer instructions than ix86_expand_vecop_qihi.
22281    Return true on success.  */
22282 
22283 static bool
22284 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22285 {
22286   machine_mode himode, qimode = GET_MODE (dest);
22287   rtx hop1, hop2, hdest;
22288   rtx (*gen_extend)(rtx, rtx);
22289   rtx (*gen_truncate)(rtx, rtx);
22290   bool uns_p = (code == ASHIFTRT) ? false : true;
22291 
22292   /* There's no V64HImode multiplication instruction.  */
22293   if (qimode == E_V64QImode)
22294     return false;
22295 
22296   /* vpmovwb is only available under AVX512BW.  */
22297   if (!TARGET_AVX512BW)
22298     return false;
22299   if ((qimode == V8QImode || qimode == V16QImode)
22300       && !TARGET_AVX512VL)
22301     return false;
22302   /* Do not generate a zmm instruction when 128/256-bit vector width is preferred.  */
22303   if (qimode == V32QImode
22304       && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
22305     return false;
22306 
22307   switch (qimode)
22308     {
22309     case E_V8QImode:
22310       himode = V8HImode;
22311       gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
22312       gen_truncate = gen_truncv8hiv8qi2;
22313       break;
22314     case E_V16QImode:
22315       himode = V16HImode;
22316       gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
22317       gen_truncate = gen_truncv16hiv16qi2;
22318       break;
22319     case E_V32QImode:
22320       himode = V32HImode;
22321       gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
22322       gen_truncate = gen_truncv32hiv32qi2;
22323       break;
22324     default:
22325       gcc_unreachable ();
22326     }
22327 
22328   hop1 = gen_reg_rtx (himode);
22329   hop2 = gen_reg_rtx (himode);
22330   hdest = gen_reg_rtx (himode);
22331   emit_insn (gen_extend (hop1, op1));
22332   emit_insn (gen_extend (hop2, op2));
22333   emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
22334 						      hop1, hop2)));
22335   emit_insn (gen_truncate (dest, hdest));
22336   return true;
22337 }
22338 
22339 /* Expand a vector shift by a constant for a V*QImode in terms of the
22340    same operation on V*HImode.  Return true on success.  */
22341 static bool
22342 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
22343 				     rtx dest, rtx op1, rtx op2)
22344 {
22345   machine_mode qimode, himode;
22346   HOST_WIDE_INT and_constant, xor_constant;
22347   HOST_WIDE_INT shift_amount;
22348   rtx vec_const_and, vec_const_xor;
22349   rtx tmp, op1_subreg;
22350   rtx (*gen_shift) (rtx, rtx, rtx);
22351   rtx (*gen_and) (rtx, rtx, rtx);
22352   rtx (*gen_xor) (rtx, rtx, rtx);
22353   rtx (*gen_sub) (rtx, rtx, rtx);
22354 
22355   /* Only optimize shift by constant.  */
22356   if (!CONST_INT_P (op2))
22357     return false;
22358 
22359   qimode = GET_MODE (dest);
22360   shift_amount = INTVAL (op2);
22361   /* Do nothing when the shift amount is greater than or equal to 8.  */
22362   if (shift_amount > 7)
22363     return false;
22364 
22365   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
22366   /* Record sign bit.  */
22367   xor_constant = 1 << (8 - shift_amount - 1);
22368 
22369   /* Mask to zero the bits shifted in from the neighboring byte element.  */
22370   and_constant
22371     = (code == ASHIFT ? 256 - (1 << shift_amount)
22372        : (1 << (8 - shift_amount)) - 1);
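  /* Worked example (added for exposition): for shift_amount == 3 this
     gives and_constant == 0xf8 for ASHIFT and 0x1f for the right shifts,
     and xor_constant == 0x10, the position of the shifted-in sign bit.  */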
22373 
22374   switch (qimode)
22375     {
22376     case V16QImode:
22377       himode = V8HImode;
22378       gen_shift =
22379 	((code == ASHIFT)
22380 	 ? gen_ashlv8hi3
22381 	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
22382       gen_and = gen_andv16qi3;
22383       gen_xor = gen_xorv16qi3;
22384       gen_sub = gen_subv16qi3;
22385       break;
22386     case V32QImode:
22387       himode = V16HImode;
22388       gen_shift =
22389 	((code == ASHIFT)
22390 	 ? gen_ashlv16hi3
22391 	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
22392       gen_and = gen_andv32qi3;
22393       gen_xor = gen_xorv32qi3;
22394       gen_sub = gen_subv32qi3;
22395       break;
22396     case V64QImode:
22397       himode = V32HImode;
22398       gen_shift =
22399 	((code == ASHIFT)
22400 	 ? gen_ashlv32hi3
22401 	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
22402       gen_and = gen_andv64qi3;
22403       gen_xor = gen_xorv64qi3;
22404       gen_sub = gen_subv64qi3;
22405       break;
22406     default:
22407       gcc_unreachable ();
22408     }
22409 
22410   tmp = gen_reg_rtx (himode);
22411   vec_const_and = gen_reg_rtx (qimode);
22412   op1_subreg = lowpart_subreg (himode, op1, qimode);
22413 
22414   /* For ASHIFT and LSHIFTRT, perform operation like
22415      vpsllw/vpsrlw $shift_amount, %op1, %dest.
22416      vpand %vec_const_and, %dest.  */
22417   emit_insn (gen_shift (tmp, op1_subreg, op2));
22418   emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
22419   emit_move_insn (vec_const_and,
22420 		  ix86_build_const_vector (qimode, true,
22421 					   gen_int_mode (and_constant, QImode)));
22422   emit_insn (gen_and (dest, dest, vec_const_and));
22423 
22424   /* For ASHIFTRT, perform extra operation like
22425      vpxor %vec_const_xor, %dest, %dest
22426      vpsubb %vec_const_xor, %dest, %dest  */
22427   if (code == ASHIFTRT)
22428     {
22429       vec_const_xor = gen_reg_rtx (qimode);
22430       emit_move_insn (vec_const_xor,
22431 		      ix86_build_const_vector (qimode, true,
22432 					       gen_int_mode (xor_constant, QImode)));
22433       emit_insn (gen_xor (dest, dest, vec_const_xor));
22434       emit_insn (gen_sub (dest, dest, vec_const_xor));
22435     }
22436   return true;
22437 }
22438 
22439 /* Expand a vector operation CODE for a V*QImode in terms of the
22440    same operation on V*HImode.  */
22441 
22442 void
22443 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22444 {
22445   machine_mode qimode = GET_MODE (dest);
22446   machine_mode himode;
22447   rtx (*gen_il) (rtx, rtx, rtx);
22448   rtx (*gen_ih) (rtx, rtx, rtx);
22449   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
22450   struct expand_vec_perm_d d;
22451   bool ok, full_interleave;
22452   bool uns_p = false;
22453   int i;
22454 
22455   if (CONST_INT_P (op2)
22456       && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
22457       && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
22458     return;
22459 
22460   if (TARGET_AVX512BW
22461       && VECTOR_MODE_P (GET_MODE (op2))
22462       && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
22463     return;
22464 
22465   switch (qimode)
22466     {
22467     case E_V16QImode:
22468       himode = V8HImode;
22469       gen_il = gen_vec_interleave_lowv16qi;
22470       gen_ih = gen_vec_interleave_highv16qi;
22471       break;
22472     case E_V32QImode:
22473       himode = V16HImode;
22474       gen_il = gen_avx2_interleave_lowv32qi;
22475       gen_ih = gen_avx2_interleave_highv32qi;
22476       break;
22477     case E_V64QImode:
22478       himode = V32HImode;
22479       gen_il = gen_avx512bw_interleave_lowv64qi;
22480       gen_ih = gen_avx512bw_interleave_highv64qi;
22481       break;
22482     default:
22483       gcc_unreachable ();
22484     }
22485 
22486   switch (code)
22487     {
22488     case MULT:
22489       /* Unpack data such that we've got a source byte in each low byte of
22490 	 each word.  We don't care what goes into the high byte of each word.
22491 	 Rather than trying to get zero in there, most convenient is to let
22492 	 it be a copy of the low byte.  */
22493       op2_l = gen_reg_rtx (qimode);
22494       op2_h = gen_reg_rtx (qimode);
22495       emit_insn (gen_il (op2_l, op2, op2));
22496       emit_insn (gen_ih (op2_h, op2, op2));
22497 
22498       op1_l = gen_reg_rtx (qimode);
22499       op1_h = gen_reg_rtx (qimode);
22500       emit_insn (gen_il (op1_l, op1, op1));
22501       emit_insn (gen_ih (op1_h, op1, op1));
22502       full_interleave = qimode == V16QImode;
22503       break;
22504 
22505     case ASHIFT:
22506     case LSHIFTRT:
22507       uns_p = true;
22508       /* FALLTHRU */
22509     case ASHIFTRT:
22510       op1_l = gen_reg_rtx (himode);
22511       op1_h = gen_reg_rtx (himode);
22512       ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
22513       ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
22514       /* vashr/vlshr/vashl  */
22515       if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22516 	{
22517 	  rtx tmp = force_reg (qimode, op2);
22518 	  op2_l = gen_reg_rtx (himode);
22519 	  op2_h = gen_reg_rtx (himode);
22520 	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
22521 	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
22522 	}
22523       else
22524 	op2_l = op2_h = op2;
22525 
22526       full_interleave = true;
22527       break;
22528     default:
22529       gcc_unreachable ();
22530     }
22531 
22532   /* Perform vashr/vlshr/vashl.  */
22533   if (code != MULT
22534       && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
22535     {
22536       res_l = gen_reg_rtx (himode);
22537       res_h = gen_reg_rtx (himode);
22538       emit_insn (gen_rtx_SET (res_l,
22539 			      simplify_gen_binary (code, himode,
22540 						   op1_l, op2_l)));
22541       emit_insn (gen_rtx_SET (res_h,
22542 			      simplify_gen_binary (code, himode,
22543 						   op1_h, op2_h)));
22544     }
22545   /* Perform mult/ashr/lshr/ashl.  */
22546   else
22547     {
22548       res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
22549 				   1, OPTAB_DIRECT);
22550       res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
22551 				   1, OPTAB_DIRECT);
22552     }
22553 
22554   gcc_assert (res_l && res_h);
22555 
22556   /* Merge the data back into the right place.  */
22557   d.target = dest;
22558   d.op0 = gen_lowpart (qimode, res_l);
22559   d.op1 = gen_lowpart (qimode, res_h);
22560   d.vmode = qimode;
22561   d.nelt = GET_MODE_NUNITS (qimode);
22562   d.one_operand_p = false;
22563   d.testing_p = false;
22564 
22565   if (full_interleave)
22566     {
22567       /* For SSE2, we used a full interleave, so the desired
22568 	 results are in the even elements.  */
22569       for (i = 0; i < d.nelt; ++i)
22570 	d.perm[i] = i * 2;
22571     }
22572   else
22573     {
22574       /* For AVX, the interleave used above was not cross-lane.  So the
22575 	 extraction is evens but with the second and third quarter swapped.
22576 	 Happily, that is even one insn shorter than even extraction.
22577 	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
22578 	 always first from the first and then from the second source operand,
22579 	 the index bits above the low 4 bits remain the same.
22580 	 Thus, for d.nelt == 32 we want permutation
22581 	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
22582 	 and for d.nelt == 64 we want permutation
22583 	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
22584 	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
22585       for (i = 0; i < d.nelt; ++i)
22586 	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
22587     }
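  /* Added check of the formula above: for d.nelt == 32, i == 9 gives
     (18 & 14) + 32 + 0 == 34, matching the second element of the
     32,34,36,... block described in the comment.  */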
22588 
22589   ok = ix86_expand_vec_perm_const_1 (&d);
22590   gcc_assert (ok);
22591 
22592   set_unique_reg_note (get_last_insn (), REG_EQUAL,
22593 		       gen_rtx_fmt_ee (code, qimode, op1, op2));
22594 }
22595 
22596 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
22597    if op is CONST_VECTOR with all odd elements equal to their
22598    preceding element.  */
22599 
22600 static bool
22601 const_vector_equal_evenodd_p (rtx op)
22602 {
22603   machine_mode mode = GET_MODE (op);
22604   int i, nunits = GET_MODE_NUNITS (mode);
22605   if (GET_CODE (op) != CONST_VECTOR
22606       || nunits != CONST_VECTOR_NUNITS (op))
22607     return false;
22608   for (i = 0; i < nunits; i += 2)
22609     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
22610       return false;
22611   return true;
22612 }
22613 
22614 void
22615 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
22616 			       bool uns_p, bool odd_p)
22617 {
22618   machine_mode mode = GET_MODE (op1);
22619   machine_mode wmode = GET_MODE (dest);
22620   rtx x;
22621   rtx orig_op1 = op1, orig_op2 = op2;
22622 
22623   if (!nonimmediate_operand (op1, mode))
22624     op1 = force_reg (mode, op1);
22625   if (!nonimmediate_operand (op2, mode))
22626     op2 = force_reg (mode, op2);
22627 
22628   /* We only play even/odd games with vectors of SImode.  */
22629   gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
22630 
22631   /* If we're looking for the odd results, shift those members down to
22632      the even slots.  For some cpus this is faster than a PSHUFD.  */
22633   if (odd_p)
22634     {
22635       /* For XOP use vpmacsdqh, but only for smult, as it is only
22636 	 signed.  */
22637       if (TARGET_XOP && mode == V4SImode && !uns_p)
22638 	{
22639 	  x = force_reg (wmode, CONST0_RTX (wmode));
22640 	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
22641 	  return;
22642 	}
22643 
22644       x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
22645       if (!const_vector_equal_evenodd_p (orig_op1))
22646 	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
22647 			    x, NULL, 1, OPTAB_DIRECT);
22648       if (!const_vector_equal_evenodd_p (orig_op2))
22649 	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
22650 			    x, NULL, 1, OPTAB_DIRECT);
22651       op1 = gen_lowpart (mode, op1);
22652       op2 = gen_lowpart (mode, op2);
22653     }
22654 
22655   if (mode == V16SImode)
22656     {
22657       if (uns_p)
22658 	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
22659       else
22660 	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
22661     }
22662   else if (mode == V8SImode)
22663     {
22664       if (uns_p)
22665 	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
22666       else
22667 	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
22668     }
22669   else if (uns_p)
22670     x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
22671   else if (TARGET_SSE4_1)
22672     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
22673   else
22674     {
22675       rtx s1, s2, t0, t1, t2;
22676 
22677       /* The easiest way to implement this without PMULDQ is to go through
22678 	 the motions as if we are performing a full 64-bit multiply.  With
22679 	 the exception that we need to do less shuffling of the elements.  */
22680 
22681       /* Compute the sign-extension, aka highparts, of the two operands.  */
22682       s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22683 				op1, pc_rtx, pc_rtx);
22684       s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
22685 				op2, pc_rtx, pc_rtx);
22686 
22687       /* Multiply LO(A) * HI(B), and vice-versa.  */
22688       t1 = gen_reg_rtx (wmode);
22689       t2 = gen_reg_rtx (wmode);
22690       emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
22691       emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
22692 
22693       /* Multiply LO(A) * LO(B).  */
22694       t0 = gen_reg_rtx (wmode);
22695       emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
22696 
22697       /* Combine and shift the highparts into place.  */
22698       t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
22699       t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
22700 			 1, OPTAB_DIRECT);
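      /* Added note: this relies on the identity
	 a * b == a_u * b_u + ((mask(a) * b_u + mask(b) * a_u) << 32)
	 (mod 2^64) for 32-bit signed a and b, where a_u and b_u are the
	 unsigned values and mask(x) is the 0/all-ones sign mask computed
	 above as s1/s2.  */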
22701 
22702       /* Combine high and low parts.  */
22703       force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
22704       return;
22705     }
22706   emit_insn (x);
22707 }
22708 
22709 void
22710 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
22711 			    bool uns_p, bool high_p)
22712 {
22713   machine_mode wmode = GET_MODE (dest);
22714   machine_mode mode = GET_MODE (op1);
22715   rtx t1, t2, t3, t4, mask;
22716 
22717   switch (mode)
22718     {
22719     case E_V4SImode:
22720       t1 = gen_reg_rtx (mode);
22721       t2 = gen_reg_rtx (mode);
22722       if (TARGET_XOP && !uns_p)
22723 	{
22724 	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
22725 	     shuffle the elements once so that all elements are in the right
22726 	     place for immediate use: { A C B D }.  */
22727 	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
22728 					const1_rtx, GEN_INT (3)));
22729 	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
22730 					const1_rtx, GEN_INT (3)));
22731 	}
22732       else
22733 	{
22734 	  /* Put the elements into place for the multiply.  */
22735 	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
22736 	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
22737 	  high_p = false;
22738 	}
22739       ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
22740       break;
22741 
22742     case E_V8SImode:
22743       /* Shuffle the elements between the lanes.  After this we
22744 	 have { A B E F | C D G H } for each operand.  */
22745       t1 = gen_reg_rtx (V4DImode);
22746       t2 = gen_reg_rtx (V4DImode);
22747       emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
22748 				      const0_rtx, const2_rtx,
22749 				      const1_rtx, GEN_INT (3)));
22750       emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
22751 				      const0_rtx, const2_rtx,
22752 				      const1_rtx, GEN_INT (3)));
22753 
22754       /* Shuffle the elements within the lanes.  After this we
22755 	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
22756       t3 = gen_reg_rtx (V8SImode);
22757       t4 = gen_reg_rtx (V8SImode);
22758       mask = GEN_INT (high_p
22759 		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
22760 		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
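      /* Added note: these immediates are 0xfa ({ 2, 2, 3, 3 } per lane)
	 for the high half and 0x50 ({ 0, 0, 1, 1 }) for the low half.  */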
22761       emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
22762       emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
22763 
22764       ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
22765       break;
22766 
22767     case E_V8HImode:
22768     case E_V16HImode:
22769       t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
22770 			 uns_p, OPTAB_DIRECT);
22771       t2 = expand_binop (mode,
22772 			 uns_p ? umul_highpart_optab : smul_highpart_optab,
22773 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
22774       gcc_assert (t1 && t2);
22775 
22776       t3 = gen_reg_rtx (mode);
22777       ix86_expand_vec_interleave (t3, t1, t2, high_p);
22778       emit_move_insn (dest, gen_lowpart (wmode, t3));
22779       break;
22780 
22781     case E_V16QImode:
22782     case E_V32QImode:
22783     case E_V32HImode:
22784     case E_V16SImode:
22785     case E_V64QImode:
22786       t1 = gen_reg_rtx (wmode);
22787       t2 = gen_reg_rtx (wmode);
22788       ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
22789       ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
22790 
22791       emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
22792       break;
22793 
22794     default:
22795       gcc_unreachable ();
22796     }
22797 }
22798 
22799 void
22800 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
22801 {
22802   rtx res_1, res_2, res_3, res_4;
22803 
22804   res_1 = gen_reg_rtx (V4SImode);
22805   res_2 = gen_reg_rtx (V4SImode);
22806   res_3 = gen_reg_rtx (V2DImode);
22807   res_4 = gen_reg_rtx (V2DImode);
22808   ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
22809   ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
22810 
22811   /* Move the results in element 2 down to element 1; we don't care
22812      what goes in elements 2 and 3.  Then we can merge the parts
22813      back together with an interleave.
22814 
22815      Note that two other sequences were tried:
22816      (1) Use interleaves at the start instead of psrldq, which allows
22817      us to use a single shufps to merge things back at the end.
22818      (2) Use shufps here to combine the two vectors, then pshufd to
22819      put the elements in the correct order.
22820      In both cases the cost of the reformatting stall was too high
22821      and the overall sequence slower.  */
22822 
22823   emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
22824 				const0_rtx, const2_rtx,
22825 				const0_rtx, const0_rtx));
22826   emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
22827 				const0_rtx, const2_rtx,
22828 				const0_rtx, const0_rtx));
22829   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
22830 
22831   set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
22832 }
22833 
22834 void
22835 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
22836 {
22837   machine_mode mode = GET_MODE (op0);
22838   rtx t1, t2, t3, t4, t5, t6;
22839 
22840   if (TARGET_AVX512DQ && mode == V8DImode)
22841     emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
22842   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
22843     emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
22844   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
22845     emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
22846   else if (TARGET_XOP && mode == V2DImode)
22847     {
22848       /* op1: A,B,C,D, op2: E,F,G,H */
22849       op1 = gen_lowpart (V4SImode, op1);
22850       op2 = gen_lowpart (V4SImode, op2);
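      /* Added summary: viewing each V2DI element as lo + hi * 2^32, the
	 steps below compute lo*lo' + ((lo*hi' + hi*lo') << 32), i.e. the
	 64-bit product modulo 2^64.  */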
22851 
22852       t1 = gen_reg_rtx (V4SImode);
22853       t2 = gen_reg_rtx (V4SImode);
22854       t3 = gen_reg_rtx (V2DImode);
22855       t4 = gen_reg_rtx (V2DImode);
22856 
22857       /* t1: B,A,D,C */
22858       emit_insn (gen_sse2_pshufd_1 (t1, op1,
22859 				    GEN_INT (1),
22860 				    GEN_INT (0),
22861 				    GEN_INT (3),
22862 				    GEN_INT (2)));
22863 
22864       /* t2: (B*E),(A*F),(D*G),(C*H) */
22865       emit_insn (gen_mulv4si3 (t2, t1, op2));
22866 
22867       /* t3: (B*E)+(A*F), (D*G)+(C*H) */
22868       emit_insn (gen_xop_phadddq (t3, t2));
22869 
22870       /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
22871       emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
22872 
22873       /* Multiply lower parts and add all */
22874       t5 = gen_reg_rtx (V2DImode);
22875       emit_insn (gen_vec_widen_umult_even_v4si (t5,
22876 					gen_lowpart (V4SImode, op1),
22877 					gen_lowpart (V4SImode, op2)));
22878       force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
22879     }
22880   else
22881     {
22882       machine_mode nmode;
22883       rtx (*umul) (rtx, rtx, rtx);
22884 
22885       if (mode == V2DImode)
22886 	{
22887 	  umul = gen_vec_widen_umult_even_v4si;
22888 	  nmode = V4SImode;
22889 	}
22890       else if (mode == V4DImode)
22891 	{
22892 	  umul = gen_vec_widen_umult_even_v8si;
22893 	  nmode = V8SImode;
22894 	}
22895       else if (mode == V8DImode)
22896 	{
22897 	  umul = gen_vec_widen_umult_even_v16si;
22898 	  nmode = V16SImode;
22899 	}
22900       else
22901 	gcc_unreachable ();
22902 
22903 
22904       /* Multiply low parts.  */
22905       t1 = gen_reg_rtx (mode);
22906       emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
22907 
22908       /* Shift input vectors right 32 bits so we can multiply high parts.  */
22909       t6 = GEN_INT (32);
22910       t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
22911       t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
22912 
22913       /* Multiply high parts by low parts.  */
22914       t4 = gen_reg_rtx (mode);
22915       t5 = gen_reg_rtx (mode);
22916       emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
22917       emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
22918 
22919       /* Combine and shift the highparts back.  */
22920       t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
22921       t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
22922 
22923       /* Combine high and low parts.  */
22924       force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
22925     }
22926 
22927   set_unique_reg_note (get_last_insn (), REG_EQUAL,
22928 		       gen_rtx_MULT (mode, op1, op2));
22929 }
22930 
22931 /* Return true if control transfer instruction INSN
22932    should be encoded with the notrack prefix.  */
22933 
22934 bool
22935 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
22936 {
22937   if (!insn || !((flag_cf_protection & CF_BRANCH)))
22938     return false;
22939 
22940   if (CALL_P (insn))
22941     {
22942       rtx call = get_call_rtx_from (insn);
22943       gcc_assert (call != NULL_RTX);
22944       rtx addr = XEXP (call, 0);
22945 
22946       /* Do not emit 'notrack' if it's not an indirect call.  */
22947       if (MEM_P (addr)
22948 	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
22949 	return false;
22950       else
22951 	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
22952     }
22953 
22954   if (JUMP_P (insn) && !flag_cet_switch)
22955     {
22956       rtx target = JUMP_LABEL (insn);
22957       if (target == NULL_RTX || ANY_RETURN_P (target))
22958 	return false;
22959 
22960       /* Check whether the jump is a switch table jump.  */
22961       rtx_insn *label = as_a<rtx_insn *> (target);
22962       rtx_insn *table = next_insn (label);
22963       if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
22964 	return false;
22965       else
22966 	return true;
22967     }
22968   return false;
22969 }
22970 
22971 /* Calculate integer abs() using only SSE2 instructions.  */
22972 
22973 void
22974 ix86_expand_sse2_abs (rtx target, rtx input)
22975 {
22976   machine_mode mode = GET_MODE (target);
22977   rtx tmp0, tmp1, x;
22978 
22979   switch (mode)
22980     {
22981     case E_V2DImode:
22982     case E_V4DImode:
22983       /* For 64-bit signed integer X, with SSE4.2 use
22984 	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
22985 	 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
22986 	 32 and use logical instead of arithmetic right shift (which is
22987 	 unimplemented) and subtract.  */
22988       if (TARGET_SSE4_2)
22989 	{
22990 	  tmp0 = gen_reg_rtx (mode);
22991 	  tmp1 = gen_reg_rtx (mode);
22992 	  emit_move_insn (tmp1, CONST0_RTX (mode));
22993 	  if (mode == E_V2DImode)
22994 	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
22995 	  else
22996 	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
22997 	}
22998       else
22999 	{
23000 	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
23001 				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
23002 					       - 1), NULL, 0, OPTAB_DIRECT);
23003 	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
23004 	}
23005 
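      /* Added note: either way tmp0 is now 0 for non-negative elements and
	 all-ones for negative ones, so (X ^ tmp0) - tmp0 below yields the
	 absolute value, as in the V4SImode case.  */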
23006       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23007 				  NULL, 0, OPTAB_DIRECT);
23008       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23009 			       target, 0, OPTAB_DIRECT);
23010       break;
23011 
23012     case E_V4SImode:
23013       /* For 32-bit signed integer X, the best way to calculate the absolute
23014 	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
23015       tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
23016 				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
23017 				  NULL, 0, OPTAB_DIRECT);
23018       tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23019 				  NULL, 0, OPTAB_DIRECT);
23020       x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23021 			       target, 0, OPTAB_DIRECT);
23022       break;
23023 
23024     case E_V8HImode:
23025       /* For 16-bit signed integer X, the best way to calculate the absolute
23026 	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
23027       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23028 
23029       x = expand_simple_binop (mode, SMAX, tmp0, input,
23030 			       target, 0, OPTAB_DIRECT);
23031       break;
23032 
23033     case E_V16QImode:
23034       /* For 8-bit signed integer X, the best way to calculate the absolute
23035 	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
23036 	 as SSE2 provides the PMINUB insn.  */
23037       tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23038 
23039       x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
23040 			       target, 0, OPTAB_DIRECT);
23041       break;
23042 
23043     default:
23044       gcc_unreachable ();
23045     }
23046 
23047   if (x != target)
23048     emit_move_insn (target, x);
23049 }
23050 
23051 /* Expand an extract from a vector register through pextr insn.
23052    Return true if successful.  */
23053 
23054 bool
23055 ix86_expand_pextr (rtx *operands)
23056 {
23057   rtx dst = operands[0];
23058   rtx src = operands[1];
23059 
23060   unsigned int size = INTVAL (operands[2]);
23061   unsigned int pos = INTVAL (operands[3]);
23062 
23063   if (SUBREG_P (dst))
23064     {
23065       /* Reject non-lowpart subregs.  */
23066       if (SUBREG_BYTE (dst) > 0)
23067 	return false;
23068       dst = SUBREG_REG (dst);
23069     }
23070 
23071   if (SUBREG_P (src))
23072     {
23073       pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
23074       src = SUBREG_REG (src);
23075     }
23076 
23077   switch (GET_MODE (src))
23078     {
23079     case E_V16QImode:
23080     case E_V8HImode:
23081     case E_V4SImode:
23082     case E_V2DImode:
23083     case E_V1TImode:
23084       {
23085 	machine_mode srcmode, dstmode;
23086 	rtx d, pat;
23087 
23088 	if (!int_mode_for_size (size, 0).exists (&dstmode))
23089 	  return false;
23090 
23091 	switch (dstmode)
23092 	  {
23093 	  case E_QImode:
23094 	    if (!TARGET_SSE4_1)
23095 	      return false;
23096 	    srcmode = V16QImode;
23097 	    break;
23098 
23099 	  case E_HImode:
23100 	    if (!TARGET_SSE2)
23101 	      return false;
23102 	    srcmode = V8HImode;
23103 	    break;
23104 
23105 	  case E_SImode:
23106 	    if (!TARGET_SSE4_1)
23107 	      return false;
23108 	    srcmode = V4SImode;
23109 	    break;
23110 
23111 	  case E_DImode:
23112 	    gcc_assert (TARGET_64BIT);
23113 	    if (!TARGET_SSE4_1)
23114 	      return false;
23115 	    srcmode = V2DImode;
23116 	    break;
23117 
23118 	  default:
23119 	    return false;
23120 	  }
23121 
23122 	/* Reject extractions from misaligned positions.  */
23123 	if (pos & (size-1))
23124 	  return false;
23125 
23126 	if (GET_MODE (dst) == dstmode)
23127 	  d = dst;
23128 	else
23129 	  d = gen_reg_rtx (dstmode);
23130 
23131 	/* Construct insn pattern.  */
23132 	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
23133 	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
23134 
23135 	/* Let the rtl optimizers know about the zero extension performed.  */
23136 	if (dstmode == QImode || dstmode == HImode)
23137 	  {
23138 	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
23139 	    d = gen_lowpart (SImode, d);
23140 	  }
23141 
23142 	emit_insn (gen_rtx_SET (d, pat));
23143 
23144 	if (d != dst)
23145 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23146 	return true;
23147       }
23148 
23149     default:
23150       return false;
23151     }
23152 }
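
/* Reference semantics of the pattern constructed above, as an illustrative
   sketch for the HImode case (size == 16, positions in bits, helper name
   invented for illustration):

     uint32_t pextrw_ref (const uint16_t v[8], unsigned pos)
     {
       return (uint32_t) v[pos / 16];   -- VEC_SELECT of element pos / 16,
                                           zero-extended to SImode
     }

   so e.g. pos == 48 selects element 3, matching PEXTRW with immediate 3;
   the other element widths behave analogously, subject to the SSE2/SSE4.1
   checks above.  */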
23153 
23154 /* Expand an insert into a vector register through pinsr insn.
23155    Return true if successful.  */
23156 
23157 bool
23158 ix86_expand_pinsr (rtx *operands)
23159 {
23160   rtx dst = operands[0];
23161   rtx src = operands[3];
23162 
23163   unsigned int size = INTVAL (operands[1]);
23164   unsigned int pos = INTVAL (operands[2]);
23165 
23166   if (SUBREG_P (dst))
23167     {
23168       pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
23169       dst = SUBREG_REG (dst);
23170     }
23171 
23172   switch (GET_MODE (dst))
23173     {
23174     case E_V16QImode:
23175     case E_V8HImode:
23176     case E_V4SImode:
23177     case E_V2DImode:
23178     case E_V1TImode:
23179       {
23180 	machine_mode srcmode, dstmode;
23181 	rtx (*pinsr)(rtx, rtx, rtx, rtx);
23182 	rtx d;
23183 
23184 	if (!int_mode_for_size (size, 0).exists (&srcmode))
23185 	  return false;
23186 
23187 	switch (srcmode)
23188 	  {
23189 	  case E_QImode:
23190 	    if (!TARGET_SSE4_1)
23191 	      return false;
23192 	    dstmode = V16QImode;
23193 	    pinsr = gen_sse4_1_pinsrb;
23194 	    break;
23195 
23196 	  case E_HImode:
23197 	    if (!TARGET_SSE2)
23198 	      return false;
23199 	    dstmode = V8HImode;
23200 	    pinsr = gen_sse2_pinsrw;
23201 	    break;
23202 
23203 	  case E_SImode:
23204 	    if (!TARGET_SSE4_1)
23205 	      return false;
23206 	    dstmode = V4SImode;
23207 	    pinsr = gen_sse4_1_pinsrd;
23208 	    break;
23209 
23210 	  case E_DImode:
23211 	    gcc_assert (TARGET_64BIT);
23212 	    if (!TARGET_SSE4_1)
23213 	      return false;
23214 	    dstmode = V2DImode;
23215 	    pinsr = gen_sse4_1_pinsrq;
23216 	    break;
23217 
23218 	  default:
23219 	    return false;
23220 	  }
23221 
23222 	/* Reject insertions to misaligned positions.  */
23223 	if (pos & (size-1))
23224 	  return false;
23225 
23226 	if (SUBREG_P (src))
23227 	  {
23228 	    unsigned int srcpos = SUBREG_BYTE (src);
23229 
23230 	    if (srcpos > 0)
23231 	      {
23232 		rtx extr_ops[4];
23233 
23234 		extr_ops[0] = gen_reg_rtx (srcmode);
23235 		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
23236 		extr_ops[2] = GEN_INT (size);
23237 		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
23238 
23239 		if (!ix86_expand_pextr (extr_ops))
23240 		  return false;
23241 
23242 		src = extr_ops[0];
23243 	      }
23244 	    else
23245 	      src = gen_lowpart (srcmode, SUBREG_REG (src));
23246 	  }
23247 
23248 	if (GET_MODE (dst) == dstmode)
23249 	  d = dst;
23250 	else
23251 	  d = gen_reg_rtx (dstmode);
23252 
23253 	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
23254 			  gen_lowpart (srcmode, src),
23255 			  GEN_INT (1 << (pos / size))));
23256 	if (d != dst)
23257 	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23258 	return true;
23259       }
23260 
23261     default:
23262       return false;
23263     }
23264 }
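
/* Reference semantics of the insertion, as an illustrative sketch for the
   HImode case (size == 16, positions in bits, helper name invented for
   illustration):

     void pinsrw_ref (uint16_t v[8], uint16_t src, unsigned pos)
     {
       v[pos / 16] = src;   -- one element replaced, the rest preserved
     }

   The immediate passed to the pinsr generator above is 1 << (pos / size),
   a one-hot mask naming the element to replace.  */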
23265 
23266 /* All CPUs prefer to avoid cross-lane operations, so reduce upper
23267    against lower halves until we reach SSE register size.  */
23268 
23269 machine_mode
23270 ix86_split_reduction (machine_mode mode)
23271 {
23272   /* Reduce lowpart against highpart until we reach SSE reg width to
23273      avoid cross-lane operations.  */
23274   switch (mode)
23275     {
23276     case E_V8DImode:
23277     case E_V4DImode:
23278       return V2DImode;
23279     case E_V16SImode:
23280     case E_V8SImode:
23281       return V4SImode;
23282     case E_V32HImode:
23283     case E_V16HImode:
23284       return V8HImode;
23285     case E_V64QImode:
23286     case E_V32QImode:
23287       return V16QImode;
23288     case E_V16SFmode:
23289     case E_V8SFmode:
23290       return V4SFmode;
23291     case E_V8DFmode:
23292     case E_V4DFmode:
23293       return V2DFmode;
23294     default:
23295       return mode;
23296     }
23297 }
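
/* Illustrative sketch of how the returned mode is used by a split
   reduction; reference semantics only, assuming a plain add reduction
   of a V8SImode input (helper name invented for illustration):

     uint32_t sum8 (const uint32_t v[8])
     {
       uint32_t t[4];
       for (int i = 0; i < 4; i++)   -- lowpart OP highpart in V4SImode
         t[i] = v[i] + v[i + 4];
       return (t[0] + t[1]) + (t[2] + t[3]);
     }

   Each step stays within a single 128-bit SSE register, so no cross-lane
   shuffles are needed.  */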
23298 
23299 /* Generate call to __divmoddi4.  */
23300 
23301 void
23302 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
23303 			    rtx op0, rtx op1,
23304 			    rtx *quot_p, rtx *rem_p)
23305 {
23306   rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
23307 
23308   rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
23309 				      mode, op0, mode, op1, mode,
23310 				      XEXP (rem, 0), Pmode);
23311   *quot_p = quot;
23312   *rem_p = rem;
23313 }
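
/* The libfunc called above follows the libgcc __divmoddi4 convention;
   for the DImode case the prototype is (illustrative):

     long long __divmoddi4 (long long a, long long b, long long *rem);

   The quotient comes back as the call value, while the remainder is
   stored through the extra pointer argument, which is why the address
   of a stack temporary is passed and the MEM itself returned in *rem_p.  */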
23314 
23315 void
23316 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
23317 				  enum rtx_code code, bool after,
23318 				  bool doubleword)
23319 {
23320   rtx old_reg, new_reg, old_mem, success;
23321   machine_mode mode = GET_MODE (target);
23322   rtx_code_label *loop_label = NULL;
23323 
23324   old_reg = gen_reg_rtx (mode);
23325   new_reg = old_reg;
23326   old_mem = copy_to_reg (mem);
23327   loop_label = gen_label_rtx ();
23328   emit_label (loop_label);
23329   emit_move_insn (old_reg, old_mem);
23330 
23331   /* Return value for atomic_fetch_op (the value before the operation).  */
23332   if (!after)
23333     emit_move_insn (target, old_reg);
23334 
23335   if (code == NOT)
23336     {
23337       new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
23338 				     true, OPTAB_LIB_WIDEN);
23339       new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
23340     }
23341   else
23342     new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
23343 				   true, OPTAB_LIB_WIDEN);
23344 
23345   /* Return value for atomic_op_fetch (the value after the operation).  */
23346   if (after)
23347     emit_move_insn (target, new_reg);
23348 
23349   success = NULL_RTX;
23350 
23351   ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
23352 			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
23353 					  SImode),
23354 			    doubleword, loop_label);
23355 }
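
/* Shape of the loop emitted above, as an illustrative C sketch that
   ignores the doubleword and NOT special cases:

     old_mem = *mem;                         -- initial load
     do
       {
         old = old_mem;
         if (!after) result = old;           -- atomic_fetch_OP value
         tmp = old OP val;
         if (after) result = tmp;            -- atomic_OP_fetch value
       }
     while (!CAS (mem, &old_mem, tmp));      -- emitted by
                                                ix86_expand_cmpxchg_loop

   On a failed compare-exchange, ix86_expand_cmpxchg_loop reloads old_mem
   from memory and branches back to loop_label.  */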
23356 
23357 /* Relax the cmpxchg instruction.  The parameter loop_label indicates
23358    whether the instruction should be relaxed with a pause loop.  If not,
23359    it is relaxed to an atomic load + compare, and the cmpxchg instruction
23360    is skipped when mem != exp_input.  */
23361 
23362 void
23363 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
23364 			  rtx mem, rtx exp_input, rtx new_input,
23365 			  rtx mem_model, bool doubleword,
23366 			  rtx_code_label *loop_label)
23367 {
23368   rtx_code_label *cmp_label = NULL;
23369   rtx_code_label *done_label = NULL;
23370   rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
23371   rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
23372   rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
23373   machine_mode mode = GET_MODE (target_val), hmode = mode;
23374 
23375   if (*ptarget_bool == NULL)
23376     target_bool = gen_reg_rtx (QImode);
23377   else
23378     target_bool = *ptarget_bool;
23379 
23380   cmp_label = gen_label_rtx ();
23381   done_label = gen_label_rtx ();
23382 
23383   new_mem = gen_reg_rtx (mode);
23384   /* Load memory first.  */
23385   expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
23386 
23387   switch (mode)
23388     {
23389     case E_TImode:
23390       gendw = gen_atomic_compare_and_swapti_doubleword;
23391       hmode = DImode;
23392       break;
23393     case E_DImode:
23394       if (doubleword)
23395 	{
23396 	  gendw = gen_atomic_compare_and_swapdi_doubleword;
23397 	  hmode = SImode;
23398 	}
23399       else
23400 	gen = gen_atomic_compare_and_swapdi_1;
23401       break;
23402     case E_SImode:
23403       gen = gen_atomic_compare_and_swapsi_1;
23404       break;
23405     case E_HImode:
23406       gen = gen_atomic_compare_and_swaphi_1;
23407       break;
23408     case E_QImode:
23409       gen = gen_atomic_compare_and_swapqi_1;
23410       break;
23411     default:
23412       gcc_unreachable ();
23413     }
23414 
23415   /* Compare mem value with expected value.  */
23416   if (doubleword)
23417     {
23418       rtx low_new_mem = gen_lowpart (hmode, new_mem);
23419       rtx low_exp_input = gen_lowpart (hmode, exp_input);
23420       rtx high_new_mem = gen_highpart (hmode, new_mem);
23421       rtx high_exp_input = gen_highpart (hmode, exp_input);
23422       emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
23423 			       hmode, 1, cmp_label,
23424 			       profile_probability::guessed_never ());
23425       emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
23426 			       hmode, 1, cmp_label,
23427 			       profile_probability::guessed_never ());
23428     }
23429   else
23430     emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
23431 			     GET_MODE (exp_input), 1, cmp_label,
23432 			     profile_probability::guessed_never ());
23433 
23434   /* Directly emit the cmpxchg here.  */
23435   if (doubleword)
23436     emit_insn (gendw (target_val, mem, exp_input,
23437 		      gen_lowpart (hmode, new_input),
23438 		      gen_highpart (hmode, new_input),
23439 		      mem_model));
23440   else
23441     emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
23442 
23443   if (!loop_label)
23444   {
23445     emit_jump_insn (gen_jump (done_label));
23446     emit_barrier ();
23447     emit_label (cmp_label);
23448     emit_move_insn (target_val, new_mem);
23449     emit_label (done_label);
23450     ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23451 		       const0_rtx);
23452   }
23453   else
23454   {
23455     ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23456 		       const0_rtx);
23457     emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
23458 			     GET_MODE (target_bool), 1, loop_label,
23459 			     profile_probability::guessed_never ());
23460     emit_jump_insn (gen_jump (done_label));
23461     emit_barrier ();
23462 
23463     /* If mem does not hold the expected value, pause and loop back.  */
23464     emit_label (cmp_label);
23465     emit_move_insn (target_val, new_mem);
23466     emit_insn (gen_pause ());
23467     emit_jump_insn (gen_jump (loop_label));
23468     emit_barrier ();
23469     emit_label (done_label);
23470   }
23471 
23472   *ptarget_bool = target_bool;
23473 }
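
/* Illustrative control-flow sketch of the expansion above (single-word
   case; loop_label is the caller-supplied retry point):

     new_mem = atomic_load (mem);
     if (new_mem != exp_input)                 -- skip the cmpxchg
       {
         target_val = new_mem;
         if (loop_label) { pause; goto loop_label; }
         bool = false; goto done;
       }
     (target_val, bool) = cmpxchg (mem, exp_input, new_input);
     if (loop_label && !bool) goto loop_label;
   done:

   Without a loop_label the cmpxchg is simply skipped on a mismatch;
   with one, the code pauses and retries from loop_label until the
   compare-exchange succeeds.  */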
23474 
23475 #include "gt-i386-expand.h"
23476