1 /* Copyright (C) 1988-2020 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
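/* For example (an illustrative sketch): a DImode MEM at address A is
   split into lo = (mem:SI A) and hi = (mem:SI A+4), and the constant
   (const_int 0x100000002) into lo = (const_int 2) and
   hi = (const_int 1) via simplify_gen_subreg.  */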
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157 hi_half[num] = simplify_gen_subreg (half_mode, op,
158 GET_MODE (op) == VOIDmode
159 ? mode : GET_MODE (op), byte);
160 }
161 }
162 }
163
164 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
165 for the target. */
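/* For instance (illustrative): clearing %eax usually becomes
   "xorl %eax, %eax" (the PARALLEL below, which also clobbers the
   flags), while "movl $0, %eax" is kept when TARGET_USE_MOV0 is set
   and we are not optimizing for size.  */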
166
167 void
168 ix86_expand_clear (rtx dest)
169 {
170 rtx tmp;
171
172 /* We play register width games, which are only valid after reload. */
173 gcc_assert (reload_completed);
174
175 /* Avoid HImode and its attendant prefix byte. */
176 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
177 dest = gen_rtx_REG (SImode, REGNO (dest));
178 tmp = gen_rtx_SET (dest, const0_rtx);
179
180 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
181 {
182 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
184 }
185
186 emit_insn (tmp);
187 }
188
189 void
190 ix86_expand_move (machine_mode mode, rtx operands[])
191 {
192 rtx op0, op1;
193 rtx tmp, addend = NULL_RTX;
194 enum tls_model model;
195
196 op0 = operands[0];
197 op1 = operands[1];
198
199 switch (GET_CODE (op1))
200 {
201 case CONST:
202 tmp = XEXP (op1, 0);
203
204 if (GET_CODE (tmp) != PLUS
205 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
206 break;
207
208 op1 = XEXP (tmp, 0);
209 addend = XEXP (tmp, 1);
210 /* FALLTHRU */
211
212 case SYMBOL_REF:
213 model = SYMBOL_REF_TLS_MODEL (op1);
214
215 if (model)
216 op1 = legitimize_tls_address (op1, model, true);
217 else if (ix86_force_load_from_GOT_p (op1))
218 {
219 /* Load the external function address via GOT slot to avoid PLT. */
220 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
221 (TARGET_64BIT
222 ? UNSPEC_GOTPCREL
223 : UNSPEC_GOT));
224 op1 = gen_rtx_CONST (Pmode, op1);
225 op1 = gen_const_mem (Pmode, op1);
226 set_mem_alias_set (op1, ix86_GOT_alias_set ());
227 }
228 else
229 {
230 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
231 if (tmp)
232 {
233 op1 = tmp;
234 if (!addend)
235 break;
236 }
237 else
238 {
239 op1 = operands[1];
240 break;
241 }
242 }
243
244 if (addend)
245 {
246 op1 = force_operand (op1, NULL_RTX);
247 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
248 op0, 1, OPTAB_DIRECT);
249 }
250 else
251 op1 = force_operand (op1, op0);
252
253 if (op1 == op0)
254 return;
255
256 op1 = convert_to_mode (mode, op1, 1);
257
258 default:
259 break;
260 }
261
262 if ((flag_pic || MACHOPIC_INDIRECT)
263 && symbolic_operand (op1, mode))
264 {
265 if (TARGET_MACHO && !TARGET_64BIT)
266 {
267 #if TARGET_MACHO
268 /* dynamic-no-pic */
269 if (MACHOPIC_INDIRECT)
270 {
271 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
272 ? op0 : gen_reg_rtx (Pmode);
273 op1 = machopic_indirect_data_reference (op1, temp);
274 if (MACHOPIC_PURE)
275 op1 = machopic_legitimize_pic_address (op1, mode,
276 temp == op1 ? 0 : temp);
277 }
278 if (op0 != op1 && GET_CODE (op0) != MEM)
279 {
280 rtx insn = gen_rtx_SET (op0, op1);
281 emit_insn (insn);
282 return;
283 }
284 if (GET_CODE (op0) == MEM)
285 op1 = force_reg (Pmode, op1);
286 else
287 {
288 rtx temp = op0;
289 if (GET_CODE (temp) != REG)
290 temp = gen_reg_rtx (Pmode);
291 temp = legitimize_pic_address (op1, temp);
292 if (temp == op0)
293 return;
294 op1 = temp;
295 }
296 /* dynamic-no-pic */
297 #endif
298 }
299 else
300 {
301 if (MEM_P (op0))
302 op1 = force_reg (mode, op1);
303 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
304 {
305 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
306 op1 = legitimize_pic_address (op1, reg);
307 if (op0 == op1)
308 return;
309 op1 = convert_to_mode (mode, op1, 1);
310 }
311 }
312 }
313 else
314 {
315 if (MEM_P (op0)
316 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
317 || !push_operand (op0, mode))
318 && MEM_P (op1))
319 op1 = force_reg (mode, op1);
320
321 if (push_operand (op0, mode)
322 && ! general_no_elim_operand (op1, mode))
323 op1 = copy_to_mode_reg (mode, op1);
324
325 /* Force large constants in 64bit compilation into a register
326 to get them CSEd. */
327 if (can_create_pseudo_p ()
328 && (mode == DImode) && TARGET_64BIT
329 && immediate_operand (op1, mode)
330 && !x86_64_zext_immediate_operand (op1, VOIDmode)
331 && !register_operand (op0, mode)
332 && optimize)
333 op1 = copy_to_mode_reg (mode, op1);
334
335 if (can_create_pseudo_p ()
336 && CONST_DOUBLE_P (op1))
337 {
338 /* If we are loading a floating point constant to a register,
339 force the value to memory now, since we'll get better code
340 out of the back end. */
341
342 op1 = validize_mem (force_const_mem (mode, op1));
343 if (!register_operand (op0, mode))
344 {
345 rtx temp = gen_reg_rtx (mode);
346 emit_insn (gen_rtx_SET (temp, op1));
347 emit_move_insn (op0, temp);
348 return;
349 }
350 }
351 }
352
353 emit_insn (gen_rtx_SET (op0, op1));
354 }
355
356 void
357 ix86_expand_vector_move (machine_mode mode, rtx operands[])
358 {
359 rtx op0 = operands[0], op1 = operands[1];
360 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
361 psABI, since the biggest alignment there is only 4 bytes. */
362 unsigned int align = (TARGET_IAMCU
363 ? GET_MODE_BITSIZE (mode)
364 : GET_MODE_ALIGNMENT (mode));
365
366 if (push_operand (op0, VOIDmode))
367 op0 = emit_move_resolve_push (mode, op0);
368
369 /* Force constants other than zero into memory. We do not know how
370 the instructions used to build constants modify the upper 64 bits
371 of the register; once we have that information we may be able
372 to handle some of them more efficiently. */
373 if (can_create_pseudo_p ()
374 && (CONSTANT_P (op1)
375 || (SUBREG_P (op1)
376 && CONSTANT_P (SUBREG_REG (op1))))
377 && ((register_operand (op0, mode)
378 && !standard_sse_constant_p (op1, mode))
379 /* ix86_expand_vector_move_misalign() does not like constants. */
380 || (SSE_REG_MODE_P (mode)
381 && MEM_P (op0)
382 && MEM_ALIGN (op0) < align)))
383 {
384 if (SUBREG_P (op1))
385 {
386 machine_mode imode = GET_MODE (SUBREG_REG (op1));
387 rtx r = force_const_mem (imode, SUBREG_REG (op1));
388 if (r)
389 r = validize_mem (r);
390 else
391 r = force_reg (imode, SUBREG_REG (op1));
392 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
393 }
394 else
395 op1 = validize_mem (force_const_mem (mode, op1));
396 }
397
398 /* We need to check memory alignment for SSE modes since attributes
399 can make operands unaligned. */
400 if (can_create_pseudo_p ()
401 && SSE_REG_MODE_P (mode)
402 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
403 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
404 {
405 rtx tmp[2];
406
407 /* ix86_expand_vector_move_misalign() does not like both
408 arguments in memory. */
409 if (!register_operand (op0, mode)
410 && !register_operand (op1, mode))
411 op1 = force_reg (mode, op1);
412
413 tmp[0] = op0; tmp[1] = op1;
414 ix86_expand_vector_move_misalign (mode, tmp);
415 return;
416 }
417
418 /* Make operand1 a register if it isn't already. */
419 if (can_create_pseudo_p ()
420 && !register_operand (op0, mode)
421 && !register_operand (op1, mode))
422 {
423 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
424 return;
425 }
426
427 emit_insn (gen_rtx_SET (op0, op1));
428 }
429
430 /* Split 32-byte AVX unaligned load and store if needed. */
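/* Sketch of the intent (illustrative): when the corresponding
   TARGET_AVX256_SPLIT_UNALIGNED_{LOAD,STORE} tuning flag is set, a
   misaligned 256-bit load becomes two 128-bit loads glued together
   with VEC_CONCAT (vinsertf128), and a misaligned 256-bit store
   becomes two vextractf128 stores; otherwise a single unaligned move
   is emitted.  */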
431
432 static void
433 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
434 {
435 rtx m;
436 rtx (*extract) (rtx, rtx, rtx);
437 machine_mode mode;
438
439 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
440 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
441 {
442 emit_insn (gen_rtx_SET (op0, op1));
443 return;
444 }
445
446 rtx orig_op0 = NULL_RTX;
447 mode = GET_MODE (op0);
448 switch (GET_MODE_CLASS (mode))
449 {
450 case MODE_VECTOR_INT:
451 case MODE_INT:
452 if (mode != V32QImode)
453 {
454 if (!MEM_P (op0))
455 {
456 orig_op0 = op0;
457 op0 = gen_reg_rtx (V32QImode);
458 }
459 else
460 op0 = gen_lowpart (V32QImode, op0);
461 op1 = gen_lowpart (V32QImode, op1);
462 mode = V32QImode;
463 }
464 break;
465 case MODE_VECTOR_FLOAT:
466 break;
467 default:
468 gcc_unreachable ();
469 }
470
471 switch (mode)
472 {
473 default:
474 gcc_unreachable ();
475 case E_V32QImode:
476 extract = gen_avx_vextractf128v32qi;
477 mode = V16QImode;
478 break;
479 case E_V8SFmode:
480 extract = gen_avx_vextractf128v8sf;
481 mode = V4SFmode;
482 break;
483 case E_V4DFmode:
484 extract = gen_avx_vextractf128v4df;
485 mode = V2DFmode;
486 break;
487 }
488
489 if (MEM_P (op1))
490 {
491 rtx r = gen_reg_rtx (mode);
492 m = adjust_address (op1, mode, 0);
493 emit_move_insn (r, m);
494 m = adjust_address (op1, mode, 16);
495 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
496 emit_move_insn (op0, r);
497 }
498 else if (MEM_P (op0))
499 {
500 m = adjust_address (op0, mode, 0);
501 emit_insn (extract (m, op1, const0_rtx));
502 m = adjust_address (op0, mode, 16);
503 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
504 }
505 else
506 gcc_unreachable ();
507
508 if (orig_op0)
509 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
510 }
511
512 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
513 straight to ix86_expand_vector_move. */
514 /* Code generation for scalar reg-reg moves of single and double precision data:
515 if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
516 movaps reg, reg
517 else
518 movss reg, reg
519 if (x86_sse_partial_reg_dependency == true)
520 movapd reg, reg
521 else
522 movsd reg, reg
523
524 Code generation for scalar loads of double precision data:
525 if (x86_sse_split_regs == true)
526 movlpd mem, reg (gas syntax)
527 else
528 movsd mem, reg
529
530 Code generation for unaligned packed loads of single precision data
531 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
532 if (x86_sse_unaligned_move_optimal)
533 movups mem, reg
534
535 if (x86_sse_partial_reg_dependency == true)
536 {
537 xorps reg, reg
538 movlps mem, reg
539 movhps mem+8, reg
540 }
541 else
542 {
543 movlps mem, reg
544 movhps mem+8, reg
545 }
546
547 Code generation for unaligned packed loads of double precision data
548 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
549 if (x86_sse_unaligned_move_optimal)
550 movupd mem, reg
551
552 if (x86_sse_split_regs == true)
553 {
554 movlpd mem, reg
555 movhpd mem+8, reg
556 }
557 else
558 {
559 movsd mem, reg
560 movhpd mem+8, reg
561 }
562 */
563
564 void
565 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
566 {
567 rtx op0, op1, m;
568
569 op0 = operands[0];
570 op1 = operands[1];
571
572 /* Use unaligned load/store for AVX512 or when optimizing for size. */
573 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
574 {
575 emit_insn (gen_rtx_SET (op0, op1));
576 return;
577 }
578
579 if (TARGET_AVX)
580 {
581 if (GET_MODE_SIZE (mode) == 32)
582 ix86_avx256_split_vector_move_misalign (op0, op1);
583 else
584 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
585 emit_insn (gen_rtx_SET (op0, op1));
586 return;
587 }
588
589 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
590 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
591 {
592 emit_insn (gen_rtx_SET (op0, op1));
593 return;
594 }
595
596 /* ??? If we have typed data, then it would appear that using
597 movdqu is the only way to get unaligned data loaded with
598 integer type. */
599 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
600 {
601 emit_insn (gen_rtx_SET (op0, op1));
602 return;
603 }
604
605 if (MEM_P (op1))
606 {
607 if (TARGET_SSE2 && mode == V2DFmode)
608 {
609 rtx zero;
610
611 /* When SSE registers are split into halves, we can avoid
612 writing to the top half twice. */
613 if (TARGET_SSE_SPLIT_REGS)
614 {
615 emit_clobber (op0);
616 zero = op0;
617 }
618 else
619 {
620 /* ??? Not sure about the best option for the Intel chips.
621 The following would seem to satisfy; the register is
622 entirely cleared, breaking the dependency chain. We
623 then store to the upper half, with a dependency depth
624 of one. A rumor has it that Intel recommends two movsd
625 followed by an unpacklpd, but this is unconfirmed. And
626 given that the dependency depth of the unpacklpd would
627 still be one, I'm not sure why this would be better. */
628 zero = CONST0_RTX (V2DFmode);
629 }
630
631 m = adjust_address (op1, DFmode, 0);
632 emit_insn (gen_sse2_loadlpd (op0, zero, m));
633 m = adjust_address (op1, DFmode, 8);
634 emit_insn (gen_sse2_loadhpd (op0, op0, m));
635 }
636 else
637 {
638 rtx t;
639
640 if (mode != V4SFmode)
641 t = gen_reg_rtx (V4SFmode);
642 else
643 t = op0;
644
645 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
646 emit_move_insn (t, CONST0_RTX (V4SFmode));
647 else
648 emit_clobber (t);
649
650 m = adjust_address (op1, V2SFmode, 0);
651 emit_insn (gen_sse_loadlps (t, t, m));
652 m = adjust_address (op1, V2SFmode, 8);
653 emit_insn (gen_sse_loadhps (t, t, m));
654 if (mode != V4SFmode)
655 emit_move_insn (op0, gen_lowpart (mode, t));
656 }
657 }
658 else if (MEM_P (op0))
659 {
660 if (TARGET_SSE2 && mode == V2DFmode)
661 {
662 m = adjust_address (op0, DFmode, 0);
663 emit_insn (gen_sse2_storelpd (m, op1));
664 m = adjust_address (op0, DFmode, 8);
665 emit_insn (gen_sse2_storehpd (m, op1));
666 }
667 else
668 {
669 if (mode != V4SFmode)
670 op1 = gen_lowpart (V4SFmode, op1);
671
672 m = adjust_address (op0, V2SFmode, 0);
673 emit_insn (gen_sse_storelps (m, op1));
674 m = adjust_address (op0, V2SFmode, 8);
675 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
676 }
677 }
678 else
679 gcc_unreachable ();
680 }
681
682 /* Move bits 64:95 to bits 32:63. */
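/* Viewed as a V4SI shuffle (an illustrative sketch): the VEC_SELECT
   below uses the selector {0, 2, 0, 0}, so the former element 2
   (bits 64:95) lands in element 1 (bits 32:63) while element 0 stays
   put; only the low 64 bits matter for the MMX result.  */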
683
684 void
685 ix86_move_vector_high_sse_to_mmx (rtx op)
686 {
687 rtx mask = gen_rtx_PARALLEL (VOIDmode,
688 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
689 GEN_INT (0), GEN_INT (0)));
690 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
691 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
692 rtx insn = gen_rtx_SET (dest, op);
693 emit_insn (insn);
694 }
695
696 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
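/* E.g. (illustrative): an MMX packsswb of two V4HI values is emitted
   as an SSE packsswb whose V8HI inputs hold the MMX values in their
   low halves; the wanted bytes then sit in elements 0..3 and 8..11 of
   the XMM result, and ix86_move_vector_high_sse_to_mmx shuffles the
   second group down next to the first.  */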
697
698 void
699 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
700 {
701 rtx op0 = operands[0];
702 rtx op1 = operands[1];
703 rtx op2 = operands[2];
704
705 machine_mode dmode = GET_MODE (op0);
706 machine_mode smode = GET_MODE (op1);
707 machine_mode inner_dmode = GET_MODE_INNER (dmode);
708 machine_mode inner_smode = GET_MODE_INNER (smode);
709
710 /* Get the corresponding SSE mode for destination. */
711 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
712 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
713 nunits).require ();
714 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
715 nunits / 2).require ();
716
717 /* Get the corresponding SSE mode for source. */
718 nunits = 16 / GET_MODE_SIZE (inner_smode);
719 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
720 nunits).require ();
721
722 /* Generate SSE pack with signed/unsigned saturation. */
723 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
724 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
725 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
726
727 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
728 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
729 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
730 op1, op2));
731 emit_insn (insn);
732
733 ix86_move_vector_high_sse_to_mmx (op0);
734 }
735
736 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
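/* E.g. (illustrative): MMX punpck{l,h}bw on V8QI values is done as a
   single SSE punpcklbw on the containing XMM registers; the low qword
   of the SSE result is the punpcklbw answer, and for HIGH_P the upper
   qword (the punpckhbw answer) is shuffled down to bits 0:63
   afterwards.  */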
737
738 void
739 ix86_split_mmx_punpck (rtx operands[], bool high_p)
740 {
741 rtx op0 = operands[0];
742 rtx op1 = operands[1];
743 rtx op2 = operands[2];
744 machine_mode mode = GET_MODE (op0);
745 rtx mask;
746 /* The corresponding SSE mode. */
747 machine_mode sse_mode, double_sse_mode;
748
749 switch (mode)
750 {
751 case E_V8QImode:
752 sse_mode = V16QImode;
753 double_sse_mode = V32QImode;
754 mask = gen_rtx_PARALLEL (VOIDmode,
755 gen_rtvec (16,
756 GEN_INT (0), GEN_INT (16),
757 GEN_INT (1), GEN_INT (17),
758 GEN_INT (2), GEN_INT (18),
759 GEN_INT (3), GEN_INT (19),
760 GEN_INT (4), GEN_INT (20),
761 GEN_INT (5), GEN_INT (21),
762 GEN_INT (6), GEN_INT (22),
763 GEN_INT (7), GEN_INT (23)));
764 break;
765
766 case E_V4HImode:
767 sse_mode = V8HImode;
768 double_sse_mode = V16HImode;
769 mask = gen_rtx_PARALLEL (VOIDmode,
770 gen_rtvec (8,
771 GEN_INT (0), GEN_INT (8),
772 GEN_INT (1), GEN_INT (9),
773 GEN_INT (2), GEN_INT (10),
774 GEN_INT (3), GEN_INT (11)));
775 break;
776
777 case E_V2SImode:
778 sse_mode = V4SImode;
779 double_sse_mode = V8SImode;
780 mask = gen_rtx_PARALLEL (VOIDmode,
781 gen_rtvec (4,
782 GEN_INT (0), GEN_INT (4),
783 GEN_INT (1), GEN_INT (5)));
784 break;
785
786 default:
787 gcc_unreachable ();
788 }
789
790 /* Generate SSE punpcklXX. */
791 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
792 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
793 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
794
795 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
796 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
797 rtx insn = gen_rtx_SET (dest, op2);
798 emit_insn (insn);
799
800 if (high_p)
801 {
802 /* Move bits 64:127 to bits 0:63. */
803 mask = gen_rtx_PARALLEL (VOIDmode,
804 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
805 GEN_INT (0), GEN_INT (0)));
806 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
807 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
808 insn = gen_rtx_SET (dest, op1);
809 emit_insn (insn);
810 }
811 }
812
813 /* Helper function of ix86_fixup_binary_operands to canonicalize
814 operand order. Returns true if the operands should be swapped. */
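/* Examples of the priorities below (illustrative): for "x = y + x"
   the sources are swapped so that src1 matches the destination, and
   for "x = 3 + y" they are swapped so that the immediate constant
   comes second.  */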
815
816 static bool
817 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
818 rtx operands[])
819 {
820 rtx dst = operands[0];
821 rtx src1 = operands[1];
822 rtx src2 = operands[2];
823
824 /* If the operation is not commutative, we can't do anything. */
825 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
826 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
827 return false;
828
829 /* Highest priority is that src1 should match dst. */
830 if (rtx_equal_p (dst, src1))
831 return false;
832 if (rtx_equal_p (dst, src2))
833 return true;
834
835 /* Next highest priority is that immediate constants come second. */
836 if (immediate_operand (src2, mode))
837 return false;
838 if (immediate_operand (src1, mode))
839 return true;
840
841 /* Lowest priority is that memory references should come second. */
842 if (MEM_P (src2))
843 return false;
844 if (MEM_P (src1))
845 return true;
846
847 return false;
848 }
849
850
851 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
852 destination to use for the operation. If different from the true
853 destination in operands[0], a copy operation will be required. */
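/* Usage note (illustrative): for a memory destination with a
   non-matching first source, e.g. "mem = reg1 + reg2", a fresh pseudo
   is returned as the destination; the caller then emits the operation
   into it and copies the result to operands[0], as
   ix86_expand_binary_operator below does.  */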
854
855 rtx
856 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
857 rtx operands[])
858 {
859 rtx dst = operands[0];
860 rtx src1 = operands[1];
861 rtx src2 = operands[2];
862
863 /* Canonicalize operand order. */
864 if (ix86_swap_binary_operands_p (code, mode, operands))
865 {
866 /* It is invalid to swap operands of different modes. */
867 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
868
869 std::swap (src1, src2);
870 }
871
872 /* Both source operands cannot be in memory. */
873 if (MEM_P (src1) && MEM_P (src2))
874 {
875 /* Optimization: Only read from memory once. */
876 if (rtx_equal_p (src1, src2))
877 {
878 src2 = force_reg (mode, src2);
879 src1 = src2;
880 }
881 else if (rtx_equal_p (dst, src1))
882 src2 = force_reg (mode, src2);
883 else
884 src1 = force_reg (mode, src1);
885 }
886
887 /* If the destination is memory, and we do not have matching source
888 operands, do things in registers. */
889 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
890 dst = gen_reg_rtx (mode);
891
892 /* Source 1 cannot be a constant. */
893 if (CONSTANT_P (src1))
894 src1 = force_reg (mode, src1);
895
896 /* Source 1 cannot be a non-matching memory. */
897 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
898 src1 = force_reg (mode, src1);
899
900 /* Improve address combine. */
901 if (code == PLUS
902 && GET_MODE_CLASS (mode) == MODE_INT
903 && MEM_P (src2))
904 src2 = force_reg (mode, src2);
905
906 operands[1] = src1;
907 operands[2] = src2;
908 return dst;
909 }
910
911 /* Similarly, but assume that the destination has already been
912 set up properly. */
913
914 void
915 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
916 machine_mode mode, rtx operands[])
917 {
918 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
919 gcc_assert (dst == operands[0]);
920 }
921
922 /* Attempt to expand a binary operator. Make the expansion closer to the
923 actual machine than just general_operand, which would allow 3 separate
924 memory references (one output, two inputs) in a single insn. */
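/* The emitted RTL normally has the shape (an illustrative sketch)
     (parallel [(set DST (CODE:MODE SRC1 SRC2))
                (clobber (reg:CC FLAGS_REG))])
   except for the post-reload PLUS case with a non-matching destination
   below, which stays a plain SET so that it can become an lea.  */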
925
926 void
927 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
928 rtx operands[])
929 {
930 rtx src1, src2, dst, op, clob;
931
932 dst = ix86_fixup_binary_operands (code, mode, operands);
933 src1 = operands[1];
934 src2 = operands[2];
935
936 /* Emit the instruction. */
937
938 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
939
940 if (reload_completed
941 && code == PLUS
942 && !rtx_equal_p (dst, src1))
943 {
944 /* This is going to be an LEA; avoid splitting it later. */
945 emit_insn (op);
946 }
947 else
948 {
949 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
950 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
951 }
952
953 /* Fix up the destination if needed. */
954 if (dst != operands[0])
955 emit_move_insn (operands[0], dst);
956 }
957
958 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
959 the given OPERANDS. */
960
961 void
962 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
963 rtx operands[])
964 {
965 rtx op1 = NULL_RTX, op2 = NULL_RTX;
966 if (SUBREG_P (operands[1]))
967 {
968 op1 = operands[1];
969 op2 = operands[2];
970 }
971 else if (SUBREG_P (operands[2]))
972 {
973 op1 = operands[2];
974 op2 = operands[1];
975 }
976 /* Optimize (__m128i) d | (__m128i) e and similar code
977 when d and e are float vectors into float vector logical
978 insn. In C/C++ without using intrinsics there is no other way
979 to express vector logical operation on float vectors than
980 to cast them temporarily to integer vectors. */
981 if (op1
982 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
983 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
984 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
985 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
986 && SUBREG_BYTE (op1) == 0
987 && (GET_CODE (op2) == CONST_VECTOR
988 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
989 && SUBREG_BYTE (op2) == 0))
990 && can_create_pseudo_p ())
991 {
992 rtx dst;
993 switch (GET_MODE (SUBREG_REG (op1)))
994 {
995 case E_V4SFmode:
996 case E_V8SFmode:
997 case E_V16SFmode:
998 case E_V2DFmode:
999 case E_V4DFmode:
1000 case E_V8DFmode:
1001 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1002 if (GET_CODE (op2) == CONST_VECTOR)
1003 {
1004 op2 = gen_lowpart (GET_MODE (dst), op2);
1005 op2 = force_reg (GET_MODE (dst), op2);
1006 }
1007 else
1008 {
1009 op1 = operands[1];
1010 op2 = SUBREG_REG (operands[2]);
1011 if (!vector_operand (op2, GET_MODE (dst)))
1012 op2 = force_reg (GET_MODE (dst), op2);
1013 }
1014 op1 = SUBREG_REG (op1);
1015 if (!vector_operand (op1, GET_MODE (dst)))
1016 op1 = force_reg (GET_MODE (dst), op1);
1017 emit_insn (gen_rtx_SET (dst,
1018 gen_rtx_fmt_ee (code, GET_MODE (dst),
1019 op1, op2)));
1020 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1021 return;
1022 default:
1023 break;
1024 }
1025 }
1026 if (!vector_operand (operands[1], mode))
1027 operands[1] = force_reg (mode, operands[1]);
1028 if (!vector_operand (operands[2], mode))
1029 operands[2] = force_reg (mode, operands[2]);
1030 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1031 emit_insn (gen_rtx_SET (operands[0],
1032 gen_rtx_fmt_ee (code, mode, operands[1],
1033 operands[2])));
1034 }
1035
1036 /* Return TRUE or FALSE depending on whether the binary operator meets the
1037 appropriate constraints. */
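/* For example (an illustrative sketch; the exact constant set comes
   from constraint "L"): "reg = mem & 0xff" is accepted through the
   zero-extending AND special case below, while an operation with both
   sources in memory, or with a memory destination that does not match
   the first source, is rejected.  */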
1038
1039 bool
1040 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1041 rtx operands[3])
1042 {
1043 rtx dst = operands[0];
1044 rtx src1 = operands[1];
1045 rtx src2 = operands[2];
1046
1047 /* Both source operands cannot be in memory. */
1048 if (MEM_P (src1) && MEM_P (src2))
1049 return false;
1050
1051 /* Canonicalize operand order for commutative operators. */
1052 if (ix86_swap_binary_operands_p (code, mode, operands))
1053 std::swap (src1, src2);
1054
1055 /* If the destination is memory, we must have a matching source operand. */
1056 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1057 return false;
1058
1059 /* Source 1 cannot be a constant. */
1060 if (CONSTANT_P (src1))
1061 return false;
1062
1063 /* Source 1 cannot be a non-matching memory. */
1064 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1065 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1066 return (code == AND
1067 && (mode == HImode
1068 || mode == SImode
1069 || (TARGET_64BIT && mode == DImode))
1070 && satisfies_constraint_L (src2));
1071
1072 return true;
1073 }
1074
1075 /* Attempt to expand a unary operator. Make the expansion closer to the
1076 actual machine than just general_operand, which would allow 2 separate
1077 memory references (one output, one input) in a single insn. */
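/* Note (illustrative): NOT does not modify the flags on x86, so it is
   emitted as a plain SET below, while NEG and the remaining cases get
   the usual (clobber (reg:CC FLAGS_REG)) attached in a PARALLEL.  */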
1078
1079 void
1080 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1081 rtx operands[])
1082 {
1083 bool matching_memory = false;
1084 rtx src, dst, op, clob;
1085
1086 dst = operands[0];
1087 src = operands[1];
1088
1089 /* If the destination is memory, and we do not have matching source
1090 operands, do things in registers. */
1091 if (MEM_P (dst))
1092 {
1093 if (rtx_equal_p (dst, src))
1094 matching_memory = true;
1095 else
1096 dst = gen_reg_rtx (mode);
1097 }
1098
1099 /* When source operand is memory, destination must match. */
1100 if (MEM_P (src) && !matching_memory)
1101 src = force_reg (mode, src);
1102
1103 /* Emit the instruction. */
1104
1105 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1106
1107 if (code == NOT)
1108 emit_insn (op);
1109 else
1110 {
1111 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1112 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1113 }
1114
1115 /* Fix up the destination if needed. */
1116 if (dst != operands[0])
1117 emit_move_insn (operands[0], dst);
1118 }
1119
1120 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1121
1122 static void
1123 predict_jump (int prob)
1124 {
1125 rtx_insn *insn = get_last_insn ();
1126 gcc_assert (JUMP_P (insn));
1127 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1128 }
1129
1130 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1131 divisor are within the range [0-255]. */
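/* Rough shape of the emitted code (an illustrative sketch; register
   names and labels are placeholders):
     scratch = dividend | divisor
     test $-0x100, scratch
     je .Lqimode                   (both operands fit in 8 bits)
     <full-width divide>; jmp .Lend
   .Lqimode:
     <8-bit divide via udivmodhiqi3: quotient in AL, remainder in AH>
   .Lend:  */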
1132
1133 void
1134 ix86_split_idivmod (machine_mode mode, rtx operands[],
1135 bool unsigned_p)
1136 {
1137 rtx_code_label *end_label, *qimode_label;
1138 rtx div, mod;
1139 rtx_insn *insn;
1140 rtx scratch, tmp0, tmp1, tmp2;
1141 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1142
1143 operands[2] = force_reg (mode, operands[2]);
1144 operands[3] = force_reg (mode, operands[3]);
1145
1146 switch (mode)
1147 {
1148 case E_SImode:
1149 if (GET_MODE (operands[0]) == SImode)
1150 {
1151 if (GET_MODE (operands[1]) == SImode)
1152 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1153 else
1154 gen_divmod4_1
1155 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1156 }
1157 else
1158 gen_divmod4_1
1159 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1160 break;
1161
1162 case E_DImode:
1163 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1164 break;
1165
1166 default:
1167 gcc_unreachable ();
1168 }
1169
1170 end_label = gen_label_rtx ();
1171 qimode_label = gen_label_rtx ();
1172
1173 scratch = gen_reg_rtx (mode);
1174
1175 /* Use 8bit unsigned divmod if dividend and divisor are within
1176 the range [0-255]. */
1177 emit_move_insn (scratch, operands[2]);
1178 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1179 scratch, 1, OPTAB_DIRECT);
1180 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1181 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1182 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1183 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1184 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1185 pc_rtx);
1186 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1187 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1188 JUMP_LABEL (insn) = qimode_label;
1189
1190 /* Generate the original signed/unsigned divmod. */
1191 div = gen_divmod4_1 (operands[0], operands[1],
1192 operands[2], operands[3]);
1193 emit_insn (div);
1194
1195 /* Branch to the end. */
1196 emit_jump_insn (gen_jump (end_label));
1197 emit_barrier ();
1198
1199 /* Generate 8bit unsigned divide. */
1200 emit_label (qimode_label);
1201 /* Don't use operands[0] for result of 8bit divide since not all
1202 registers support QImode ZERO_EXTRACT. */
1203 tmp0 = lowpart_subreg (HImode, scratch, mode);
1204 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1205 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1206 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1207
1208 if (unsigned_p)
1209 {
1210 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1211 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1212 }
1213 else
1214 {
1215 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1216 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1217 }
1218 if (mode == SImode)
1219 {
1220 if (GET_MODE (operands[0]) != SImode)
1221 div = gen_rtx_ZERO_EXTEND (DImode, div);
1222 if (GET_MODE (operands[1]) != SImode)
1223 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1224 }
1225
1226 /* Extract remainder from AH. */
1227 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
1228 tmp0, GEN_INT (8), GEN_INT (8));
1229 if (REG_P (operands[1]))
1230 insn = emit_move_insn (operands[1], tmp1);
1231 else
1232 {
1233 /* Need a new scratch register since the old one holds the result
1234 of the 8bit divide. */
1235 scratch = gen_reg_rtx (GET_MODE (operands[1]));
1236 emit_move_insn (scratch, tmp1);
1237 insn = emit_move_insn (operands[1], scratch);
1238 }
1239 set_unique_reg_note (insn, REG_EQUAL, mod);
1240
1241 /* Zero extend quotient from AL. */
1242 tmp1 = gen_lowpart (QImode, tmp0);
1243 insn = emit_insn (gen_extend_insn
1244 (operands[0], tmp1,
1245 GET_MODE (operands[0]), QImode, 1));
1246 set_unique_reg_note (insn, REG_EQUAL, div);
1247
1248 emit_label (end_label);
1249 }
1250
1251 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1252 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1253
1254 void
1255 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1256 rtx dst, rtx src)
1257 {
1258 rtx op, clob;
1259
1260 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1261 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1262
1263 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1264 }
1265
1266 /* Return true if the def of REGNO1 is nearer to INSN than that of REGNO2. */
1267
1268 static bool
1269 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1270 {
1271 rtx_insn *prev = insn;
1272 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1273
1274 if (insn == start)
1275 return false;
1276 while (prev && prev != start)
1277 {
1278 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1279 {
1280 prev = PREV_INSN (prev);
1281 continue;
1282 }
1283 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1284 return true;
1285 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1286 return false;
1287 prev = PREV_INSN (prev);
1288 }
1289
1290 /* None of the regs is defined in the bb. */
1291 return false;
1292 }
1293
1294 /* Split an lea instruction into a sequence of instructions
1295 which are executed on the ALU to avoid AGU stalls.
1296 It is assumed that the flags register may be clobbered
1297 at the lea position. */
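/* Worked example (an illustrative sketch, not from the original
   sources): for "lea 8(%rbx,%rcx,4), %rax" the code below emits
   roughly
     mov %rcx, %rax
     shl $2, %rax
     add %rbx, %rax
     add $8, %rax
   whereas a scaled index whose base already matches the destination
   is lowered to repeated additions of the index instead of a
   multiply.  */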
1298
1299 void
1300 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1301 {
1302 unsigned int regno0, regno1, regno2;
1303 struct ix86_address parts;
1304 rtx target, tmp;
1305 int ok, adds;
1306
1307 ok = ix86_decompose_address (operands[1], &parts);
1308 gcc_assert (ok);
1309
1310 target = gen_lowpart (mode, operands[0]);
1311
1312 regno0 = true_regnum (target);
1313 regno1 = INVALID_REGNUM;
1314 regno2 = INVALID_REGNUM;
1315
1316 if (parts.base)
1317 {
1318 parts.base = gen_lowpart (mode, parts.base);
1319 regno1 = true_regnum (parts.base);
1320 }
1321
1322 if (parts.index)
1323 {
1324 parts.index = gen_lowpart (mode, parts.index);
1325 regno2 = true_regnum (parts.index);
1326 }
1327
1328 if (parts.disp)
1329 parts.disp = gen_lowpart (mode, parts.disp);
1330
1331 if (parts.scale > 1)
1332 {
1333 /* Case r1 = r1 + ... */
1334 if (regno1 == regno0)
1335 {
1336 /* If we have the case r1 = r1 + C * r2 then we
1337 would have to use multiplication, which is very
1338 expensive. Assume the cost model is wrong if we
1339 see such a case here. */
1340 gcc_assert (regno2 != regno0);
1341
1342 for (adds = parts.scale; adds > 0; adds--)
1343 ix86_emit_binop (PLUS, mode, target, parts.index);
1344 }
1345 else
1346 {
1347 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1348 if (regno0 != regno2)
1349 emit_insn (gen_rtx_SET (target, parts.index));
1350
1351 /* Use shift for scaling. */
1352 ix86_emit_binop (ASHIFT, mode, target,
1353 GEN_INT (exact_log2 (parts.scale)));
1354
1355 if (parts.base)
1356 ix86_emit_binop (PLUS, mode, target, parts.base);
1357
1358 if (parts.disp && parts.disp != const0_rtx)
1359 ix86_emit_binop (PLUS, mode, target, parts.disp);
1360 }
1361 }
1362 else if (!parts.base && !parts.index)
1363 {
1364 gcc_assert(parts.disp);
1365 emit_insn (gen_rtx_SET (target, parts.disp));
1366 }
1367 else
1368 {
1369 if (!parts.base)
1370 {
1371 if (regno0 != regno2)
1372 emit_insn (gen_rtx_SET (target, parts.index));
1373 }
1374 else if (!parts.index)
1375 {
1376 if (regno0 != regno1)
1377 emit_insn (gen_rtx_SET (target, parts.base));
1378 }
1379 else
1380 {
1381 if (regno0 == regno1)
1382 tmp = parts.index;
1383 else if (regno0 == regno2)
1384 tmp = parts.base;
1385 else
1386 {
1387 rtx tmp1;
1388
1389 /* Find the better operand for the SET instruction, depending
1390 on which definition is farther from the insn. */
1391 if (find_nearest_reg_def (insn, regno1, regno2))
1392 tmp = parts.index, tmp1 = parts.base;
1393 else
1394 tmp = parts.base, tmp1 = parts.index;
1395
1396 emit_insn (gen_rtx_SET (target, tmp));
1397
1398 if (parts.disp && parts.disp != const0_rtx)
1399 ix86_emit_binop (PLUS, mode, target, parts.disp);
1400
1401 ix86_emit_binop (PLUS, mode, target, tmp1);
1402 return;
1403 }
1404
1405 ix86_emit_binop (PLUS, mode, target, tmp);
1406 }
1407
1408 if (parts.disp && parts.disp != const0_rtx)
1409 ix86_emit_binop (PLUS, mode, target, parts.disp);
1410 }
1411 }
1412
1413 /* Post-reload splitter for converting an SF or DFmode value in an
1414 SSE register into an unsigned SImode value. */
1415
1416 void
1417 ix86_split_convert_uns_si_sse (rtx operands[])
1418 {
1419 machine_mode vecmode;
1420 rtx value, large, zero_or_two31, input, two31, x;
1421
1422 large = operands[1];
1423 zero_or_two31 = operands[2];
1424 input = operands[3];
1425 two31 = operands[4];
1426 vecmode = GET_MODE (large);
1427 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1428
1429 /* Load up the value into the low element. We must ensure that the other
1430 elements are valid floats -- zero is the easiest such value. */
1431 if (MEM_P (input))
1432 {
1433 if (vecmode == V4SFmode)
1434 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1435 else
1436 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1437 }
1438 else
1439 {
1440 input = gen_rtx_REG (vecmode, REGNO (input));
1441 emit_move_insn (value, CONST0_RTX (vecmode));
1442 if (vecmode == V4SFmode)
1443 emit_insn (gen_sse_movss (value, value, input));
1444 else
1445 emit_insn (gen_sse2_movsd (value, value, input));
1446 }
1447
1448 emit_move_insn (large, two31);
1449 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1450
1451 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1452 emit_insn (gen_rtx_SET (large, x));
1453
1454 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1455 emit_insn (gen_rtx_SET (zero_or_two31, x));
1456
1457 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1458 emit_insn (gen_rtx_SET (value, x));
1459
1460 large = gen_rtx_REG (V4SImode, REGNO (large));
1461 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1462
1463 x = gen_rtx_REG (V4SImode, REGNO (value));
1464 if (vecmode == V4SFmode)
1465 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1466 else
1467 emit_insn (gen_sse2_cvttpd2dq (x, value));
1468 value = x;
1469
1470 emit_insn (gen_xorv4si3 (value, value, large));
1471 }
1472
1473 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1474 machine_mode mode, rtx target,
1475 rtx var, int one_var);
1476
1477 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1478 Expects the 64-bit DImode to be supplied in a pair of integral
1479 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1480 -mfpmath=sse, !optimize_size only. */
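/* A sketch of the arithmetic used below (illustrative): for the input
   2^32 the low word 0 is combined with the exponent word 0x43300000
   and the high word 1 with 0x45300000, giving the doubles 0x1.0p52 + 0
   and 0x1.0p84 + 2^32; subtracting the 0x1.0p52/0x1.0p84 biases and
   adding the two lanes yields exactly 2^32.0.  */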
1481
1482 void
1483 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1484 {
1485 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1486 rtx int_xmm, fp_xmm;
1487 rtx biases, exponents;
1488 rtx x;
1489
1490 int_xmm = gen_reg_rtx (V4SImode);
1491 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1492 emit_insn (gen_movdi_to_sse (int_xmm, input));
1493 else if (TARGET_SSE_SPLIT_REGS)
1494 {
1495 emit_clobber (int_xmm);
1496 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1497 }
1498 else
1499 {
1500 x = gen_reg_rtx (V2DImode);
1501 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1502 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1503 }
1504
1505 x = gen_rtx_CONST_VECTOR (V4SImode,
1506 gen_rtvec (4, GEN_INT (0x43300000UL),
1507 GEN_INT (0x45300000UL),
1508 const0_rtx, const0_rtx));
1509 exponents = validize_mem (force_const_mem (V4SImode, x));
1510
1511 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1512 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1513
1514 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1515 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1516 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1517 (0x1.0p84 + double(fp_value_hi_xmm)).
1518 Note these exponents differ by 32. */
1519
1520 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1521
1522 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1523 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1524 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1525 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1526 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1527 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1528 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1529 biases = validize_mem (force_const_mem (V2DFmode, biases));
1530 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1531
1532 /* Add the upper and lower DFmode values together. */
1533 if (TARGET_SSE3)
1534 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1535 else
1536 {
1537 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1538 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1539 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1540 }
1541
1542 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1543 }
1544
1545 /* Not used, but eases macroization of patterns. */
1546 void
1547 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1548 {
1549 gcc_unreachable ();
1550 }
1551
1552 /* Convert an unsigned SImode value into a DFmode. Only currently used
1553 for SSE, but applicable anywhere. */
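/* The trick below in one line (illustrative): add -2^31 with
   wrap-around, convert as a signed value, then add 2^31.0 back.
   E.g. 0xffffffff becomes 0x7fffffff, which converts to 2147483647.0,
   and adding 2147483648.0 gives the exact result 4294967295.0.  */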
1554
1555 void
1556 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1557 {
1558 REAL_VALUE_TYPE TWO31r;
1559 rtx x, fp;
1560
1561 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1562 NULL, 1, OPTAB_DIRECT);
1563
1564 fp = gen_reg_rtx (DFmode);
1565 emit_insn (gen_floatsidf2 (fp, x));
1566
1567 real_ldexp (&TWO31r, &dconst1, 31);
1568 x = const_double_from_real_value (TWO31r, DFmode);
1569
1570 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1571 if (x != target)
1572 emit_move_insn (target, x);
1573 }
1574
1575 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1576 32-bit mode; otherwise we have a direct convert instruction. */
1577
1578 void
1579 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1580 {
1581 REAL_VALUE_TYPE TWO32r;
1582 rtx fp_lo, fp_hi, x;
1583
1584 fp_lo = gen_reg_rtx (DFmode);
1585 fp_hi = gen_reg_rtx (DFmode);
1586
1587 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1588
1589 real_ldexp (&TWO32r, &dconst1, 32);
1590 x = const_double_from_real_value (TWO32r, DFmode);
1591 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1592
1593 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1594
1595 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1596 0, OPTAB_DIRECT);
1597 if (x != target)
1598 emit_move_insn (target, x);
1599 }
1600
1601 /* Convert an unsigned SImode value into an SFmode value, using only SSE.
1602 For x86_32, -mfpmath=sse, !optimize_size only. */
1603 void
1604 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1605 {
1606 REAL_VALUE_TYPE ONE16r;
1607 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1608
1609 real_ldexp (&ONE16r, &dconst1, 16);
1610 x = const_double_from_real_value (ONE16r, SFmode);
1611 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1612 NULL, 0, OPTAB_DIRECT);
1613 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1614 NULL, 0, OPTAB_DIRECT);
1615 fp_hi = gen_reg_rtx (SFmode);
1616 fp_lo = gen_reg_rtx (SFmode);
1617 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1618 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1619 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1620 0, OPTAB_DIRECT);
1621 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1622 0, OPTAB_DIRECT);
1623 if (!rtx_equal_p (target, fp_hi))
1624 emit_move_insn (target, fp_hi);
1625 }
1626
1627 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1628 a vector of unsigned ints VAL to a vector of floats TARGET. */
1629
1630 void
1631 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1632 {
1633 rtx tmp[8];
1634 REAL_VALUE_TYPE TWO16r;
1635 machine_mode intmode = GET_MODE (val);
1636 machine_mode fltmode = GET_MODE (target);
1637 rtx (*cvt) (rtx, rtx);
1638
1639 if (intmode == V4SImode)
1640 cvt = gen_floatv4siv4sf2;
1641 else
1642 cvt = gen_floatv8siv8sf2;
1643 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1644 tmp[0] = force_reg (intmode, tmp[0]);
1645 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1646 OPTAB_DIRECT);
1647 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1648 NULL_RTX, 1, OPTAB_DIRECT);
1649 tmp[3] = gen_reg_rtx (fltmode);
1650 emit_insn (cvt (tmp[3], tmp[1]));
1651 tmp[4] = gen_reg_rtx (fltmode);
1652 emit_insn (cvt (tmp[4], tmp[2]));
1653 real_ldexp (&TWO16r, &dconst1, 16);
1654 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1655 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1656 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1657 OPTAB_DIRECT);
1658 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1659 OPTAB_DIRECT);
1660 if (tmp[7] != target)
1661 emit_move_insn (target, tmp[7]);
1662 }
1663
1664 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1665 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1666 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1667 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
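/* In scalar terms the transformation below is (an illustrative
   sketch):
     if (x < 0x1p31)  return x,           *XORP lane = 0;
     else             return x - 0x1p31,  *XORP lane = 0x80000000;
   so that the signed truncation of the returned value, xored with
   *XORP, equals the unsigned truncation of VAL.  */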
1668
1669 rtx
1670 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1671 {
1672 REAL_VALUE_TYPE TWO31r;
1673 rtx two31r, tmp[4];
1674 machine_mode mode = GET_MODE (val);
1675 machine_mode scalarmode = GET_MODE_INNER (mode);
1676 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1677 rtx (*cmp) (rtx, rtx, rtx, rtx);
1678 int i;
1679
1680 for (i = 0; i < 3; i++)
1681 tmp[i] = gen_reg_rtx (mode);
1682 real_ldexp (&TWO31r, &dconst1, 31);
1683 two31r = const_double_from_real_value (TWO31r, scalarmode);
1684 two31r = ix86_build_const_vector (mode, 1, two31r);
1685 two31r = force_reg (mode, two31r);
1686 switch (mode)
1687 {
1688 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1689 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1690 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1691 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1692 default: gcc_unreachable ();
1693 }
1694 tmp[3] = gen_rtx_LE (mode, two31r, val);
1695 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1696 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1697 0, OPTAB_DIRECT);
1698 if (intmode == V4SImode || TARGET_AVX2)
1699 *xorp = expand_simple_binop (intmode, ASHIFT,
1700 gen_lowpart (intmode, tmp[0]),
1701 GEN_INT (31), NULL_RTX, 0,
1702 OPTAB_DIRECT);
1703 else
1704 {
1705 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1706 two31 = ix86_build_const_vector (intmode, 1, two31);
1707 *xorp = expand_simple_binop (intmode, AND,
1708 gen_lowpart (intmode, tmp[0]),
1709 two31, NULL_RTX, 0,
1710 OPTAB_DIRECT);
1711 }
1712 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1713 0, OPTAB_DIRECT);
1714 }
1715
1716 /* Generate code for floating point ABS or NEG. */
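/* With SSE math these become mask operations (illustrative): NEG xors
   the value with a mask that has only the sign bit set (e.g. xorps
   with {-0.0, ...}), and ABS ands it with the inverted mask that
   clears the sign bit (e.g. andps).  */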
1717
1718 void
1719 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1720 rtx operands[])
1721 {
1722 rtx set, dst, src;
1723 bool use_sse = false;
1724 bool vector_mode = VECTOR_MODE_P (mode);
1725 machine_mode vmode = mode;
1726 rtvec par;
1727
1728 if (vector_mode)
1729 use_sse = true;
1730 else if (mode == TFmode)
1731 use_sse = true;
1732 else if (TARGET_SSE_MATH)
1733 {
1734 use_sse = SSE_FLOAT_MODE_P (mode);
1735 if (mode == SFmode)
1736 vmode = V4SFmode;
1737 else if (mode == DFmode)
1738 vmode = V2DFmode;
1739 }
1740
1741 dst = operands[0];
1742 src = operands[1];
1743
1744 set = gen_rtx_fmt_e (code, mode, src);
1745 set = gen_rtx_SET (dst, set);
1746
1747 if (use_sse)
1748 {
1749 rtx mask, use, clob;
1750
1751 /* NEG and ABS performed with SSE use bitwise mask operations.
1752 Create the appropriate mask now. */
1753 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1754 use = gen_rtx_USE (VOIDmode, mask);
1755 if (vector_mode)
1756 par = gen_rtvec (2, set, use);
1757 else
1758 {
1759 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1760 par = gen_rtvec (3, set, use, clob);
1761 }
1762 }
1763 else
1764 {
1765 rtx clob;
1766
1767 /* Changing the sign of FP values can also be done using the integer unit. */
1768 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1769 par = gen_rtvec (2, set, clob);
1770 }
1771
1772 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1773 }
1774
1775 /* Deconstruct a floating point ABS or NEG operation
1776 with integer registers into integer operations. */
1777
1778 void
1779 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1780 rtx operands[])
1781 {
1782 enum rtx_code absneg_op;
1783 rtx dst, set;
1784
1785 gcc_assert (operands_match_p (operands[0], operands[1]));
1786
1787 switch (mode)
1788 {
1789 case E_SFmode:
1790 dst = gen_lowpart (SImode, operands[0]);
1791
1792 if (code == ABS)
1793 {
1794 set = gen_int_mode (0x7fffffff, SImode);
1795 absneg_op = AND;
1796 }
1797 else
1798 {
1799 set = gen_int_mode (0x80000000, SImode);
1800 absneg_op = XOR;
1801 }
1802 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1803 break;
1804
1805 case E_DFmode:
1806 if (TARGET_64BIT)
1807 {
1808 dst = gen_lowpart (DImode, operands[0]);
1809 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1810
1811 if (code == ABS)
1812 set = const0_rtx;
1813 else
1814 set = gen_rtx_NOT (DImode, dst);
1815 }
1816 else
1817 {
1818 dst = gen_highpart (SImode, operands[0]);
1819
1820 if (code == ABS)
1821 {
1822 set = gen_int_mode (0x7fffffff, SImode);
1823 absneg_op = AND;
1824 }
1825 else
1826 {
1827 set = gen_int_mode (0x80000000, SImode);
1828 absneg_op = XOR;
1829 }
1830 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1831 }
1832 break;
1833
1834 case E_XFmode:
1835 dst = gen_rtx_REG (SImode,
1836 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1837 if (code == ABS)
1838 {
1839 set = GEN_INT (0x7fff);
1840 absneg_op = AND;
1841 }
1842 else
1843 {
1844 set = GEN_INT (0x8000);
1845 absneg_op = XOR;
1846 }
1847 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1848 break;
1849
1850 default:
1851 gcc_unreachable ();
1852 }
1853
1854 set = gen_rtx_SET (dst, set);
1855
1856 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1857 rtvec par = gen_rtvec (2, set, clob);
1858
1859 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1860 }
1861
1862 /* Expand a copysign operation. Special case operand 0 being a constant. */
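/* The underlying identity (illustrative):
     copysign (x, y) = (x & ~SIGNMASK) | (y & SIGNMASK)
   where SIGNMASK has only the sign bit set; when X is a constant its
   absolute value is precomputed below, so only the OR with the masked
   sign of Y remains for the splitter.  */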
1863
1864 void
1865 ix86_expand_copysign (rtx operands[])
1866 {
1867 machine_mode mode, vmode;
1868 rtx dest, op0, op1, mask;
1869
1870 dest = operands[0];
1871 op0 = operands[1];
1872 op1 = operands[2];
1873
1874 mode = GET_MODE (dest);
1875
1876 if (mode == SFmode)
1877 vmode = V4SFmode;
1878 else if (mode == DFmode)
1879 vmode = V2DFmode;
1880 else if (mode == TFmode)
1881 vmode = mode;
1882 else
1883 gcc_unreachable ();
1884
1885 mask = ix86_build_signbit_mask (vmode, 0, 0);
1886
1887 if (CONST_DOUBLE_P (op0))
1888 {
1889 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1890 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1891
1892 if (mode == SFmode || mode == DFmode)
1893 {
1894 if (op0 == CONST0_RTX (mode))
1895 op0 = CONST0_RTX (vmode);
1896 else
1897 {
1898 rtx v = ix86_build_const_vector (vmode, false, op0);
1899
1900 op0 = force_reg (vmode, v);
1901 }
1902 }
1903 else if (op0 != CONST0_RTX (mode))
1904 op0 = force_reg (mode, op0);
1905
1906 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1907 }
1908 else
1909 {
1910 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1911
1912 emit_insn (gen_copysign3_var
1913 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1914 }
1915 }
1916
1917 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1918 be a constant, and so has already been expanded into a vector constant. */
1919
1920 void
1921 ix86_split_copysign_const (rtx operands[])
1922 {
1923 machine_mode mode, vmode;
1924 rtx dest, op0, mask, x;
1925
1926 dest = operands[0];
1927 op0 = operands[1];
1928 mask = operands[3];
1929
1930 mode = GET_MODE (dest);
1931 vmode = GET_MODE (mask);
1932
1933 dest = lowpart_subreg (vmode, dest, mode);
1934 x = gen_rtx_AND (vmode, dest, mask);
1935 emit_insn (gen_rtx_SET (dest, x));
1936
1937 if (op0 != CONST0_RTX (vmode))
1938 {
1939 x = gen_rtx_IOR (vmode, dest, op0);
1940 emit_insn (gen_rtx_SET (dest, x));
1941 }
1942 }
1943
1944 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1945 so we have to do two masks. */
1946
1947 void
1948 ix86_split_copysign_var (rtx operands[])
1949 {
1950 machine_mode mode, vmode;
1951 rtx dest, scratch, op0, op1, mask, nmask, x;
1952
1953 dest = operands[0];
1954 scratch = operands[1];
1955 op0 = operands[2];
1956 op1 = operands[3];
1957 nmask = operands[4];
1958 mask = operands[5];
1959
1960 mode = GET_MODE (dest);
1961 vmode = GET_MODE (mask);
1962
1963 if (rtx_equal_p (op0, op1))
1964 {
1965 /* Shouldn't happen often (it's useless, obviously), but when it does
1966 we'd generate incorrect code if we continue below. */
1967 emit_move_insn (dest, op0);
1968 return;
1969 }
1970
1971 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1972 {
1973 gcc_assert (REGNO (op1) == REGNO (scratch));
1974
1975 x = gen_rtx_AND (vmode, scratch, mask);
1976 emit_insn (gen_rtx_SET (scratch, x));
1977
1978 dest = mask;
1979 op0 = lowpart_subreg (vmode, op0, mode);
1980 x = gen_rtx_NOT (vmode, dest);
1981 x = gen_rtx_AND (vmode, x, op0);
1982 emit_insn (gen_rtx_SET (dest, x));
1983 }
1984 else
1985 {
1986 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1987 {
1988 x = gen_rtx_AND (vmode, scratch, mask);
1989 }
1990 else /* alternative 2,4 */
1991 {
1992 gcc_assert (REGNO (mask) == REGNO (scratch));
1993 op1 = lowpart_subreg (vmode, op1, mode);
1994 x = gen_rtx_AND (vmode, scratch, op1);
1995 }
1996 emit_insn (gen_rtx_SET (scratch, x));
1997
1998 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1999 {
2000 dest = lowpart_subreg (vmode, op0, mode);
2001 x = gen_rtx_AND (vmode, dest, nmask);
2002 }
2003 else /* alternative 3,4 */
2004 {
2005 gcc_assert (REGNO (nmask) == REGNO (dest));
2006 dest = nmask;
2007 op0 = lowpart_subreg (vmode, op0, mode);
2008 x = gen_rtx_AND (vmode, dest, op0);
2009 }
2010 emit_insn (gen_rtx_SET (dest, x));
2011 }
2012
2013 x = gen_rtx_IOR (vmode, dest, scratch);
2014 emit_insn (gen_rtx_SET (dest, x));
2015 }
2016
2017 /* Expand an xorsign operation. */
2018
2019 void
2020 ix86_expand_xorsign (rtx operands[])
2021 {
2022 machine_mode mode, vmode;
2023 rtx dest, op0, op1, mask;
2024
2025 dest = operands[0];
2026 op0 = operands[1];
2027 op1 = operands[2];
2028
2029 mode = GET_MODE (dest);
2030
2031 if (mode == SFmode)
2032 vmode = V4SFmode;
2033 else if (mode == DFmode)
2034 vmode = V2DFmode;
2035 else
2036 gcc_unreachable ();
2037
2038 mask = ix86_build_signbit_mask (vmode, 0, 0);
2039
2040 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2041 }
2042
2043 /* Deconstruct an xorsign operation into bit masks. */
2044
2045 void
2046 ix86_split_xorsign (rtx operands[])
2047 {
2048 machine_mode mode, vmode;
2049 rtx dest, op0, mask, x;
2050
2051 dest = operands[0];
2052 op0 = operands[1];
2053 mask = operands[3];
2054
2055 mode = GET_MODE (dest);
2056 vmode = GET_MODE (mask);
2057
2058 dest = lowpart_subreg (vmode, dest, mode);
2059 x = gen_rtx_AND (vmode, dest, mask);
2060 emit_insn (gen_rtx_SET (dest, x));
2061
2062 op0 = lowpart_subreg (vmode, op0, mode);
2063 x = gen_rtx_XOR (vmode, dest, op0);
2064 emit_insn (gen_rtx_SET (dest, x));
2065 }
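/* A sketch of the semantics implemented above: with SIGNMASK the
   sign-bit mask,

	xorsign (x, y) = x ^ (y & SIGNMASK)

   so the sign bit of X is flipped exactly when the sign bit of Y is set.  */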
2066
2067 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2068
2069 void
2070 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2071 {
2072 machine_mode mode = GET_MODE (op0);
2073 rtx tmp;
2074
2075 /* Handle the special case of a vector comparison with a boolean result;
2076 transform it using the ptest instruction. */
2077 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2078 {
2079 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2080 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2081
2082 gcc_assert (code == EQ || code == NE);
2083 /* Generate XOR since we can't check that one operand is zero vector. */
2084 tmp = gen_reg_rtx (mode);
2085 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2086 tmp = gen_lowpart (p_mode, tmp);
2087 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2088 gen_rtx_UNSPEC (CCmode,
2089 gen_rtvec (2, tmp, tmp),
2090 UNSPEC_PTEST)));
2091 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2092 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2093 gen_rtx_LABEL_REF (VOIDmode, label),
2094 pc_rtx);
2095 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2096 return;
2097 }
2098
2099 switch (mode)
2100 {
2101 case E_SFmode:
2102 case E_DFmode:
2103 case E_XFmode:
2104 case E_QImode:
2105 case E_HImode:
2106 case E_SImode:
2107 simple:
2108 tmp = ix86_expand_compare (code, op0, op1);
2109 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2110 gen_rtx_LABEL_REF (VOIDmode, label),
2111 pc_rtx);
2112 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2113 return;
2114
2115 case E_DImode:
2116 if (TARGET_64BIT)
2117 goto simple;
2118 /* On a 32-bit target a DImode comparison may be performed in
2119 SSE registers. To allow this we must avoid splitting into
2120 SImode, which is achieved by doing the xor in DImode and
2121 then comparing against zero (a pattern recognized by the
2122 STV pass). We don't use the xor form when optimizing
2123 for size. */
2124 if (!optimize_insn_for_size_p ()
2125 && TARGET_STV
2126 && (code == EQ || code == NE))
2127 {
2128 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2129 op1 = const0_rtx;
2130 }
2131 /* FALLTHRU */
2132 case E_TImode:
2133 /* Expand DImode branch into multiple compare+branch. */
2134 {
2135 rtx lo[2], hi[2];
2136 rtx_code_label *label2;
2137 enum rtx_code code1, code2, code3;
2138 machine_mode submode;
2139
2140 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2141 {
2142 std::swap (op0, op1);
2143 code = swap_condition (code);
2144 }
2145
2146 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2147 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2148
2149 submode = mode == DImode ? SImode : DImode;
2150
2151 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2152 avoid two branches. This costs one extra insn, so disable when
2153 optimizing for size. */
2154
2155 if ((code == EQ || code == NE)
2156 && (!optimize_insn_for_size_p ()
2157 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2158 {
2159 rtx xor0, xor1;
2160
2161 xor1 = hi[0];
2162 if (hi[1] != const0_rtx)
2163 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2164 NULL_RTX, 0, OPTAB_WIDEN);
2165
2166 xor0 = lo[0];
2167 if (lo[1] != const0_rtx)
2168 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2169 NULL_RTX, 0, OPTAB_WIDEN);
2170
2171 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2172 NULL_RTX, 0, OPTAB_WIDEN);
2173
2174 ix86_expand_branch (code, tmp, const0_rtx, label);
2175 return;
2176 }
2177
2178 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2179 op1 is a constant and the low word is zero, then we can just
2180 examine the high word. Similarly for low word -1 and
2181 less-or-equal-than or greater-than. */
2182
2183 if (CONST_INT_P (hi[1]))
2184 switch (code)
2185 {
2186 case LT: case LTU: case GE: case GEU:
2187 if (lo[1] == const0_rtx)
2188 {
2189 ix86_expand_branch (code, hi[0], hi[1], label);
2190 return;
2191 }
2192 break;
2193 case LE: case LEU: case GT: case GTU:
2194 if (lo[1] == constm1_rtx)
2195 {
2196 ix86_expand_branch (code, hi[0], hi[1], label);
2197 return;
2198 }
2199 break;
2200 default:
2201 break;
2202 }
2203
2204 /* Emulate comparisons that do not depend on Zero flag with
2205 double-word subtraction. Note that only Overflow, Sign
2206 and Carry flags are valid, so swap arguments and condition
2207 of comparisons that would otherwise test Zero flag. */
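/* For example (an illustrative sketch, not literal output), an unsigned
   32-bit DImode "a < b" is emulated without looking at the Zero flag as

	cmpl	lo(b), lo(a)	# compare low words, producing a borrow
	sbbl	hi(b), hi(a)	# subtract high words with borrow
	jb	label		# branch on the resulting carry

   The signed variants use the CCGZ flavour of the subtract-with-borrow
   pattern and test the Sign/Overflow flags instead.  */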
2208
2209 switch (code)
2210 {
2211 case LE: case LEU: case GT: case GTU:
2212 std::swap (lo[0], lo[1]);
2213 std::swap (hi[0], hi[1]);
2214 code = swap_condition (code);
2215 /* FALLTHRU */
2216
2217 case LT: case LTU: case GE: case GEU:
2218 {
2219 bool uns = (code == LTU || code == GEU);
2220 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2221 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2222
2223 if (!nonimmediate_operand (lo[0], submode))
2224 lo[0] = force_reg (submode, lo[0]);
2225 if (!x86_64_general_operand (lo[1], submode))
2226 lo[1] = force_reg (submode, lo[1]);
2227
2228 if (!register_operand (hi[0], submode))
2229 hi[0] = force_reg (submode, hi[0]);
2230 if ((uns && !nonimmediate_operand (hi[1], submode))
2231 || (!uns && !x86_64_general_operand (hi[1], submode)))
2232 hi[1] = force_reg (submode, hi[1]);
2233
2234 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2235
2236 tmp = gen_rtx_SCRATCH (submode);
2237 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2238
2239 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2240 ix86_expand_branch (code, tmp, const0_rtx, label);
2241 return;
2242 }
2243
2244 default:
2245 break;
2246 }
2247
2248 /* Otherwise, we need two or three jumps. */
2249
2250 label2 = gen_label_rtx ();
2251
2252 code1 = code;
2253 code2 = swap_condition (code);
2254 code3 = unsigned_condition (code);
2255
2256 switch (code)
2257 {
2258 case LT: case GT: case LTU: case GTU:
2259 break;
2260
2261 case LE: code1 = LT; code2 = GT; break;
2262 case GE: code1 = GT; code2 = LT; break;
2263 case LEU: code1 = LTU; code2 = GTU; break;
2264 case GEU: code1 = GTU; code2 = LTU; break;
2265
2266 case EQ: code1 = UNKNOWN; code2 = NE; break;
2267 case NE: code2 = UNKNOWN; break;
2268
2269 default:
2270 gcc_unreachable ();
2271 }
2272
2273 /*
2274 * a < b =>
2275 * if (hi(a) < hi(b)) goto true;
2276 * if (hi(a) > hi(b)) goto false;
2277 * if (lo(a) < lo(b)) goto true;
2278 * false:
2279 */
2280
2281 if (code1 != UNKNOWN)
2282 ix86_expand_branch (code1, hi[0], hi[1], label);
2283 if (code2 != UNKNOWN)
2284 ix86_expand_branch (code2, hi[0], hi[1], label2);
2285
2286 ix86_expand_branch (code3, lo[0], lo[1], label);
2287
2288 if (code2 != UNKNOWN)
2289 emit_label (label2);
2290 return;
2291 }
2292
2293 default:
2294 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2295 goto simple;
2296 }
2297 }
2298
2299 /* Figure out whether to use unordered fp comparisons. */
2300
2301 static bool
2302 ix86_unordered_fp_compare (enum rtx_code code)
2303 {
2304 if (!TARGET_IEEE_FP)
2305 return false;
2306
2307 switch (code)
2308 {
2309 case LT:
2310 case LE:
2311 case GT:
2312 case GE:
2313 case LTGT:
2314 return false;
2315
2316 case EQ:
2317 case NE:
2318
2319 case UNORDERED:
2320 case ORDERED:
2321 case UNLT:
2322 case UNLE:
2323 case UNGT:
2324 case UNGE:
2325 case UNEQ:
2326 return true;
2327
2328 default:
2329 gcc_unreachable ();
2330 }
2331 }
2332
2333 /* Return a comparison we can do and that it is equivalent to
2334 swap_condition (code) apart possibly from orderedness.
2335 But, never change orderedness if TARGET_IEEE_FP, returning
2336 UNKNOWN in that case if necessary. */
2337
2338 static enum rtx_code
2339 ix86_fp_swap_condition (enum rtx_code code)
2340 {
2341 switch (code)
2342 {
2343 case GT: /* GTU - CF=0 & ZF=0 */
2344 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2345 case GE: /* GEU - CF=0 */
2346 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2347 case UNLT: /* LTU - CF=1 */
2348 return TARGET_IEEE_FP ? UNKNOWN : GT;
2349 case UNLE: /* LEU - CF=1 | ZF=1 */
2350 return TARGET_IEEE_FP ? UNKNOWN : GE;
2351 default:
2352 return swap_condition (code);
2353 }
2354 }
2355
2356 /* Return cost of comparison CODE using the best strategy for performance.
2357 All of the following functions use the number of instructions as the
2358 cost metric. In the future this should be tweaked to compute bytes for
2359 optimize_size and take into account the performance of various instructions on various CPUs. */
2360
2361 static int
2362 ix86_fp_comparison_cost (enum rtx_code code)
2363 {
2364 int arith_cost;
2365
2366 /* The cost of code using bit-twiddling on %ah. */
2367 switch (code)
2368 {
2369 case UNLE:
2370 case UNLT:
2371 case LTGT:
2372 case GT:
2373 case GE:
2374 case UNORDERED:
2375 case ORDERED:
2376 case UNEQ:
2377 arith_cost = 4;
2378 break;
2379 case LT:
2380 case NE:
2381 case EQ:
2382 case UNGE:
2383 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2384 break;
2385 case LE:
2386 case UNGT:
2387 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2388 break;
2389 default:
2390 gcc_unreachable ();
2391 }
2392
2393 switch (ix86_fp_comparison_strategy (code))
2394 {
2395 case IX86_FPCMP_COMI:
2396 return arith_cost > 4 ? 3 : 2;
2397 case IX86_FPCMP_SAHF:
2398 return arith_cost > 4 ? 4 : 3;
2399 default:
2400 return arith_cost;
2401 }
2402 }
2403
2404 /* Swap, force into registers, or otherwise massage the two operands
2405 to a fp comparison. The operands are updated in place; the new
2406 comparison code is returned. */
2407
2408 static enum rtx_code
2409 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2410 {
2411 bool unordered_compare = ix86_unordered_fp_compare (code);
2412 rtx op0 = *pop0, op1 = *pop1;
2413 machine_mode op_mode = GET_MODE (op0);
2414 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2415
2416 /* All of the unordered compare instructions only work on registers.
2417 The same is true of the fcomi compare instructions. The XFmode
2418 compare instructions require registers except when comparing
2419 against zero or when converting operand 1 from fixed point to
2420 floating point. */
2421
2422 if (!is_sse
2423 && (unordered_compare
2424 || (op_mode == XFmode
2425 && ! (standard_80387_constant_p (op0) == 1
2426 || standard_80387_constant_p (op1) == 1)
2427 && GET_CODE (op1) != FLOAT)
2428 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2429 {
2430 op0 = force_reg (op_mode, op0);
2431 op1 = force_reg (op_mode, op1);
2432 }
2433 else
2434 {
2435 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2436 things around if they appear profitable, otherwise force op0
2437 into a register. */
2438
2439 if (standard_80387_constant_p (op0) == 0
2440 || (MEM_P (op0)
2441 && ! (standard_80387_constant_p (op1) == 0
2442 || MEM_P (op1))))
2443 {
2444 enum rtx_code new_code = ix86_fp_swap_condition (code);
2445 if (new_code != UNKNOWN)
2446 {
2447 std::swap (op0, op1);
2448 code = new_code;
2449 }
2450 }
2451
2452 if (!REG_P (op0))
2453 op0 = force_reg (op_mode, op0);
2454
2455 if (CONSTANT_P (op1))
2456 {
2457 int tmp = standard_80387_constant_p (op1);
2458 if (tmp == 0)
2459 op1 = validize_mem (force_const_mem (op_mode, op1));
2460 else if (tmp == 1)
2461 {
2462 if (TARGET_CMOVE)
2463 op1 = force_reg (op_mode, op1);
2464 }
2465 else
2466 op1 = force_reg (op_mode, op1);
2467 }
2468 }
2469
2470 /* Try to rearrange the comparison to make it cheaper. */
2471 if (ix86_fp_comparison_cost (code)
2472 > ix86_fp_comparison_cost (swap_condition (code))
2473 && (REG_P (op1) || can_create_pseudo_p ()))
2474 {
2475 std::swap (op0, op1);
2476 code = swap_condition (code);
2477 if (!REG_P (op0))
2478 op0 = force_reg (op_mode, op0);
2479 }
2480
2481 *pop0 = op0;
2482 *pop1 = op1;
2483 return code;
2484 }
2485
2486 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2487
2488 static rtx
2489 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2490 {
2491 bool unordered_compare = ix86_unordered_fp_compare (code);
2492 machine_mode cmp_mode;
2493 rtx tmp, scratch;
2494
2495 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2496
2497 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2498 if (unordered_compare)
2499 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2500
2501 /* Do fcomi/sahf based test when profitable. */
2502 switch (ix86_fp_comparison_strategy (code))
2503 {
2504 case IX86_FPCMP_COMI:
2505 cmp_mode = CCFPmode;
2506 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2507 break;
2508
2509 case IX86_FPCMP_SAHF:
2510 cmp_mode = CCFPmode;
2511 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2512 scratch = gen_reg_rtx (HImode);
2513 emit_insn (gen_rtx_SET (scratch, tmp));
2514 emit_insn (gen_x86_sahf_1 (scratch));
2515 break;
2516
2517 case IX86_FPCMP_ARITH:
2518 cmp_mode = CCNOmode;
2519 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2520 scratch = gen_reg_rtx (HImode);
2521 emit_insn (gen_rtx_SET (scratch, tmp));
2522
2523 /* In the unordered case, we have to check C2 for NaN's, which
2524 doesn't happen to work out to anything nice combination-wise.
2525 So do some bit twiddling on the value we've got in AH to come
2526 up with an appropriate set of condition codes. */
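/* As a reminder (a sketch, not generated code): after "fnstsw %ax" the
   x87 condition bits land in %ah as C0 = 0x01, C2 = 0x04 and C3 = 0x40,
   so the constants below are combinations of those bits. E.g. "a > b"
   without IEEE concerns just checks that all three are clear:

	fnstsw	%ax
	testb	$0x45, %ah
	je	...		# taken when a > b

   and the unordered (NaN) case is detected by testing C2 = 0x04. The
   TARGET_IEEE_FP paths add the extra and/add/cmp steps so that unordered
   operands do not satisfy the ordered comparisons.  */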
2527
2528 switch (code)
2529 {
2530 case GT:
2531 case UNGT:
2532 if (code == GT || !TARGET_IEEE_FP)
2533 {
2534 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2535 code = EQ;
2536 }
2537 else
2538 {
2539 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2540 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2541 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2542 cmp_mode = CCmode;
2543 code = GEU;
2544 }
2545 break;
2546 case LT:
2547 case UNLT:
2548 if (code == LT && TARGET_IEEE_FP)
2549 {
2550 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2551 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2552 cmp_mode = CCmode;
2553 code = EQ;
2554 }
2555 else
2556 {
2557 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2558 code = NE;
2559 }
2560 break;
2561 case GE:
2562 case UNGE:
2563 if (code == GE || !TARGET_IEEE_FP)
2564 {
2565 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2566 code = EQ;
2567 }
2568 else
2569 {
2570 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2571 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2572 code = NE;
2573 }
2574 break;
2575 case LE:
2576 case UNLE:
2577 if (code == LE && TARGET_IEEE_FP)
2578 {
2579 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2580 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2581 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2582 cmp_mode = CCmode;
2583 code = LTU;
2584 }
2585 else
2586 {
2587 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2588 code = NE;
2589 }
2590 break;
2591 case EQ:
2592 case UNEQ:
2593 if (code == EQ && TARGET_IEEE_FP)
2594 {
2595 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2596 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2597 cmp_mode = CCmode;
2598 code = EQ;
2599 }
2600 else
2601 {
2602 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2603 code = NE;
2604 }
2605 break;
2606 case NE:
2607 case LTGT:
2608 if (code == NE && TARGET_IEEE_FP)
2609 {
2610 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2611 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2612 GEN_INT (0x40)));
2613 code = NE;
2614 }
2615 else
2616 {
2617 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2618 code = EQ;
2619 }
2620 break;
2621
2622 case UNORDERED:
2623 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2624 code = NE;
2625 break;
2626 case ORDERED:
2627 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2628 code = EQ;
2629 break;
2630
2631 default:
2632 gcc_unreachable ();
2633 }
2634 break;
2635
2636 default:
2637 gcc_unreachable ();
2638 }
2639
2640 /* Return the test that should be put into the flags user, i.e.
2641 the bcc, scc, or cmov instruction. */
2642 return gen_rtx_fmt_ee (code, VOIDmode,
2643 gen_rtx_REG (cmp_mode, FLAGS_REG),
2644 const0_rtx);
2645 }
2646
2647 /* Generate insn patterns to do an integer compare of OPERANDS. */
2648
2649 static rtx
2650 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2651 {
2652 machine_mode cmpmode;
2653 rtx tmp, flags;
2654
2655 cmpmode = SELECT_CC_MODE (code, op0, op1);
2656 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2657
2658 /* This is very simple, but making the interface the same as in the
2659 FP case makes the rest of the code easier. */
2660 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2661 emit_insn (gen_rtx_SET (flags, tmp));
2662
2663 /* Return the test that should be put into the flags user, i.e.
2664 the bcc, scc, or cmov instruction. */
2665 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2666 }
2667
2668 static rtx
2669 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2670 {
2671 rtx ret;
2672
2673 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2674 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2675
2676 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2677 {
2678 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2679 ret = ix86_expand_fp_compare (code, op0, op1);
2680 }
2681 else
2682 ret = ix86_expand_int_compare (code, op0, op1);
2683
2684 return ret;
2685 }
2686
2687 void
2688 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2689 {
2690 rtx ret;
2691
2692 gcc_assert (GET_MODE (dest) == QImode);
2693
2694 ret = ix86_expand_compare (code, op0, op1);
2695 PUT_MODE (ret, QImode);
2696 emit_insn (gen_rtx_SET (dest, ret));
2697 }
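/* A minimal usage sketch: for "dest = (a > b)" on signed integers this
   expands to a compare followed by a setcc of the QImode destination,
   e.g.

	cmpl	%esi, %edi
	setg	%al

   (register choices above are purely illustrative).  */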
2698
2699 /* Expand a comparison setting or clearing the carry flag. Return true when
2700 successful and set *POP to the comparison operation. */
2701 static bool
2702 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2703 {
2704 machine_mode mode
2705 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2706
2707 /* Do not handle double-mode compares that go through the special path. */
2708 if (mode == (TARGET_64BIT ? TImode : DImode))
2709 return false;
2710
2711 if (SCALAR_FLOAT_MODE_P (mode))
2712 {
2713 rtx compare_op;
2714 rtx_insn *compare_seq;
2715
2716 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2717
2718 /* Shortcut: the following common codes never translate
2719 into carry-flag compares. */
2720 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2721 || code == ORDERED || code == UNORDERED)
2722 return false;
2723
2724 /* These comparisons require the Zero flag; swap the operands so they don't. */
2725 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2726 && !TARGET_IEEE_FP)
2727 {
2728 std::swap (op0, op1);
2729 code = swap_condition (code);
2730 }
2731
2732 /* Try to expand the comparison and verify that we end up with
2733 a carry-flag based comparison. This fails only when we decide
2734 to expand the comparison using arithmetic, which is not a
2735 common scenario. */
2736 start_sequence ();
2737 compare_op = ix86_expand_fp_compare (code, op0, op1);
2738 compare_seq = get_insns ();
2739 end_sequence ();
2740
2741 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2742 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2743 else
2744 code = GET_CODE (compare_op);
2745
2746 if (code != LTU && code != GEU)
2747 return false;
2748
2749 emit_insn (compare_seq);
2750 *pop = compare_op;
2751 return true;
2752 }
2753
2754 if (!INTEGRAL_MODE_P (mode))
2755 return false;
2756
2757 switch (code)
2758 {
2759 case LTU:
2760 case GEU:
2761 break;
2762
2763 /* Convert a==0 into (unsigned)a<1. */
2764 case EQ:
2765 case NE:
2766 if (op1 != const0_rtx)
2767 return false;
2768 op1 = const1_rtx;
2769 code = (code == EQ ? LTU : GEU);
2770 break;
2771
2772 /* Convert a>b into b<a or a>=b+1. */
2773 case GTU:
2774 case LEU:
2775 if (CONST_INT_P (op1))
2776 {
2777 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2778 /* Bail out on overflow. We still can swap operands but that
2779 would force loading of the constant into register. */
2780 if (op1 == const0_rtx
2781 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2782 return false;
2783 code = (code == GTU ? GEU : LTU);
2784 }
2785 else
2786 {
2787 std::swap (op0, op1);
2788 code = (code == GTU ? LTU : GEU);
2789 }
2790 break;
2791
2792 /* Convert a>=0 into (unsigned)a<0x80000000. */
2793 case LT:
2794 case GE:
2795 if (mode == DImode || op1 != const0_rtx)
2796 return false;
2797 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2798 code = (code == LT ? GEU : LTU);
2799 break;
2800 case LE:
2801 case GT:
2802 if (mode == DImode || op1 != constm1_rtx)
2803 return false;
2804 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2805 code = (code == LE ? GEU : LTU);
2806 break;
2807
2808 default:
2809 return false;
2810 }
2811 /* Swapping operands may cause a constant to appear as the first operand. */
2812 if (!nonimmediate_operand (op0, VOIDmode))
2813 {
2814 if (!can_create_pseudo_p ())
2815 return false;
2816 op0 = force_reg (mode, op0);
2817 }
2818 *pop = ix86_expand_compare (code, op0, op1);
2819 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2820 return true;
2821 }
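/* For instance (a sketch of the conversions above, not literal output),
   "a == 0" is rewritten as the unsigned "a < 1", which a consumer such as
   ix86_expand_int_addcc can turn into

	cmpl	$1, %eax	# CF = (a == 0)
	sbbl	%edx, %edx	# %edx = a == 0 ? -1 : 0

   collapsing the whole test into the carry flag.  */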
2822
2823 /* Expand conditional increment or decrement using adc/sbb instructions.
2824 The default case using setcc followed by the conditional move can be
2825 done by generic code. */
2826 bool
2827 ix86_expand_int_addcc (rtx operands[])
2828 {
2829 enum rtx_code code = GET_CODE (operands[1]);
2830 rtx flags;
2831 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2832 rtx compare_op;
2833 rtx val = const0_rtx;
2834 bool fpcmp = false;
2835 machine_mode mode;
2836 rtx op0 = XEXP (operands[1], 0);
2837 rtx op1 = XEXP (operands[1], 1);
2838
2839 if (operands[3] != const1_rtx
2840 && operands[3] != constm1_rtx)
2841 return false;
2842 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2843 return false;
2844 code = GET_CODE (compare_op);
2845
2846 flags = XEXP (compare_op, 0);
2847
2848 if (GET_MODE (flags) == CCFPmode)
2849 {
2850 fpcmp = true;
2851 code = ix86_fp_compare_code_to_integer (code);
2852 }
2853
2854 if (code != LTU)
2855 {
2856 val = constm1_rtx;
2857 if (fpcmp)
2858 PUT_CODE (compare_op,
2859 reverse_condition_maybe_unordered
2860 (GET_CODE (compare_op)));
2861 else
2862 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2863 }
2864
2865 mode = GET_MODE (operands[0]);
2866
2867 /* Construct either adc or sbb insn. */
2868 if ((code == LTU) == (operands[3] == constm1_rtx))
2869 insn = gen_sub3_carry;
2870 else
2871 insn = gen_add3_carry;
2872
2873 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2874
2875 return true;
2876 }
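/* A sketch of the kind of code this enables, assuming the comparison maps
   onto the carry flag as above: "if (a < b) x++;" on unsigned operands
   becomes the branchless

	cmpl	%ebx, %eax	# CF = (a < b)
	adcl	$0, %ecx	# x += CF

   with the matching sbb form used for a conditional decrement.  */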
2877
2878 bool
2879 ix86_expand_int_movcc (rtx operands[])
2880 {
2881 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2882 rtx_insn *compare_seq;
2883 rtx compare_op;
2884 machine_mode mode = GET_MODE (operands[0]);
2885 bool sign_bit_compare_p = false;
2886 rtx op0 = XEXP (operands[1], 0);
2887 rtx op1 = XEXP (operands[1], 1);
2888
2889 if (GET_MODE (op0) == TImode
2890 || (GET_MODE (op0) == DImode
2891 && !TARGET_64BIT))
2892 return false;
2893
2894 start_sequence ();
2895 compare_op = ix86_expand_compare (code, op0, op1);
2896 compare_seq = get_insns ();
2897 end_sequence ();
2898
2899 compare_code = GET_CODE (compare_op);
2900
2901 if ((op1 == const0_rtx && (code == GE || code == LT))
2902 || (op1 == constm1_rtx && (code == GT || code == LE)))
2903 sign_bit_compare_p = true;
2904
2905 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2906 HImode insns, we'd be swallowed in word prefix ops. */
2907
2908 if ((mode != HImode || TARGET_FAST_PREFIX)
2909 && (mode != (TARGET_64BIT ? TImode : DImode))
2910 && CONST_INT_P (operands[2])
2911 && CONST_INT_P (operands[3]))
2912 {
2913 rtx out = operands[0];
2914 HOST_WIDE_INT ct = INTVAL (operands[2]);
2915 HOST_WIDE_INT cf = INTVAL (operands[3]);
2916 HOST_WIDE_INT diff;
2917
2918 diff = ct - cf;
2919 /* Sign-bit compares are better done using shifts than by using
2920 sbb. */
2921 if (sign_bit_compare_p
2922 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2923 {
2924 /* Detect overlap between destination and compare sources. */
2925 rtx tmp = out;
2926
2927 if (!sign_bit_compare_p)
2928 {
2929 rtx flags;
2930 bool fpcmp = false;
2931
2932 compare_code = GET_CODE (compare_op);
2933
2934 flags = XEXP (compare_op, 0);
2935
2936 if (GET_MODE (flags) == CCFPmode)
2937 {
2938 fpcmp = true;
2939 compare_code
2940 = ix86_fp_compare_code_to_integer (compare_code);
2941 }
2942
2943 /* To simplify rest of code, restrict to the GEU case. */
2944 if (compare_code == LTU)
2945 {
2946 std::swap (ct, cf);
2947 compare_code = reverse_condition (compare_code);
2948 code = reverse_condition (code);
2949 }
2950 else
2951 {
2952 if (fpcmp)
2953 PUT_CODE (compare_op,
2954 reverse_condition_maybe_unordered
2955 (GET_CODE (compare_op)));
2956 else
2957 PUT_CODE (compare_op,
2958 reverse_condition (GET_CODE (compare_op)));
2959 }
2960 diff = ct - cf;
2961
2962 if (reg_overlap_mentioned_p (out, op0)
2963 || reg_overlap_mentioned_p (out, op1))
2964 tmp = gen_reg_rtx (mode);
2965
2966 if (mode == DImode)
2967 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2968 else
2969 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2970 flags, compare_op));
2971 }
2972 else
2973 {
2974 if (code == GT || code == GE)
2975 code = reverse_condition (code);
2976 else
2977 {
2978 std::swap (ct, cf);
2979 diff = ct - cf;
2980 }
2981 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2982 }
2983
2984 if (diff == 1)
2985 {
2986 /*
2987 * cmpl op0,op1
2988 * sbbl dest,dest
2989 * [addl dest, ct]
2990 *
2991 * Size 5 - 8.
2992 */
2993 if (ct)
2994 tmp = expand_simple_binop (mode, PLUS,
2995 tmp, GEN_INT (ct),
2996 copy_rtx (tmp), 1, OPTAB_DIRECT);
2997 }
2998 else if (cf == -1)
2999 {
3000 /*
3001 * cmpl op0,op1
3002 * sbbl dest,dest
3003 * orl $ct, dest
3004 *
3005 * Size 8.
3006 */
3007 tmp = expand_simple_binop (mode, IOR,
3008 tmp, GEN_INT (ct),
3009 copy_rtx (tmp), 1, OPTAB_DIRECT);
3010 }
3011 else if (diff == -1 && ct)
3012 {
3013 /*
3014 * cmpl op0,op1
3015 * sbbl dest,dest
3016 * notl dest
3017 * [addl dest, cf]
3018 *
3019 * Size 8 - 11.
3020 */
3021 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3022 if (cf)
3023 tmp = expand_simple_binop (mode, PLUS,
3024 copy_rtx (tmp), GEN_INT (cf),
3025 copy_rtx (tmp), 1, OPTAB_DIRECT);
3026 }
3027 else
3028 {
3029 /*
3030 * cmpl op0,op1
3031 * sbbl dest,dest
3032 * [notl dest]
3033 * andl cf - ct, dest
3034 * [addl dest, ct]
3035 *
3036 * Size 8 - 11.
3037 */
3038
3039 if (cf == 0)
3040 {
3041 cf = ct;
3042 ct = 0;
3043 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3044 }
3045
3046 tmp = expand_simple_binop (mode, AND,
3047 copy_rtx (tmp),
3048 gen_int_mode (cf - ct, mode),
3049 copy_rtx (tmp), 1, OPTAB_DIRECT);
3050 if (ct)
3051 tmp = expand_simple_binop (mode, PLUS,
3052 copy_rtx (tmp), GEN_INT (ct),
3053 copy_rtx (tmp), 1, OPTAB_DIRECT);
3054 }
3055
3056 if (!rtx_equal_p (tmp, out))
3057 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3058
3059 return true;
3060 }
3061
3062 if (diff < 0)
3063 {
3064 machine_mode cmp_mode = GET_MODE (op0);
3065 enum rtx_code new_code;
3066
3067 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3068 {
3069 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3070
3071 /* We may be reversing a non-trapping
3072 comparison to a trapping comparison. */
3073 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3074 && code != EQ && code != NE
3075 && code != ORDERED && code != UNORDERED)
3076 new_code = UNKNOWN;
3077 else
3078 new_code = reverse_condition_maybe_unordered (code);
3079 }
3080 else
3081 new_code = ix86_reverse_condition (code, cmp_mode);
3082 if (new_code != UNKNOWN)
3083 {
3084 std::swap (ct, cf);
3085 diff = -diff;
3086 code = new_code;
3087 }
3088 }
3089
3090 compare_code = UNKNOWN;
3091 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3092 && CONST_INT_P (op1))
3093 {
3094 if (op1 == const0_rtx
3095 && (code == LT || code == GE))
3096 compare_code = code;
3097 else if (op1 == constm1_rtx)
3098 {
3099 if (code == LE)
3100 compare_code = LT;
3101 else if (code == GT)
3102 compare_code = GE;
3103 }
3104 }
3105
3106 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3107 if (compare_code != UNKNOWN
3108 && GET_MODE (op0) == GET_MODE (out)
3109 && (cf == -1 || ct == -1))
3110 {
3111 /* If lea code below could be used, only optimize
3112 if it results in a 2 insn sequence. */
3113
3114 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3115 || diff == 3 || diff == 5 || diff == 9)
3116 || (compare_code == LT && ct == -1)
3117 || (compare_code == GE && cf == -1))
3118 {
3119 /*
3120 * notl op1 (if necessary)
3121 * sarl $31, op1
3122 * orl cf, op1
3123 */
3124 if (ct != -1)
3125 {
3126 cf = ct;
3127 ct = -1;
3128 code = reverse_condition (code);
3129 }
3130
3131 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3132
3133 out = expand_simple_binop (mode, IOR,
3134 out, GEN_INT (cf),
3135 out, 1, OPTAB_DIRECT);
3136 if (out != operands[0])
3137 emit_move_insn (operands[0], out);
3138
3139 return true;
3140 }
3141 }
3142
3143
3144 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3145 || diff == 3 || diff == 5 || diff == 9)
3146 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3147 && (mode != DImode
3148 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3149 {
3150 /*
3151 * xorl dest,dest
3152 * cmpl op1,op2
3153 * setcc dest
3154 * lea cf(dest*(ct-cf)),dest
3155 *
3156 * Size 14.
3157 *
3158 * This also catches the degenerate setcc-only case.
3159 */
3160
3161 rtx tmp;
3162 int nops;
3163
3164 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3165
3166 nops = 0;
3167 /* On x86_64 the lea instruction operates on Pmode, so we need
3168 the arithmetic done in the proper mode to match. */
3169 if (diff == 1)
3170 tmp = copy_rtx (out);
3171 else
3172 {
3173 rtx out1;
3174 out1 = copy_rtx (out);
3175 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3176 nops++;
3177 if (diff & 1)
3178 {
3179 tmp = gen_rtx_PLUS (mode, tmp, out1);
3180 nops++;
3181 }
3182 }
3183 if (cf != 0)
3184 {
3185 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
3186 nops++;
3187 }
3188 if (!rtx_equal_p (tmp, out))
3189 {
3190 if (nops == 1)
3191 out = force_operand (tmp, copy_rtx (out));
3192 else
3193 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3194 }
3195 if (!rtx_equal_p (out, operands[0]))
3196 emit_move_insn (operands[0], copy_rtx (out));
3197
3198 return true;
3199 }
3200
3201 /*
3202 * General case: Jumpful:
3203 * xorl dest,dest cmpl op1, op2
3204 * cmpl op1, op2 movl ct, dest
3205 * setcc dest jcc 1f
3206 * decl dest movl cf, dest
3207 * andl (cf-ct),dest 1:
3208 * addl ct,dest
3209 *
3210 * Size 20. Size 14.
3211 *
3212 * This is reasonably steep, but branch mispredict costs are
3213 * high on modern cpus, so consider failing only if optimizing
3214 * for space.
3215 */
3216
3217 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3218 && BRANCH_COST (optimize_insn_for_speed_p (),
3219 false) >= 2)
3220 {
3221 if (cf == 0)
3222 {
3223 machine_mode cmp_mode = GET_MODE (op0);
3224 enum rtx_code new_code;
3225
3226 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3227 {
3228 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3229
3230 /* We may be reversing a non-trapping
3231 comparison to a trapping comparison. */
3232 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3233 && code != EQ && code != NE
3234 && code != ORDERED && code != UNORDERED)
3235 new_code = UNKNOWN;
3236 else
3237 new_code = reverse_condition_maybe_unordered (code);
3238
3239 }
3240 else
3241 {
3242 new_code = ix86_reverse_condition (code, cmp_mode);
3243 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3244 compare_code = reverse_condition (compare_code);
3245 }
3246
3247 if (new_code != UNKNOWN)
3248 {
3249 cf = ct;
3250 ct = 0;
3251 code = new_code;
3252 }
3253 }
3254
3255 if (compare_code != UNKNOWN)
3256 {
3257 /* notl op1 (if needed)
3258 sarl $31, op1
3259 andl (cf-ct), op1
3260 addl ct, op1
3261
3262 For x < 0 (resp. x <= -1) there will be no notl,
3263 so if possible swap the constants to get rid of the
3264 complement.
3265 True/false will be -1/0 while code below (store flag
3266 followed by decrement) is 0/-1, so the constants need
3267 to be exchanged once more. */
3268
3269 if (compare_code == GE || !cf)
3270 {
3271 code = reverse_condition (code);
3272 compare_code = LT;
3273 }
3274 else
3275 std::swap (ct, cf);
3276
3277 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3278 }
3279 else
3280 {
3281 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3282
3283 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3284 constm1_rtx,
3285 copy_rtx (out), 1, OPTAB_DIRECT);
3286 }
3287
3288 out = expand_simple_binop (mode, AND, copy_rtx (out),
3289 gen_int_mode (cf - ct, mode),
3290 copy_rtx (out), 1, OPTAB_DIRECT);
3291 if (ct)
3292 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3293 copy_rtx (out), 1, OPTAB_DIRECT);
3294 if (!rtx_equal_p (out, operands[0]))
3295 emit_move_insn (operands[0], copy_rtx (out));
3296
3297 return true;
3298 }
3299 }
3300
3301 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3302 {
3303 /* Try a few things more with specific constants and a variable. */
3304
3305 optab op;
3306 rtx var, orig_out, out, tmp;
3307
3308 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3309 return false;
3310
3311 /* If one of the two operands is an interesting constant, load a
3312 constant with the above and mask it in with a logical operation. */
3313
3314 if (CONST_INT_P (operands[2]))
3315 {
3316 var = operands[3];
3317 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3318 operands[3] = constm1_rtx, op = and_optab;
3319 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3320 operands[3] = const0_rtx, op = ior_optab;
3321 else
3322 return false;
3323 }
3324 else if (CONST_INT_P (operands[3]))
3325 {
3326 var = operands[2];
3327 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3328 operands[2] = constm1_rtx, op = and_optab;
3329 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3330 operands[2] = const0_rtx, op = ior_optab;
3331 else
3332 return false;
3333 }
3334 else
3335 return false;
3336
3337 orig_out = operands[0];
3338 tmp = gen_reg_rtx (mode);
3339 operands[0] = tmp;
3340
3341 /* Recurse to get the constant loaded. */
3342 if (!ix86_expand_int_movcc (operands))
3343 return false;
3344
3345 /* Mask in the interesting variable. */
3346 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3347 OPTAB_WIDEN);
3348 if (!rtx_equal_p (out, orig_out))
3349 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3350
3351 return true;
3352 }
3353
3354 /*
3355 * For comparison with above,
3356 *
3357 * movl cf,dest
3358 * movl ct,tmp
3359 * cmpl op1,op2
3360 * cmovcc tmp,dest
3361 *
3362 * Size 15.
3363 */
3364
3365 if (! nonimmediate_operand (operands[2], mode))
3366 operands[2] = force_reg (mode, operands[2]);
3367 if (! nonimmediate_operand (operands[3], mode))
3368 operands[3] = force_reg (mode, operands[3]);
3369
3370 if (! register_operand (operands[2], VOIDmode)
3371 && (mode == QImode
3372 || ! register_operand (operands[3], VOIDmode)))
3373 operands[2] = force_reg (mode, operands[2]);
3374
3375 if (mode == QImode
3376 && ! register_operand (operands[3], VOIDmode))
3377 operands[3] = force_reg (mode, operands[3]);
3378
3379 emit_insn (compare_seq);
3380 emit_insn (gen_rtx_SET (operands[0],
3381 gen_rtx_IF_THEN_ELSE (mode,
3382 compare_op, operands[2],
3383 operands[3])));
3384 return true;
3385 }
3386
3387 /* Detect conditional moves that exactly match min/max operational
3388 semantics. Note that this is IEEE safe, as long as we don't
3389 interchange the operands.
3390
3391 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3392 and TRUE if the operation is successful and instructions are emitted. */
3393
3394 static bool
3395 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3396 rtx cmp_op1, rtx if_true, rtx if_false)
3397 {
3398 machine_mode mode;
3399 bool is_min;
3400 rtx tmp;
3401
3402 if (code == LT)
3403 ;
3404 else if (code == UNGE)
3405 std::swap (if_true, if_false);
3406 else
3407 return false;
3408
3409 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3410 is_min = true;
3411 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3412 is_min = false;
3413 else
3414 return false;
3415
3416 mode = GET_MODE (dest);
3417
3418 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3419 but MODE may be a vector mode and thus not appropriate. */
3420 if (!flag_finite_math_only || flag_signed_zeros)
3421 {
3422 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3423 rtvec v;
3424
3425 if_true = force_reg (mode, if_true);
3426 v = gen_rtvec (2, if_true, if_false);
3427 tmp = gen_rtx_UNSPEC (mode, v, u);
3428 }
3429 else
3430 {
3431 code = is_min ? SMIN : SMAX;
3432 if (MEM_P (if_true) && MEM_P (if_false))
3433 if_true = force_reg (mode, if_true);
3434 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3435 }
3436
3437 emit_insn (gen_rtx_SET (dest, tmp));
3438 return true;
3439 }
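/* For example (illustrative only), "d = a < b ? a : b" in SFmode under
   -ffinite-math-only -fno-signed-zeros matches the is_min case above and
   becomes a single

	minss	%xmm1, %xmm0

   whereas with the default semantics the UNSPEC_IEEE_MIN path is taken so
   that the operand order, and hence the NaN/-0.0 behaviour of the
   hardware min, is preserved.  */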
3440
3441 /* Return true if MODE is valid for a vector compare into a mask register;
3442 the same holds for a conditional vector move with a mask register. */
3443 static bool
3444 ix86_valid_mask_cmp_mode (machine_mode mode)
3445 {
3446 /* XOP has its own vector conditional movement. */
3447 if (TARGET_XOP && !TARGET_AVX512F)
3448 return false;
3449
3450 /* AVX512F is needed for mask operation. */
3451 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3452 return false;
3453
3454 /* AVX512BW is needed for vector QI/HImode,
3455 AVX512VL is needed for 128/256-bit vector. */
3456 machine_mode inner_mode = GET_MODE_INNER (mode);
3457 int vector_size = GET_MODE_SIZE (mode);
3458 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3459 return false;
3460
3461 return vector_size == 64 || TARGET_AVX512VL;
3462 }
3463
3464 /* Expand an SSE comparison. Return the register with the result. */
3465
3466 static rtx
3467 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3468 rtx op_true, rtx op_false)
3469 {
3470 machine_mode mode = GET_MODE (dest);
3471 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3472
3473 /* In the general case the result of the comparison can have a type different from that of the operands. */
3474 machine_mode cmp_mode;
3475
3476 /* In AVX512F the result of comparison is an integer mask. */
3477 bool maskcmp = false;
3478 rtx x;
3479
3480 if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3481 {
3482 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3483 maskcmp = true;
3484 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3485 }
3486 else
3487 cmp_mode = cmp_ops_mode;
3488
3489 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3490
3491 int (*op1_predicate)(rtx, machine_mode)
3492 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3493
3494 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3495 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3496
3497 if (optimize
3498 || (maskcmp && cmp_mode != mode)
3499 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3500 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3501 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3502
3503 if (maskcmp)
3504 {
3505 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3506 gcc_assert (ok);
3507 return dest;
3508 }
3509
3510 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3511
3512 if (cmp_mode != mode && !maskcmp)
3513 {
3514 x = force_reg (cmp_ops_mode, x);
3515 convert_move (dest, x, false);
3516 }
3517 else
3518 emit_insn (gen_rtx_SET (dest, x));
3519
3520 return dest;
3521 }
3522
3523 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3524 operations. This is used for both scalar and vector conditional moves. */
3525
3526 void
3527 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3528 {
3529 machine_mode mode = GET_MODE (dest);
3530 machine_mode cmpmode = GET_MODE (cmp);
3531
3532 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3533 if (rtx_equal_p (op_true, op_false))
3534 {
3535 emit_move_insn (dest, op_true);
3536 return;
3537 }
3538
3539 /* In AVX512F the result of comparison is an integer mask. */
3540 bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3541
3542 rtx t2, t3, x;
3543
3544 /* If we have an integer mask and FP value then we need
3545 to cast mask to FP mode. */
3546 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3547 {
3548 cmp = force_reg (cmpmode, cmp);
3549 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3550 }
3551
3552 if (maskcmp)
3553 {
3554 /* Using vector move with mask register. */
3555 cmp = force_reg (cmpmode, cmp);
3556 /* Optimize for mask zero. */
3557 op_true = (op_true != CONST0_RTX (mode)
3558 ? force_reg (mode, op_true) : op_true);
3559 op_false = (op_false != CONST0_RTX (mode)
3560 ? force_reg (mode, op_false) : op_false);
3561 if (op_true == CONST0_RTX (mode))
3562 {
3563 rtx (*gen_not) (rtx, rtx);
3564 switch (cmpmode)
3565 {
3566 case E_QImode: gen_not = gen_knotqi; break;
3567 case E_HImode: gen_not = gen_knothi; break;
3568 case E_SImode: gen_not = gen_knotsi; break;
3569 case E_DImode: gen_not = gen_knotdi; break;
3570 default: gcc_unreachable ();
3571 }
3572 rtx n = gen_reg_rtx (cmpmode);
3573 emit_insn (gen_not (n, cmp));
3574 cmp = n;
3575 /* Reverse op_true op_false. */
3576 std::swap (op_true, op_false);
3577 }
3578
3579 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3580 emit_insn (gen_rtx_SET (dest, vec_merge));
3581 return;
3582 }
3583 else if (vector_all_ones_operand (op_true, mode)
3584 && op_false == CONST0_RTX (mode))
3585 {
3586 emit_insn (gen_rtx_SET (dest, cmp));
3587 return;
3588 }
3589 else if (op_false == CONST0_RTX (mode))
3590 {
3591 op_true = force_reg (mode, op_true);
3592 x = gen_rtx_AND (mode, cmp, op_true);
3593 emit_insn (gen_rtx_SET (dest, x));
3594 return;
3595 }
3596 else if (op_true == CONST0_RTX (mode))
3597 {
3598 op_false = force_reg (mode, op_false);
3599 x = gen_rtx_NOT (mode, cmp);
3600 x = gen_rtx_AND (mode, x, op_false);
3601 emit_insn (gen_rtx_SET (dest, x));
3602 return;
3603 }
3604 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3605 {
3606 op_false = force_reg (mode, op_false);
3607 x = gen_rtx_IOR (mode, cmp, op_false);
3608 emit_insn (gen_rtx_SET (dest, x));
3609 return;
3610 }
3611 else if (TARGET_XOP)
3612 {
3613 op_true = force_reg (mode, op_true);
3614
3615 if (!nonimmediate_operand (op_false, mode))
3616 op_false = force_reg (mode, op_false);
3617
3618 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3619 op_true,
3620 op_false)));
3621 return;
3622 }
3623
3624 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3625 rtx d = dest;
3626
3627 if (!vector_operand (op_true, mode))
3628 op_true = force_reg (mode, op_true);
3629
3630 op_false = force_reg (mode, op_false);
3631
3632 switch (mode)
3633 {
3634 case E_V4SFmode:
3635 if (TARGET_SSE4_1)
3636 gen = gen_sse4_1_blendvps;
3637 break;
3638 case E_V2DFmode:
3639 if (TARGET_SSE4_1)
3640 gen = gen_sse4_1_blendvpd;
3641 break;
3642 case E_SFmode:
3643 if (TARGET_SSE4_1)
3644 {
3645 gen = gen_sse4_1_blendvss;
3646 op_true = force_reg (mode, op_true);
3647 }
3648 break;
3649 case E_DFmode:
3650 if (TARGET_SSE4_1)
3651 {
3652 gen = gen_sse4_1_blendvsd;
3653 op_true = force_reg (mode, op_true);
3654 }
3655 break;
3656 case E_V16QImode:
3657 case E_V8HImode:
3658 case E_V4SImode:
3659 case E_V2DImode:
3660 if (TARGET_SSE4_1)
3661 {
3662 gen = gen_sse4_1_pblendvb;
3663 if (mode != V16QImode)
3664 d = gen_reg_rtx (V16QImode);
3665 op_false = gen_lowpart (V16QImode, op_false);
3666 op_true = gen_lowpart (V16QImode, op_true);
3667 cmp = gen_lowpart (V16QImode, cmp);
3668 }
3669 break;
3670 case E_V8SFmode:
3671 if (TARGET_AVX)
3672 gen = gen_avx_blendvps256;
3673 break;
3674 case E_V4DFmode:
3675 if (TARGET_AVX)
3676 gen = gen_avx_blendvpd256;
3677 break;
3678 case E_V32QImode:
3679 case E_V16HImode:
3680 case E_V8SImode:
3681 case E_V4DImode:
3682 if (TARGET_AVX2)
3683 {
3684 gen = gen_avx2_pblendvb;
3685 if (mode != V32QImode)
3686 d = gen_reg_rtx (V32QImode);
3687 op_false = gen_lowpart (V32QImode, op_false);
3688 op_true = gen_lowpart (V32QImode, op_true);
3689 cmp = gen_lowpart (V32QImode, cmp);
3690 }
3691 break;
3692
3693 case E_V64QImode:
3694 gen = gen_avx512bw_blendmv64qi;
3695 break;
3696 case E_V32HImode:
3697 gen = gen_avx512bw_blendmv32hi;
3698 break;
3699 case E_V16SImode:
3700 gen = gen_avx512f_blendmv16si;
3701 break;
3702 case E_V8DImode:
3703 gen = gen_avx512f_blendmv8di;
3704 break;
3705 case E_V8DFmode:
3706 gen = gen_avx512f_blendmv8df;
3707 break;
3708 case E_V16SFmode:
3709 gen = gen_avx512f_blendmv16sf;
3710 break;
3711
3712 default:
3713 break;
3714 }
3715
3716 if (gen != NULL)
3717 {
3718 emit_insn (gen (d, op_false, op_true, cmp));
3719 if (d != dest)
3720 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3721 }
3722 else
3723 {
3724 op_true = force_reg (mode, op_true);
3725
3726 t2 = gen_reg_rtx (mode);
3727 if (optimize)
3728 t3 = gen_reg_rtx (mode);
3729 else
3730 t3 = dest;
3731
3732 x = gen_rtx_AND (mode, op_true, cmp);
3733 emit_insn (gen_rtx_SET (t2, x));
3734
3735 x = gen_rtx_NOT (mode, cmp);
3736 x = gen_rtx_AND (mode, x, op_false);
3737 emit_insn (gen_rtx_SET (t3, x));
3738
3739 x = gen_rtx_IOR (mode, t3, t2);
3740 emit_insn (gen_rtx_SET (dest, x));
3741 }
3742 }
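/* When no blend instruction applies, the final fallback above is the
   classic mask-merge idiom; as assembly it looks roughly like this for
   V4SImode without SSE4.1 (register assignments are illustrative):

	pand	%xmm2, %xmm1	# t2 = op_true & cmp
	pandn	%xmm3, %xmm2	# t3 = ~cmp & op_false
	por	%xmm1, %xmm2	# dest = t2 | t3

   The actual instructions come from the and/andn/ior patterns selected by
   the vector mode.  */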
3743
3744 /* Swap, force into registers, or otherwise massage the two operands
3745 to an sse comparison with a mask result. Thus we differ a bit from
3746 ix86_prepare_fp_compare_args which expects to produce a flags result.
3747
3748 The DEST operand exists to help determine whether to commute commutative
3749 operators. The POP0/POP1 operands are updated in place. The new
3750 comparison code is returned, or UNKNOWN if not implementable. */
3751
3752 static enum rtx_code
3753 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3754 rtx *pop0, rtx *pop1)
3755 {
3756 switch (code)
3757 {
3758 case LTGT:
3759 case UNEQ:
3760 /* AVX supports all the needed comparisons. */
3761 if (TARGET_AVX)
3762 break;
3763 /* We have no LTGT as an operator. We could implement it with
3764 NE & ORDERED, but this requires an extra temporary. It's
3765 not clear that it's worth it. */
3766 return UNKNOWN;
3767
3768 case LT:
3769 case LE:
3770 case UNGT:
3771 case UNGE:
3772 /* These are supported directly. */
3773 break;
3774
3775 case EQ:
3776 case NE:
3777 case UNORDERED:
3778 case ORDERED:
3779 /* AVX has 3 operand comparisons, no need to swap anything. */
3780 if (TARGET_AVX)
3781 break;
3782 /* For commutative operators, try to canonicalize the destination
3783 operand to be first in the comparison - this helps reload to
3784 avoid extra moves. */
3785 if (!dest || !rtx_equal_p (dest, *pop1))
3786 break;
3787 /* FALLTHRU */
3788
3789 case GE:
3790 case GT:
3791 case UNLE:
3792 case UNLT:
3793 /* These are not supported directly before AVX, and furthermore
3794 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3795 comparison operands to transform into something that is
3796 supported. */
3797 std::swap (*pop0, *pop1);
3798 code = swap_condition (code);
3799 break;
3800
3801 default:
3802 gcc_unreachable ();
3803 }
3804
3805 return code;
3806 }
3807
3808 /* Expand a floating-point conditional move. Return true if successful. */
3809
3810 bool
3811 ix86_expand_fp_movcc (rtx operands[])
3812 {
3813 machine_mode mode = GET_MODE (operands[0]);
3814 enum rtx_code code = GET_CODE (operands[1]);
3815 rtx tmp, compare_op;
3816 rtx op0 = XEXP (operands[1], 0);
3817 rtx op1 = XEXP (operands[1], 1);
3818
3819 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3820 {
3821 machine_mode cmode;
3822
3823 /* Since we've no cmove for sse registers, don't force bad register
3824 allocation just to gain access to it. Deny movcc when the
3825 comparison mode doesn't match the move mode. */
3826 cmode = GET_MODE (op0);
3827 if (cmode == VOIDmode)
3828 cmode = GET_MODE (op1);
3829 if (cmode != mode)
3830 return false;
3831
3832 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3833 if (code == UNKNOWN)
3834 return false;
3835
3836 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3837 operands[2], operands[3]))
3838 return true;
3839
3840 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3841 operands[2], operands[3]);
3842 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3843 return true;
3844 }
3845
3846 if (GET_MODE (op0) == TImode
3847 || (GET_MODE (op0) == DImode
3848 && !TARGET_64BIT))
3849 return false;
3850
3851 /* The floating point conditional move instructions don't directly
3852 support conditions resulting from a signed integer comparison. */
3853
3854 compare_op = ix86_expand_compare (code, op0, op1);
3855 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3856 {
3857 tmp = gen_reg_rtx (QImode);
3858 ix86_expand_setcc (tmp, code, op0, op1);
3859
3860 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3861 }
3862
3863 emit_insn (gen_rtx_SET (operands[0],
3864 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3865 operands[2], operands[3])));
3866
3867 return true;
3868 }
3869
3870 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3871
3872 static int
3873 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3874 {
3875 switch (code)
3876 {
3877 case EQ:
3878 return 0;
3879 case LT:
3880 case LTU:
3881 return 1;
3882 case LE:
3883 case LEU:
3884 return 2;
3885 case NE:
3886 return 4;
3887 case GE:
3888 case GEU:
3889 return 5;
3890 case GT:
3891 case GTU:
3892 return 6;
3893 default:
3894 gcc_unreachable ();
3895 }
3896 }
3897
3898 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3899
3900 static int
3901 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3902 {
3903 switch (code)
3904 {
3905 case EQ:
3906 return 0x00;
3907 case NE:
3908 return 0x04;
3909 case GT:
3910 return 0x0e;
3911 case LE:
3912 return 0x02;
3913 case GE:
3914 return 0x0d;
3915 case LT:
3916 return 0x01;
3917 case UNLE:
3918 return 0x0a;
3919 case UNLT:
3920 return 0x09;
3921 case UNGE:
3922 return 0x05;
3923 case UNGT:
3924 return 0x06;
3925 case UNEQ:
3926 return 0x18;
3927 case LTGT:
3928 return 0x0c;
3929 case ORDERED:
3930 return 0x07;
3931 case UNORDERED:
3932 return 0x03;
3933 default:
3934 gcc_unreachable ();
3935 }
3936 }
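/* These immediates are intended to match the vcmpps/vcmppd predicate
   encodings used by the AVX-512 compare-into-mask patterns (e.g. 0x0e for
   the ordered greater-than predicate and 0x03 for the quiet unordered
   one); see the UNSPEC_PCMP expansion below.  */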
3937
3938 /* Return immediate value to be used in UNSPEC_PCMP
3939 for comparison CODE in MODE. */
3940
3941 static int
3942 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3943 {
3944 if (FLOAT_MODE_P (mode))
3945 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3946 return ix86_int_cmp_code_to_pcmp_immediate (code);
3947 }
3948
3949 /* Expand AVX-512 vector comparison. */
3950
3951 bool
3952 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3953 {
3954 machine_mode mask_mode = GET_MODE (dest);
3955 machine_mode cmp_mode = GET_MODE (cmp_op0);
3956 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3957 int unspec_code;
3958 rtx unspec;
3959
3960 switch (code)
3961 {
3962 case LEU:
3963 case GTU:
3964 case GEU:
3965 case LTU:
3966 unspec_code = UNSPEC_UNSIGNED_PCMP;
3967 break;
3968
3969 default:
3970 unspec_code = UNSPEC_PCMP;
3971 }
3972
3973 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
3974 unspec_code);
3975 emit_insn (gen_rtx_SET (dest, unspec));
3976
3977 return true;
3978 }
3979
3980 /* Expand fp vector comparison. */
3981
3982 bool
3983 ix86_expand_fp_vec_cmp (rtx operands[])
3984 {
3985 enum rtx_code code = GET_CODE (operands[1]);
3986 rtx cmp;
3987
3988 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3989 &operands[2], &operands[3]);
3990 if (code == UNKNOWN)
3991 {
3992 rtx temp;
3993 switch (GET_CODE (operands[1]))
3994 {
3995 case LTGT:
3996 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
3997 operands[3], NULL, NULL);
3998 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
3999 operands[3], NULL, NULL);
4000 code = AND;
4001 break;
4002 case UNEQ:
4003 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4004 operands[3], NULL, NULL);
4005 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4006 operands[3], NULL, NULL);
4007 code = IOR;
4008 break;
4009 default:
4010 gcc_unreachable ();
4011 }
4012 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4013 OPTAB_DIRECT);
4014 }
4015 else
4016 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4017 operands[1], operands[2]);
4018
4019 if (operands[0] != cmp)
4020 emit_move_insn (operands[0], cmp);
4021
4022 return true;
4023 }
4024
4025 static rtx
4026 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4027 rtx op_true, rtx op_false, bool *negate)
4028 {
4029 machine_mode data_mode = GET_MODE (dest);
4030 machine_mode mode = GET_MODE (cop0);
4031 rtx x;
4032
4033 *negate = false;
4034
4035 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4036 if (TARGET_XOP
4037 && (mode == V16QImode || mode == V8HImode
4038 || mode == V4SImode || mode == V2DImode))
4039 ;
4040 /* AVX512F supports all of the comparisons
4041 on all 128/256/512-bit vector int types. */
4042 else if (ix86_valid_mask_cmp_mode (mode))
4043 ;
4044 else
4045 {
4046 /* Canonicalize the comparison to EQ, GT, GTU. */
4047 switch (code)
4048 {
4049 case EQ:
4050 case GT:
4051 case GTU:
4052 break;
4053
4054 case NE:
4055 case LE:
4056 case LEU:
4057 code = reverse_condition (code);
4058 *negate = true;
4059 break;
4060
4061 case GE:
4062 case GEU:
4063 code = reverse_condition (code);
4064 *negate = true;
4065 /* FALLTHRU */
4066
4067 case LT:
4068 case LTU:
4069 std::swap (cop0, cop1);
4070 code = swap_condition (code);
4071 break;
4072
4073 default:
4074 gcc_unreachable ();
4075 }
4076
4077 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4078 if (mode == V2DImode)
4079 {
4080 switch (code)
4081 {
4082 case EQ:
4083 /* SSE4.1 supports EQ. */
4084 if (!TARGET_SSE4_1)
4085 return NULL;
4086 break;
4087
4088 case GT:
4089 case GTU:
4090 /* SSE4.2 supports GT/GTU. */
4091 if (!TARGET_SSE4_2)
4092 return NULL;
4093 break;
4094
4095 default:
4096 gcc_unreachable ();
4097 }
4098 }
4099
4100 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4101 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4102 if (*negate)
4103 std::swap (optrue, opfalse);
4104
4105 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4106 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4107 min (x, y) == x). While we add one instruction (the minimum),
4108 we remove the two instructions otherwise needed for the negation,
4109 as the result already has the desired form.
4110 When using masks, do it for SI/DImode element types, as it is shorter
4111 than the two subtractions. */
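/* For example, the unsigned V4SImode selection x > y ? 0 : -1 (i.e.
   the mask for x <= y) becomes t = umin (x, y); mask = (t == x): a
   pminud followed by a pcmpeqd, rather than an unsigned greater-than
   compare whose result then has to be negated.  */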
4112 if ((code != EQ
4113 && GET_MODE_SIZE (mode) != 64
4114 && vector_all_ones_operand (opfalse, data_mode)
4115 && optrue == CONST0_RTX (data_mode))
4116 || (code == GTU
4117 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4118 /* Don't do it when not using integer masks if we would end up
4119 with the right values in the registers anyway. */
4120 && (GET_MODE_SIZE (mode) == 64
4121 || !vector_all_ones_operand (optrue, data_mode)
4122 || opfalse != CONST0_RTX (data_mode))))
4123 {
4124 rtx (*gen) (rtx, rtx, rtx) = NULL;
4125
4126 switch (mode)
4127 {
4128 case E_V16SImode:
4129 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4130 break;
4131 case E_V8DImode:
4132 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4133 cop0 = force_reg (mode, cop0);
4134 cop1 = force_reg (mode, cop1);
4135 break;
4136 case E_V32QImode:
4137 if (TARGET_AVX2)
4138 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4139 break;
4140 case E_V16HImode:
4141 if (TARGET_AVX2)
4142 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4143 break;
4144 case E_V8SImode:
4145 if (TARGET_AVX2)
4146 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4147 break;
4148 case E_V4DImode:
4149 if (TARGET_AVX512VL)
4150 {
4151 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4152 cop0 = force_reg (mode, cop0);
4153 cop1 = force_reg (mode, cop1);
4154 }
4155 break;
4156 case E_V16QImode:
4157 if (code == GTU && TARGET_SSE2)
4158 gen = gen_uminv16qi3;
4159 else if (code == GT && TARGET_SSE4_1)
4160 gen = gen_sminv16qi3;
4161 break;
4162 case E_V8HImode:
4163 if (code == GTU && TARGET_SSE4_1)
4164 gen = gen_uminv8hi3;
4165 else if (code == GT && TARGET_SSE2)
4166 gen = gen_sminv8hi3;
4167 break;
4168 case E_V4SImode:
4169 if (TARGET_SSE4_1)
4170 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4171 break;
4172 case E_V2DImode:
4173 if (TARGET_AVX512VL)
4174 {
4175 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4176 cop0 = force_reg (mode, cop0);
4177 cop1 = force_reg (mode, cop1);
4178 }
4179 break;
4180 default:
4181 break;
4182 }
4183
4184 if (gen)
4185 {
4186 rtx tem = gen_reg_rtx (mode);
4187 if (!vector_operand (cop0, mode))
4188 cop0 = force_reg (mode, cop0);
4189 if (!vector_operand (cop1, mode))
4190 cop1 = force_reg (mode, cop1);
4191 *negate = !*negate;
4192 emit_insn (gen (tem, cop0, cop1));
4193 cop1 = tem;
4194 code = EQ;
4195 }
4196 }
4197
4198 /* Unsigned parallel compare is not supported by the hardware.
4199 Play some tricks to turn this into a signed comparison
4200 against 0. */
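/* Two standard identities are used here: for SI/DI-sized elements,
   x >u y is equivalent to (x - 0x80..0) >s (y - 0x80..0), because
   subtracting the sign-bit constant merely flips the sign bit of each
   element; for QI/HI-sized elements, x >u y is equivalent to
   (x -us y) != 0, where -us is unsigned saturating subtraction.  */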
4201 if (code == GTU)
4202 {
4203 cop0 = force_reg (mode, cop0);
4204
4205 switch (mode)
4206 {
4207 case E_V16SImode:
4208 case E_V8DImode:
4209 case E_V8SImode:
4210 case E_V4DImode:
4211 case E_V4SImode:
4212 case E_V2DImode:
4213 {
4214 rtx t1, t2, mask;
4215
4216 /* Subtract (-(INT MAX) - 1) from both operands to make
4217 them signed. */
4218 mask = ix86_build_signbit_mask (mode, true, false);
4219 t1 = gen_reg_rtx (mode);
4220 emit_insn (gen_sub3_insn (t1, cop0, mask));
4221
4222 t2 = gen_reg_rtx (mode);
4223 emit_insn (gen_sub3_insn (t2, cop1, mask));
4224
4225 cop0 = t1;
4226 cop1 = t2;
4227 code = GT;
4228 }
4229 break;
4230
4231 case E_V64QImode:
4232 case E_V32HImode:
4233 case E_V32QImode:
4234 case E_V16HImode:
4235 case E_V16QImode:
4236 case E_V8HImode:
4237 /* Perform a parallel unsigned saturating subtraction. */
4238 x = gen_reg_rtx (mode);
4239 emit_insn (gen_rtx_SET
4240 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4241 cop0 = x;
4242 cop1 = CONST0_RTX (mode);
4243 code = EQ;
4244 *negate = !*negate;
4245 break;
4246
4247 default:
4248 gcc_unreachable ();
4249 }
4250 }
4251 }
4252
4253 if (*negate)
4254 std::swap (op_true, op_false);
4255
4256 /* Allow the comparison to be done in one mode, but the movcc to
4257 happen in another mode. */
4258 if (data_mode == mode)
4259 {
4260 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4261 op_true, op_false);
4262 }
4263 else
4264 {
4265 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4266 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4267 op_true, op_false);
4268 if (GET_MODE (x) == mode)
4269 x = gen_lowpart (data_mode, x);
4270 }
4271
4272 return x;
4273 }
4274
4275 /* Expand integer vector comparison. */
4276
4277 bool
4278 ix86_expand_int_vec_cmp (rtx operands[])
4279 {
4280 rtx_code code = GET_CODE (operands[1]);
4281 bool negate = false;
4282 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4283 operands[3], NULL, NULL, &negate);
4284
4285 if (!cmp)
4286 return false;
4287
4288 if (negate)
4289 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4290 CONST0_RTX (GET_MODE (cmp)),
4291 NULL, NULL, &negate);
4292
4293 gcc_assert (!negate);
4294
4295 if (operands[0] != cmp)
4296 emit_move_insn (operands[0], cmp);
4297
4298 return true;
4299 }
4300
4301 /* Expand a floating-point vector conditional move; a vcond operation
4302 rather than a movcc operation. */
4303
4304 bool
4305 ix86_expand_fp_vcond (rtx operands[])
4306 {
4307 enum rtx_code code = GET_CODE (operands[3]);
4308 rtx cmp;
4309
4310 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4311 &operands[4], &operands[5]);
4312 if (code == UNKNOWN)
4313 {
4314 rtx temp;
4315 switch (GET_CODE (operands[3]))
4316 {
4317 case LTGT:
4318 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4319 operands[5], operands[0], operands[0]);
4320 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4321 operands[5], operands[1], operands[2]);
4322 code = AND;
4323 break;
4324 case UNEQ:
4325 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4326 operands[5], operands[0], operands[0]);
4327 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4328 operands[5], operands[1], operands[2]);
4329 code = IOR;
4330 break;
4331 default:
4332 gcc_unreachable ();
4333 }
4334 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4335 OPTAB_DIRECT);
4336 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4337 return true;
4338 }
4339
4340 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4341 operands[5], operands[1], operands[2]))
4342 return true;
4343
4344 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4345 operands[1], operands[2]);
4346 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4347 return true;
4348 }
4349
4350 /* Expand a signed/unsigned integral vector conditional move. */
4351
4352 bool
4353 ix86_expand_int_vcond (rtx operands[])
4354 {
4355 machine_mode data_mode = GET_MODE (operands[0]);
4356 machine_mode mode = GET_MODE (operands[4]);
4357 enum rtx_code code = GET_CODE (operands[3]);
4358 bool negate = false;
4359 rtx x, cop0, cop1;
4360
4361 cop0 = operands[4];
4362 cop1 = operands[5];
4363
4364 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4365 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
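/* E.g. for V4SImode this turns x < 0 ? -1 : 0 into a single psrad by
   31 and x < 0 ? 1 : 0 into a single psrld by 31, with no comparison
   instruction at all.  */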
4366 if ((code == LT || code == GE)
4367 && data_mode == mode
4368 && cop1 == CONST0_RTX (mode)
4369 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4370 && GET_MODE_UNIT_SIZE (data_mode) > 1
4371 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4372 && (GET_MODE_SIZE (data_mode) == 16
4373 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4374 {
4375 rtx negop = operands[2 - (code == LT)];
4376 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4377 if (negop == CONST1_RTX (data_mode))
4378 {
4379 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4380 operands[0], 1, OPTAB_DIRECT);
4381 if (res != operands[0])
4382 emit_move_insn (operands[0], res);
4383 return true;
4384 }
4385 else if (GET_MODE_INNER (data_mode) != DImode
4386 && vector_all_ones_operand (negop, data_mode))
4387 {
4388 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4389 operands[0], 0, OPTAB_DIRECT);
4390 if (res != operands[0])
4391 emit_move_insn (operands[0], res);
4392 return true;
4393 }
4394 }
4395
4396 if (!nonimmediate_operand (cop1, mode))
4397 cop1 = force_reg (mode, cop1);
4398 if (!general_operand (operands[1], data_mode))
4399 operands[1] = force_reg (data_mode, operands[1]);
4400 if (!general_operand (operands[2], data_mode))
4401 operands[2] = force_reg (data_mode, operands[2]);
4402
4403 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4404 operands[1], operands[2], &negate);
4405
4406 if (!x)
4407 return false;
4408
4409 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4410 operands[2-negate]);
4411 return true;
4412 }
4413
4414 static bool
4415 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4416 struct expand_vec_perm_d *d)
4417 {
4418 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4419 expander, so args are either in d, or in op0, op1 etc. */
4420 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4421 machine_mode maskmode = mode;
4422 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4423
4424 switch (mode)
4425 {
4426 case E_V8HImode:
4427 if (TARGET_AVX512VL && TARGET_AVX512BW)
4428 gen = gen_avx512vl_vpermt2varv8hi3;
4429 break;
4430 case E_V16HImode:
4431 if (TARGET_AVX512VL && TARGET_AVX512BW)
4432 gen = gen_avx512vl_vpermt2varv16hi3;
4433 break;
4434 case E_V64QImode:
4435 if (TARGET_AVX512VBMI)
4436 gen = gen_avx512bw_vpermt2varv64qi3;
4437 break;
4438 case E_V32HImode:
4439 if (TARGET_AVX512BW)
4440 gen = gen_avx512bw_vpermt2varv32hi3;
4441 break;
4442 case E_V4SImode:
4443 if (TARGET_AVX512VL)
4444 gen = gen_avx512vl_vpermt2varv4si3;
4445 break;
4446 case E_V8SImode:
4447 if (TARGET_AVX512VL)
4448 gen = gen_avx512vl_vpermt2varv8si3;
4449 break;
4450 case E_V16SImode:
4451 if (TARGET_AVX512F)
4452 gen = gen_avx512f_vpermt2varv16si3;
4453 break;
4454 case E_V4SFmode:
4455 if (TARGET_AVX512VL)
4456 {
4457 gen = gen_avx512vl_vpermt2varv4sf3;
4458 maskmode = V4SImode;
4459 }
4460 break;
4461 case E_V8SFmode:
4462 if (TARGET_AVX512VL)
4463 {
4464 gen = gen_avx512vl_vpermt2varv8sf3;
4465 maskmode = V8SImode;
4466 }
4467 break;
4468 case E_V16SFmode:
4469 if (TARGET_AVX512F)
4470 {
4471 gen = gen_avx512f_vpermt2varv16sf3;
4472 maskmode = V16SImode;
4473 }
4474 break;
4475 case E_V2DImode:
4476 if (TARGET_AVX512VL)
4477 gen = gen_avx512vl_vpermt2varv2di3;
4478 break;
4479 case E_V4DImode:
4480 if (TARGET_AVX512VL)
4481 gen = gen_avx512vl_vpermt2varv4di3;
4482 break;
4483 case E_V8DImode:
4484 if (TARGET_AVX512F)
4485 gen = gen_avx512f_vpermt2varv8di3;
4486 break;
4487 case E_V2DFmode:
4488 if (TARGET_AVX512VL)
4489 {
4490 gen = gen_avx512vl_vpermt2varv2df3;
4491 maskmode = V2DImode;
4492 }
4493 break;
4494 case E_V4DFmode:
4495 if (TARGET_AVX512VL)
4496 {
4497 gen = gen_avx512vl_vpermt2varv4df3;
4498 maskmode = V4DImode;
4499 }
4500 break;
4501 case E_V8DFmode:
4502 if (TARGET_AVX512F)
4503 {
4504 gen = gen_avx512f_vpermt2varv8df3;
4505 maskmode = V8DImode;
4506 }
4507 break;
4508 default:
4509 break;
4510 }
4511
4512 if (gen == NULL)
4513 return false;
4514
4515 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4516 expander, so args are either in d, or in op0, op1 etc. */
4517 if (d)
4518 {
4519 rtx vec[64];
4520 target = d->target;
4521 op0 = d->op0;
4522 op1 = d->op1;
4523 for (int i = 0; i < d->nelt; ++i)
4524 vec[i] = GEN_INT (d->perm[i]);
4525 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4526 }
4527
4528 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4529 return true;
4530 }
4531
4532 /* Expand a variable vector permutation. */
4533
4534 void
4535 ix86_expand_vec_perm (rtx operands[])
4536 {
4537 rtx target = operands[0];
4538 rtx op0 = operands[1];
4539 rtx op1 = operands[2];
4540 rtx mask = operands[3];
4541 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4542 machine_mode mode = GET_MODE (op0);
4543 machine_mode maskmode = GET_MODE (mask);
4544 int w, e, i;
4545 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4546
4547 /* Number of elements in the vector. */
4548 w = GET_MODE_NUNITS (mode);
4549 e = GET_MODE_UNIT_SIZE (mode);
4550 gcc_assert (w <= 64);
4551
4552 if (TARGET_AVX512F && one_operand_shuffle)
4553 {
4554 rtx (*gen) (rtx, rtx, rtx) = NULL;
4555 switch (mode)
4556 {
4557 case E_V16SImode:
4558 gen = gen_avx512f_permvarv16si;
4559 break;
4560 case E_V16SFmode:
4561 gen = gen_avx512f_permvarv16sf;
4562 break;
4563 case E_V8DImode:
4564 gen = gen_avx512f_permvarv8di;
4565 break;
4566 case E_V8DFmode:
4567 gen = gen_avx512f_permvarv8df;
4568 break;
4569 default:
4570 break;
4571 }
4572 if (gen != NULL)
4573 {
4574 emit_insn (gen (target, op0, mask));
4575 return;
4576 }
4577 }
4578
4579 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4580 return;
4581
4582 if (TARGET_AVX2)
4583 {
4584 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4585 {
4586 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4587 a constant shuffle operand. With a tiny bit of effort we can
4588 use VPERMD instead. A re-interpretation stall for V4DFmode is
4589 unfortunate but there's no avoiding it.
4590 Similarly for V16HImode we don't have instructions for variable
4591 shuffling, while for V32QImode we can use vpshufb; vpshufb;
4592 vpermq; vpor after preparing suitable masks. */
4593
4594 if (mode == V16HImode)
4595 {
4596 maskmode = mode = V32QImode;
4597 w = 32;
4598 e = 1;
4599 }
4600 else
4601 {
4602 maskmode = mode = V8SImode;
4603 w = 8;
4604 e = 4;
4605 }
4606 t1 = gen_reg_rtx (maskmode);
4607
4608 /* Replicate the low bits of the V4DImode mask into V8SImode:
4609 mask = { A B C D }
4610 t1 = { A A B B C C D D }. */
4611 for (i = 0; i < w / 2; ++i)
4612 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4613 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4614 vt = force_reg (maskmode, vt);
4615 mask = gen_lowpart (maskmode, mask);
4616 if (maskmode == V8SImode)
4617 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4618 else
4619 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4620
4621 /* Multiply the shuffle indices by two. */
4622 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4623 OPTAB_DIRECT);
4624
4625 /* Add one to the odd shuffle indices:
4626 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4627 for (i = 0; i < w / 2; ++i)
4628 {
4629 vec[i * 2] = const0_rtx;
4630 vec[i * 2 + 1] = const1_rtx;
4631 }
4632 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4633 vt = validize_mem (force_const_mem (maskmode, vt));
4634 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4635 OPTAB_DIRECT);
4636
4637 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4638 operands[3] = mask = t1;
4639 target = gen_reg_rtx (mode);
4640 op0 = gen_lowpart (mode, op0);
4641 op1 = gen_lowpart (mode, op1);
4642 }
4643
4644 switch (mode)
4645 {
4646 case E_V8SImode:
4647 /* The VPERMD and VPERMPS instructions already properly ignore
4648 the high bits of the shuffle elements. No need for us to
4649 perform an AND ourselves. */
4650 if (one_operand_shuffle)
4651 {
4652 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4653 if (target != operands[0])
4654 emit_move_insn (operands[0],
4655 gen_lowpart (GET_MODE (operands[0]), target));
4656 }
4657 else
4658 {
4659 t1 = gen_reg_rtx (V8SImode);
4660 t2 = gen_reg_rtx (V8SImode);
4661 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4662 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4663 goto merge_two;
4664 }
4665 return;
4666
4667 case E_V8SFmode:
4668 mask = gen_lowpart (V8SImode, mask);
4669 if (one_operand_shuffle)
4670 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4671 else
4672 {
4673 t1 = gen_reg_rtx (V8SFmode);
4674 t2 = gen_reg_rtx (V8SFmode);
4675 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4676 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4677 goto merge_two;
4678 }
4679 return;
4680
4681 case E_V4SImode:
4682 /* By combining the two 128-bit input vectors into one 256-bit
4683 input vector, we can use VPERMD and VPERMPS for the full
4684 two-operand shuffle. */
4685 t1 = gen_reg_rtx (V8SImode);
4686 t2 = gen_reg_rtx (V8SImode);
4687 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4688 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4689 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4690 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4691 return;
4692
4693 case E_V4SFmode:
4694 t1 = gen_reg_rtx (V8SFmode);
4695 t2 = gen_reg_rtx (V8SImode);
4696 mask = gen_lowpart (V4SImode, mask);
4697 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4698 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4699 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4700 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4701 return;
4702
4703 case E_V32QImode:
4704 t1 = gen_reg_rtx (V32QImode);
4705 t2 = gen_reg_rtx (V32QImode);
4706 t3 = gen_reg_rtx (V32QImode);
4707 vt2 = GEN_INT (-128);
4708 vt = gen_const_vec_duplicate (V32QImode, vt2);
4709 vt = force_reg (V32QImode, vt);
4710 for (i = 0; i < 32; i++)
4711 vec[i] = i < 16 ? vt2 : const0_rtx;
4712 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4713 vt2 = force_reg (V32QImode, vt2);
4714 /* From mask create two adjusted masks, which contain the same
4715 bits as mask in the low 7 bits of each vector element.
4716 The first mask will have the most significant bit clear
4717 if it requests element from the same 128-bit lane
4718 and MSB set if it requests element from the other 128-bit lane.
4719 The second mask will have the opposite values of the MSB,
4720 and additionally will have its 128-bit lanes swapped.
4721 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4722 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4723 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4724 stands for the other 12 bytes. */
4725 /* The bit that tells whether an element comes from the same lane or
4726 from the other lane is bit 4, so shift it up by 3 to the MSB position. */
4727 t5 = gen_reg_rtx (V4DImode);
4728 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4729 GEN_INT (3)));
4730 /* Clear MSB bits from the mask just in case it had them set. */
4731 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4732 /* After this t1 will have MSB set for elements from other lane. */
4733 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4734 /* Clear bits other than MSB. */
4735 emit_insn (gen_andv32qi3 (t1, t1, vt));
4736 /* Or in the lower bits from mask into t3. */
4737 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4738 /* And invert MSB bits in t1, so MSB is set for elements from the same
4739 lane. */
4740 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4741 /* Swap 128-bit lanes in t3. */
4742 t6 = gen_reg_rtx (V4DImode);
4743 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4744 const2_rtx, GEN_INT (3),
4745 const0_rtx, const1_rtx));
4746 /* And or in the lower bits from mask into t1. */
4747 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4748 if (one_operand_shuffle)
4749 {
4750 /* Each of these shuffles will put 0s in places where
4751 element from the other 128-bit lane is needed, otherwise
4752 will shuffle in the requested value. */
4753 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4754 gen_lowpart (V32QImode, t6)));
4755 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4756 /* For t3 the 128-bit lanes are swapped again. */
4757 t7 = gen_reg_rtx (V4DImode);
4758 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4759 const2_rtx, GEN_INT (3),
4760 const0_rtx, const1_rtx));
4761 /* And oring both together leads to the result. */
4762 emit_insn (gen_iorv32qi3 (target, t1,
4763 gen_lowpart (V32QImode, t7)));
4764 if (target != operands[0])
4765 emit_move_insn (operands[0],
4766 gen_lowpart (GET_MODE (operands[0]), target));
4767 return;
4768 }
4769
4770 t4 = gen_reg_rtx (V32QImode);
4771 /* Similar to the one_operand_shuffle code above, but repeated
4772 twice, once for each operand. The merge_two: code below will
4773 merge the two results together. */
4774 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4775 gen_lowpart (V32QImode, t6)));
4776 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4777 gen_lowpart (V32QImode, t6)));
4778 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4779 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4780 t7 = gen_reg_rtx (V4DImode);
4781 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4782 const2_rtx, GEN_INT (3),
4783 const0_rtx, const1_rtx));
4784 t8 = gen_reg_rtx (V4DImode);
4785 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4786 const2_rtx, GEN_INT (3),
4787 const0_rtx, const1_rtx));
4788 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4789 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4790 t1 = t4;
4791 t2 = t3;
4792 goto merge_two;
4793
4794 default:
4795 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4796 break;
4797 }
4798 }
4799
4800 if (TARGET_XOP)
4801 {
4802 /* The XOP VPPERM insn supports three inputs. By ignoring the
4803 one_operand_shuffle special case, we avoid creating another
4804 set of constant vectors in memory. */
4805 one_operand_shuffle = false;
4806
4807 /* mask = mask & {2*w-1, ...} */
4808 vt = GEN_INT (2*w - 1);
4809 }
4810 else
4811 {
4812 /* mask = mask & {w-1, ...} */
4813 vt = GEN_INT (w - 1);
4814 }
4815
4816 vt = gen_const_vec_duplicate (maskmode, vt);
4817 mask = expand_simple_binop (maskmode, AND, mask, vt,
4818 NULL_RTX, 0, OPTAB_DIRECT);
4819
4820 /* For non-QImode operations, convert the word permutation control
4821 into a byte permutation control. */
4822 if (mode != V16QImode)
4823 {
4824 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4825 GEN_INT (exact_log2 (e)),
4826 NULL_RTX, 0, OPTAB_DIRECT);
4827
4828 /* Convert mask to vector of chars. */
4829 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4830
4831 /* Replicate each of the input bytes into byte positions:
4832 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4833 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4834 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4835 for (i = 0; i < 16; ++i)
4836 vec[i] = GEN_INT (i/e * e);
4837 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4838 vt = validize_mem (force_const_mem (V16QImode, vt));
4839 if (TARGET_XOP)
4840 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4841 else
4842 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4843
4844 /* Convert it into the byte positions by doing
4845 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4846 for (i = 0; i < 16; ++i)
4847 vec[i] = GEN_INT (i % e);
4848 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4849 vt = validize_mem (force_const_mem (V16QImode, vt));
4850 emit_insn (gen_addv16qi3 (mask, mask, vt));
4851 }
4852
4853 /* The actual shuffle operations all operate on V16QImode. */
4854 op0 = gen_lowpart (V16QImode, op0);
4855 op1 = gen_lowpart (V16QImode, op1);
4856
4857 if (TARGET_XOP)
4858 {
4859 if (GET_MODE (target) != V16QImode)
4860 target = gen_reg_rtx (V16QImode);
4861 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4862 if (target != operands[0])
4863 emit_move_insn (operands[0],
4864 gen_lowpart (GET_MODE (operands[0]), target));
4865 }
4866 else if (one_operand_shuffle)
4867 {
4868 if (GET_MODE (target) != V16QImode)
4869 target = gen_reg_rtx (V16QImode);
4870 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4871 if (target != operands[0])
4872 emit_move_insn (operands[0],
4873 gen_lowpart (GET_MODE (operands[0]), target));
4874 }
4875 else
4876 {
4877 rtx xops[6];
4878 bool ok;
4879
4880 /* Shuffle the two input vectors independently. */
4881 t1 = gen_reg_rtx (V16QImode);
4882 t2 = gen_reg_rtx (V16QImode);
4883 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4884 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4885
4886 merge_two:
4887 /* Then merge them together. The key is whether any given control
4888 element contained a bit set that indicates the second word. */
4889 mask = operands[3];
4890 vt = GEN_INT (w);
4891 if (maskmode == V2DImode && !TARGET_SSE4_1)
4892 {
4893 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4894 more shuffle to convert the V2DI input mask into a V4SI
4895 input mask. At that point the masking that expand_int_vcond
4896 performs will work as desired. */
4897 rtx t3 = gen_reg_rtx (V4SImode);
4898 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4899 const0_rtx, const0_rtx,
4900 const2_rtx, const2_rtx));
4901 mask = t3;
4902 maskmode = V4SImode;
4903 e = w = 4;
4904 }
4905
4906 vt = gen_const_vec_duplicate (maskmode, vt);
4907 vt = force_reg (maskmode, vt);
4908 mask = expand_simple_binop (maskmode, AND, mask, vt,
4909 NULL_RTX, 0, OPTAB_DIRECT);
4910
4911 if (GET_MODE (target) != mode)
4912 target = gen_reg_rtx (mode);
4913 xops[0] = target;
4914 xops[1] = gen_lowpart (mode, t2);
4915 xops[2] = gen_lowpart (mode, t1);
4916 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4917 xops[4] = mask;
4918 xops[5] = vt;
4919 ok = ix86_expand_int_vcond (xops);
4920 gcc_assert (ok);
4921 if (target != operands[0])
4922 emit_move_insn (operands[0],
4923 gen_lowpart (GET_MODE (operands[0]), target));
4924 }
4925 }
4926
4927 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
4928 true if we should do zero extension, else sign extension. HIGH_P is
4929 true if we want the N/2 high elements, else the low elements. */
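/* For instance, unpacking a V16QImode SRC with UNSIGNED_P and !HIGH_P
   zero-extends the low eight bytes into V8HImode - typically a single
   pmovzxbw with SSE4.1, or a punpcklbw against a zero register on
   plain SSE2.  */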
4930
4931 void
4932 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4933 {
4934 machine_mode imode = GET_MODE (src);
4935 rtx tmp;
4936
4937 if (TARGET_SSE4_1)
4938 {
4939 rtx (*unpack)(rtx, rtx);
4940 rtx (*extract)(rtx, rtx) = NULL;
4941 machine_mode halfmode = BLKmode;
4942
4943 switch (imode)
4944 {
4945 case E_V64QImode:
4946 if (unsigned_p)
4947 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4948 else
4949 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4950 halfmode = V32QImode;
4951 extract
4952 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4953 break;
4954 case E_V32QImode:
4955 if (unsigned_p)
4956 unpack = gen_avx2_zero_extendv16qiv16hi2;
4957 else
4958 unpack = gen_avx2_sign_extendv16qiv16hi2;
4959 halfmode = V16QImode;
4960 extract
4961 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4962 break;
4963 case E_V32HImode:
4964 if (unsigned_p)
4965 unpack = gen_avx512f_zero_extendv16hiv16si2;
4966 else
4967 unpack = gen_avx512f_sign_extendv16hiv16si2;
4968 halfmode = V16HImode;
4969 extract
4970 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4971 break;
4972 case E_V16HImode:
4973 if (unsigned_p)
4974 unpack = gen_avx2_zero_extendv8hiv8si2;
4975 else
4976 unpack = gen_avx2_sign_extendv8hiv8si2;
4977 halfmode = V8HImode;
4978 extract
4979 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4980 break;
4981 case E_V16SImode:
4982 if (unsigned_p)
4983 unpack = gen_avx512f_zero_extendv8siv8di2;
4984 else
4985 unpack = gen_avx512f_sign_extendv8siv8di2;
4986 halfmode = V8SImode;
4987 extract
4988 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4989 break;
4990 case E_V8SImode:
4991 if (unsigned_p)
4992 unpack = gen_avx2_zero_extendv4siv4di2;
4993 else
4994 unpack = gen_avx2_sign_extendv4siv4di2;
4995 halfmode = V4SImode;
4996 extract
4997 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
4998 break;
4999 case E_V16QImode:
5000 if (unsigned_p)
5001 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5002 else
5003 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5004 break;
5005 case E_V8HImode:
5006 if (unsigned_p)
5007 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5008 else
5009 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5010 break;
5011 case E_V4SImode:
5012 if (unsigned_p)
5013 unpack = gen_sse4_1_zero_extendv2siv2di2;
5014 else
5015 unpack = gen_sse4_1_sign_extendv2siv2di2;
5016 break;
5017 default:
5018 gcc_unreachable ();
5019 }
5020
5021 if (GET_MODE_SIZE (imode) >= 32)
5022 {
5023 tmp = gen_reg_rtx (halfmode);
5024 emit_insn (extract (tmp, src));
5025 }
5026 else if (high_p)
5027 {
5028 /* Shift higher 8 bytes to lower 8 bytes. */
5029 tmp = gen_reg_rtx (V1TImode);
5030 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5031 GEN_INT (64)));
5032 tmp = gen_lowpart (imode, tmp);
5033 }
5034 else
5035 tmp = src;
5036
5037 emit_insn (unpack (dest, tmp));
5038 }
5039 else
5040 {
5041 rtx (*unpack)(rtx, rtx, rtx);
5042
5043 switch (imode)
5044 {
5045 case E_V16QImode:
5046 if (high_p)
5047 unpack = gen_vec_interleave_highv16qi;
5048 else
5049 unpack = gen_vec_interleave_lowv16qi;
5050 break;
5051 case E_V8HImode:
5052 if (high_p)
5053 unpack = gen_vec_interleave_highv8hi;
5054 else
5055 unpack = gen_vec_interleave_lowv8hi;
5056 break;
5057 case E_V4SImode:
5058 if (high_p)
5059 unpack = gen_vec_interleave_highv4si;
5060 else
5061 unpack = gen_vec_interleave_lowv4si;
5062 break;
5063 default:
5064 gcc_unreachable ();
5065 }
5066
5067 if (unsigned_p)
5068 tmp = force_reg (imode, CONST0_RTX (imode));
5069 else
5070 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5071 src, pc_rtx, pc_rtx);
5072
5073 rtx tmp2 = gen_reg_rtx (imode);
5074 emit_insn (unpack (tmp2, src, tmp));
5075 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5076 }
5077 }
5078
5079 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5080 but works for floating point parameters and non-offsettable memories.
5081 For pushes, it returns just stack offsets; the values will be saved
5082 in the right order. At most four parts are generated. */
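/* For example, a DFmode operand on a 32-bit target is split into two
   SImode parts; XFmode is split into three SImode parts on 32-bit
   targets and into a DImode part plus an SImode part on 64-bit
   targets.  */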
5083
5084 static int
5085 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5086 {
5087 int size;
5088
5089 if (!TARGET_64BIT)
5090 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5091 else
5092 size = (GET_MODE_SIZE (mode) + 4) / 8;
5093
5094 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5095 gcc_assert (size >= 2 && size <= 4);
5096
5097 /* Optimize constant pool reference to immediates. This is used by fp
5098 moves, which force all constants to memory to allow combining. */
5099 if (MEM_P (operand) && MEM_READONLY_P (operand))
5100 operand = avoid_constant_pool_reference (operand);
5101
5102 if (MEM_P (operand) && !offsettable_memref_p (operand))
5103 {
5104 /* The only non-offsettable memories we handle are pushes. */
5105 int ok = push_operand (operand, VOIDmode);
5106
5107 gcc_assert (ok);
5108
5109 operand = copy_rtx (operand);
5110 PUT_MODE (operand, word_mode);
5111 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5112 return size;
5113 }
5114
5115 if (GET_CODE (operand) == CONST_VECTOR)
5116 {
5117 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5118 /* Caution: if we looked through a constant pool memory above,
5119 the operand may actually have a different mode now. That's
5120 ok, since we want to pun this all the way back to an integer. */
5121 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5122 gcc_assert (operand != NULL);
5123 mode = imode;
5124 }
5125
5126 if (!TARGET_64BIT)
5127 {
5128 if (mode == DImode)
5129 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5130 else
5131 {
5132 int i;
5133
5134 if (REG_P (operand))
5135 {
5136 gcc_assert (reload_completed);
5137 for (i = 0; i < size; i++)
5138 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5139 }
5140 else if (offsettable_memref_p (operand))
5141 {
5142 operand = adjust_address (operand, SImode, 0);
5143 parts[0] = operand;
5144 for (i = 1; i < size; i++)
5145 parts[i] = adjust_address (operand, SImode, 4 * i);
5146 }
5147 else if (CONST_DOUBLE_P (operand))
5148 {
5149 const REAL_VALUE_TYPE *r;
5150 long l[4];
5151
5152 r = CONST_DOUBLE_REAL_VALUE (operand);
5153 switch (mode)
5154 {
5155 case E_TFmode:
5156 real_to_target (l, r, mode);
5157 parts[3] = gen_int_mode (l[3], SImode);
5158 parts[2] = gen_int_mode (l[2], SImode);
5159 break;
5160 case E_XFmode:
5161 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5162 long double may not be 80-bit. */
5163 real_to_target (l, r, mode);
5164 parts[2] = gen_int_mode (l[2], SImode);
5165 break;
5166 case E_DFmode:
5167 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5168 break;
5169 default:
5170 gcc_unreachable ();
5171 }
5172 parts[1] = gen_int_mode (l[1], SImode);
5173 parts[0] = gen_int_mode (l[0], SImode);
5174 }
5175 else
5176 gcc_unreachable ();
5177 }
5178 }
5179 else
5180 {
5181 if (mode == TImode)
5182 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5183 if (mode == XFmode || mode == TFmode)
5184 {
5185 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5186 if (REG_P (operand))
5187 {
5188 gcc_assert (reload_completed);
5189 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5190 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5191 }
5192 else if (offsettable_memref_p (operand))
5193 {
5194 operand = adjust_address (operand, DImode, 0);
5195 parts[0] = operand;
5196 parts[1] = adjust_address (operand, upper_mode, 8);
5197 }
5198 else if (CONST_DOUBLE_P (operand))
5199 {
5200 long l[4];
5201
5202 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5203
5204 /* real_to_target puts 32-bit pieces in each long. */
5205 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5206 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5207 << 32), DImode);
5208
5209 if (upper_mode == SImode)
5210 parts[1] = gen_int_mode (l[2], SImode);
5211 else
5212 parts[1]
5213 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5214 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5215 << 32), DImode);
5216 }
5217 else
5218 gcc_unreachable ();
5219 }
5220 }
5221
5222 return size;
5223 }
5224
5225 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5226 Return false when normal moves are needed; true when all required
5227 insns have been emitted. Operands 2-4 contain the input values
5228 in the correct order; operands 5-7 contain the output values. */
5229
5230 void
5231 ix86_split_long_move (rtx operands[])
5232 {
5233 rtx part[2][4];
5234 int nparts, i, j;
5235 int push = 0;
5236 int collisions = 0;
5237 machine_mode mode = GET_MODE (operands[0]);
5238 bool collisionparts[4];
5239
5240 /* The DFmode expanders may ask us to move a double.
5241 For a 64-bit target this is a single move. By hiding that fact
5242 here we simplify the i386.md splitters. */
5243 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5244 {
5245 /* Optimize constant pool reference to immediates. This is used by
5246 fp moves, which force all constants to memory to allow combining. */
5247
5248 if (MEM_P (operands[1])
5249 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5250 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5251 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5252 if (push_operand (operands[0], VOIDmode))
5253 {
5254 operands[0] = copy_rtx (operands[0]);
5255 PUT_MODE (operands[0], word_mode);
5256 }
5257 else
5258 operands[0] = gen_lowpart (DImode, operands[0]);
5259 operands[1] = gen_lowpart (DImode, operands[1]);
5260 emit_move_insn (operands[0], operands[1]);
5261 return;
5262 }
5263
5264 /* The only non-offsettable memory we handle is push. */
5265 if (push_operand (operands[0], VOIDmode))
5266 push = 1;
5267 else
5268 gcc_assert (!MEM_P (operands[0])
5269 || offsettable_memref_p (operands[0]));
5270
5271 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5272 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5273
5274 /* When emitting a push, take care with source operands on the stack. */
5275 if (push && MEM_P (operands[1])
5276 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5277 {
5278 rtx src_base = XEXP (part[1][nparts - 1], 0);
5279
5280 /* Compensate for the stack decrement by 4. */
5281 if (!TARGET_64BIT && nparts == 3
5282 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5283 src_base = plus_constant (Pmode, src_base, 4);
5284
5285 /* src_base refers to the stack pointer and is
5286 automatically decreased by the emitted pushes. */
5287 for (i = 0; i < nparts; i++)
5288 part[1][i] = change_address (part[1][i],
5289 GET_MODE (part[1][i]), src_base);
5290 }
5291
5292 /* We need to do the copy in the right order in case an address register
5293 of the source overlaps the destination. */
5294 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5295 {
5296 rtx tmp;
5297
5298 for (i = 0; i < nparts; i++)
5299 {
5300 collisionparts[i]
5301 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5302 if (collisionparts[i])
5303 collisions++;
5304 }
5305
5306 /* Collision in the middle part can be handled by reordering. */
5307 if (collisions == 1 && nparts == 3 && collisionparts [1])
5308 {
5309 std::swap (part[0][1], part[0][2]);
5310 std::swap (part[1][1], part[1][2]);
5311 }
5312 else if (collisions == 1
5313 && nparts == 4
5314 && (collisionparts [1] || collisionparts [2]))
5315 {
5316 if (collisionparts [1])
5317 {
5318 std::swap (part[0][1], part[0][2]);
5319 std::swap (part[1][1], part[1][2]);
5320 }
5321 else
5322 {
5323 std::swap (part[0][2], part[0][3]);
5324 std::swap (part[1][2], part[1][3]);
5325 }
5326 }
5327
5328 /* If there are more collisions, we can't handle it by reordering.
5329 Do an lea to the last part and use only one colliding move. */
5330 else if (collisions > 1)
5331 {
5332 rtx base, addr;
5333
5334 collisions = 1;
5335
5336 base = part[0][nparts - 1];
5337
5338 /* Handle the case when the last part isn't valid for lea.
5339 Happens in 64-bit mode storing the 12-byte XFmode. */
5340 if (GET_MODE (base) != Pmode)
5341 base = gen_rtx_REG (Pmode, REGNO (base));
5342
5343 addr = XEXP (part[1][0], 0);
5344 if (TARGET_TLS_DIRECT_SEG_REFS)
5345 {
5346 struct ix86_address parts;
5347 int ok = ix86_decompose_address (addr, &parts);
5348 gcc_assert (ok);
5349 /* It is not valid to use %gs: or %fs: in lea. */
5350 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5351 }
5352 emit_insn (gen_rtx_SET (base, addr));
5353 part[1][0] = replace_equiv_address (part[1][0], base);
5354 for (i = 1; i < nparts; i++)
5355 {
5356 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5357 part[1][i] = replace_equiv_address (part[1][i], tmp);
5358 }
5359 }
5360 }
5361
5362 if (push)
5363 {
5364 if (!TARGET_64BIT)
5365 {
5366 if (nparts == 3)
5367 {
5368 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5369 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5370 emit_move_insn (part[0][2], part[1][2]);
5371 }
5372 else if (nparts == 4)
5373 {
5374 emit_move_insn (part[0][3], part[1][3]);
5375 emit_move_insn (part[0][2], part[1][2]);
5376 }
5377 }
5378 else
5379 {
5380 /* In 64-bit mode we don't have a 32-bit push available. If the part
5381 is a register, that is OK - we just use the larger counterpart. We
5382 also retype memory - this comes from an attempt to avoid a REX prefix
5383 on moving the second half of a TFmode value. */
5384 if (GET_MODE (part[1][1]) == SImode)
5385 {
5386 switch (GET_CODE (part[1][1]))
5387 {
5388 case MEM:
5389 part[1][1] = adjust_address (part[1][1], DImode, 0);
5390 break;
5391
5392 case REG:
5393 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5394 break;
5395
5396 default:
5397 gcc_unreachable ();
5398 }
5399
5400 if (GET_MODE (part[1][0]) == SImode)
5401 part[1][0] = part[1][1];
5402 }
5403 }
5404 emit_move_insn (part[0][1], part[1][1]);
5405 emit_move_insn (part[0][0], part[1][0]);
5406 return;
5407 }
5408
5409 /* Choose correct order to not overwrite the source before it is copied. */
5410 if ((REG_P (part[0][0])
5411 && REG_P (part[1][1])
5412 && (REGNO (part[0][0]) == REGNO (part[1][1])
5413 || (nparts == 3
5414 && REGNO (part[0][0]) == REGNO (part[1][2]))
5415 || (nparts == 4
5416 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5417 || (collisions > 0
5418 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5419 {
5420 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5421 {
5422 operands[2 + i] = part[0][j];
5423 operands[6 + i] = part[1][j];
5424 }
5425 }
5426 else
5427 {
5428 for (i = 0; i < nparts; i++)
5429 {
5430 operands[2 + i] = part[0][i];
5431 operands[6 + i] = part[1][i];
5432 }
5433 }
5434
5435 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5436 if (optimize_insn_for_size_p ())
5437 {
5438 for (j = 0; j < nparts - 1; j++)
5439 if (CONST_INT_P (operands[6 + j])
5440 && operands[6 + j] != const0_rtx
5441 && REG_P (operands[2 + j]))
5442 for (i = j; i < nparts - 1; i++)
5443 if (CONST_INT_P (operands[7 + i])
5444 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5445 operands[7 + i] = operands[2 + j];
5446 }
5447
5448 for (i = 0; i < nparts; i++)
5449 emit_move_insn (operands[2 + i], operands[6 + i]);
5450
5451 return;
5452 }
5453
5454 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5455 left shift by a constant, either using a single shift or
5456 a sequence of add instructions. */
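/* E.g. a left shift by 2 may be emitted as two "add reg, reg"
   instructions when two adds cost no more than one constant shift and
   we are not optimizing for size.  */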
5457
5458 static void
5459 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5460 {
5461 if (count == 1
5462 || (count * ix86_cost->add <= ix86_cost->shift_const
5463 && !optimize_insn_for_size_p ()))
5464 {
5465 while (count-- > 0)
5466 emit_insn (gen_add2_insn (operand, operand));
5467 }
5468 else
5469 {
5470 rtx (*insn)(rtx, rtx, rtx);
5471
5472 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5473 emit_insn (insn (operand, operand, GEN_INT (count)));
5474 }
5475 }
5476
5477 void
5478 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5479 {
5480 rtx (*gen_ashl3)(rtx, rtx, rtx);
5481 rtx (*gen_shld)(rtx, rtx, rtx);
5482 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5483 machine_mode half_mode;
5484
5485 rtx low[2], high[2];
5486 int count;
5487
5488 if (CONST_INT_P (operands[2]))
5489 {
5490 split_double_mode (mode, operands, 2, low, high);
5491 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5492
5493 if (count >= half_width)
5494 {
5495 emit_move_insn (high[0], low[1]);
5496 emit_move_insn (low[0], const0_rtx);
5497
5498 if (count > half_width)
5499 ix86_expand_ashl_const (high[0], count - half_width, mode);
5500 }
5501 else
5502 {
5503 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5504
5505 if (!rtx_equal_p (operands[0], operands[1]))
5506 emit_move_insn (operands[0], operands[1]);
5507
5508 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5509 ix86_expand_ashl_const (low[0], count, mode);
5510 }
5511 return;
5512 }
5513
5514 split_double_mode (mode, operands, 1, low, high);
5515 half_mode = mode == DImode ? SImode : DImode;
5516
5517 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5518
5519 if (operands[1] == const1_rtx)
5520 {
5521 /* Assuming we've chosen QImode-capable registers, 1 << N
5522 can be done with two 32/64-bit shifts, no branches, no cmoves. */
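/* Illustration for the DImode-split-into-SImode case: the emitted
   sequence computes low = ((N & 32) == 0) << (N & 31) and
   high = ((N & 32) != 0) << (N & 31), relying on the hardware masking
   the 32-bit shift count to 5 bits.  */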
5523 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5524 {
5525 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5526
5527 ix86_expand_clear (low[0]);
5528 ix86_expand_clear (high[0]);
5529 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5530
5531 d = gen_lowpart (QImode, low[0]);
5532 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5533 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5534 emit_insn (gen_rtx_SET (d, s));
5535
5536 d = gen_lowpart (QImode, high[0]);
5537 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5538 s = gen_rtx_NE (QImode, flags, const0_rtx);
5539 emit_insn (gen_rtx_SET (d, s));
5540 }
5541
5542 /* Otherwise, we can get the same results by manually performing
5543 a bit extract operation on bit 5/6, and then performing the two
5544 shifts. The two methods of getting 0/1 into low/high are exactly
5545 the same size. Avoiding the shift in the bit extract case helps
5546 pentium4 a bit; no one else seems to care much either way. */
5547 else
5548 {
5549 rtx (*gen_lshr3)(rtx, rtx, rtx);
5550 rtx (*gen_and3)(rtx, rtx, rtx);
5551 rtx (*gen_xor3)(rtx, rtx, rtx);
5552 HOST_WIDE_INT bits;
5553 rtx x;
5554
5555 if (mode == DImode)
5556 {
5557 gen_lshr3 = gen_lshrsi3;
5558 gen_and3 = gen_andsi3;
5559 gen_xor3 = gen_xorsi3;
5560 bits = 5;
5561 }
5562 else
5563 {
5564 gen_lshr3 = gen_lshrdi3;
5565 gen_and3 = gen_anddi3;
5566 gen_xor3 = gen_xordi3;
5567 bits = 6;
5568 }
5569
5570 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5571 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5572 else
5573 x = gen_lowpart (half_mode, operands[2]);
5574 emit_insn (gen_rtx_SET (high[0], x));
5575
5576 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5577 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5578 emit_move_insn (low[0], high[0]);
5579 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5580 }
5581
5582 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5583 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5584 return;
5585 }
5586
5587 if (operands[1] == constm1_rtx)
5588 {
5589 /* For -1 << N, we can avoid the shld instruction, because we
5590 know that we're shifting 0...31/63 ones into a -1. */
5591 emit_move_insn (low[0], constm1_rtx);
5592 if (optimize_insn_for_size_p ())
5593 emit_move_insn (high[0], low[0]);
5594 else
5595 emit_move_insn (high[0], constm1_rtx);
5596 }
5597 else
5598 {
5599 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5600
5601 if (!rtx_equal_p (operands[0], operands[1]))
5602 emit_move_insn (operands[0], operands[1]);
5603
5604 split_double_mode (mode, operands, 1, low, high);
5605 emit_insn (gen_shld (high[0], low[0], operands[2]));
5606 }
5607
5608 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5609
5610 if (TARGET_CMOVE && scratch)
5611 {
5612 ix86_expand_clear (scratch);
5613 emit_insn (gen_x86_shift_adj_1
5614 (half_mode, high[0], low[0], operands[2], scratch));
5615 }
5616 else
5617 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5618 }
5619
5620 void
5621 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5622 {
5623 rtx (*gen_ashr3)(rtx, rtx, rtx)
5624 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5625 rtx (*gen_shrd)(rtx, rtx, rtx);
5626 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5627
5628 rtx low[2], high[2];
5629 int count;
5630
5631 if (CONST_INT_P (operands[2]))
5632 {
5633 split_double_mode (mode, operands, 2, low, high);
5634 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5635
5636 if (count == GET_MODE_BITSIZE (mode) - 1)
5637 {
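/* A shift by the full bit width minus one just broadcasts the sign
   bit: both result halves are the source high half shifted
   arithmetically by half_width - 1.  */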
5638 emit_move_insn (high[0], high[1]);
5639 emit_insn (gen_ashr3 (high[0], high[0],
5640 GEN_INT (half_width - 1)));
5641 emit_move_insn (low[0], high[0]);
5642
5643 }
5644 else if (count >= half_width)
5645 {
5646 emit_move_insn (low[0], high[1]);
5647 emit_move_insn (high[0], low[0]);
5648 emit_insn (gen_ashr3 (high[0], high[0],
5649 GEN_INT (half_width - 1)));
5650
5651 if (count > half_width)
5652 emit_insn (gen_ashr3 (low[0], low[0],
5653 GEN_INT (count - half_width)));
5654 }
5655 else
5656 {
5657 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5658
5659 if (!rtx_equal_p (operands[0], operands[1]))
5660 emit_move_insn (operands[0], operands[1]);
5661
5662 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5663 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5664 }
5665 }
5666 else
5667 {
5668 machine_mode half_mode;
5669
5670 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5671
5672 if (!rtx_equal_p (operands[0], operands[1]))
5673 emit_move_insn (operands[0], operands[1]);
5674
5675 split_double_mode (mode, operands, 1, low, high);
5676 half_mode = mode == DImode ? SImode : DImode;
5677
5678 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5679 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5680
5681 if (TARGET_CMOVE && scratch)
5682 {
5683 emit_move_insn (scratch, high[0]);
5684 emit_insn (gen_ashr3 (scratch, scratch,
5685 GEN_INT (half_width - 1)));
5686 emit_insn (gen_x86_shift_adj_1
5687 (half_mode, low[0], high[0], operands[2], scratch));
5688 }
5689 else
5690 emit_insn (gen_x86_shift_adj_3
5691 (half_mode, low[0], high[0], operands[2]));
5692 }
5693 }
5694
5695 void
5696 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5697 {
5698 rtx (*gen_lshr3)(rtx, rtx, rtx)
5699 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5700 rtx (*gen_shrd)(rtx, rtx, rtx);
5701 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5702
5703 rtx low[2], high[2];
5704 int count;
5705
5706 if (CONST_INT_P (operands[2]))
5707 {
5708 split_double_mode (mode, operands, 2, low, high);
5709 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5710
5711 if (count >= half_width)
5712 {
5713 emit_move_insn (low[0], high[1]);
5714 ix86_expand_clear (high[0]);
5715
5716 if (count > half_width)
5717 emit_insn (gen_lshr3 (low[0], low[0],
5718 GEN_INT (count - half_width)));
5719 }
5720 else
5721 {
5722 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5723
5724 if (!rtx_equal_p (operands[0], operands[1]))
5725 emit_move_insn (operands[0], operands[1]);
5726
5727 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5728 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5729 }
5730 }
5731 else
5732 {
5733 machine_mode half_mode;
5734
5735 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5736
5737 if (!rtx_equal_p (operands[0], operands[1]))
5738 emit_move_insn (operands[0], operands[1]);
5739
5740 split_double_mode (mode, operands, 1, low, high);
5741 half_mode = mode == DImode ? SImode : DImode;
5742
5743 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5744 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5745
5746 if (TARGET_CMOVE && scratch)
5747 {
5748 ix86_expand_clear (scratch);
5749 emit_insn (gen_x86_shift_adj_1
5750 (half_mode, low[0], high[0], operands[2], scratch));
5751 }
5752 else
5753 emit_insn (gen_x86_shift_adj_2
5754 (half_mode, low[0], high[0], operands[2]));
5755 }
5756 }
5757
5758 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5759 DImode for constant loop counts. */
5760
5761 static machine_mode
5762 counter_mode (rtx count_exp)
5763 {
5764 if (GET_MODE (count_exp) != VOIDmode)
5765 return GET_MODE (count_exp);
5766 if (!CONST_INT_P (count_exp))
5767 return Pmode;
5768 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5769 return DImode;
5770 return SImode;
5771 }
5772
5773 /* When ISSETMEM is FALSE, output a simple loop that copies memory from
5774 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
5775 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
5776 equivalent loop that sets memory to VALUE (supposed to be in MODE).
5777
5778 The size is rounded down to a whole number of the chunk size moved at once.
5779 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
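/* Roughly, the emitted code has the following shape for the copy case
   (C-like pseudo-code, not literal RTL):

     size = count & ~(GET_MODE_SIZE (mode) * unroll - 1);
     iter = 0;
   top:
     copy UNROLL chunks of MODE from srcptr + iter to destptr + iter;
     iter += GET_MODE_SIZE (mode) * unroll;
     if (iter < size)
       goto top;
     destptr += iter;
     srcptr += iter;  */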
5780
5781
5782 static void
5783 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5784 rtx destptr, rtx srcptr, rtx value,
5785 rtx count, machine_mode mode, int unroll,
5786 int expected_size, bool issetmem)
5787 {
5788 rtx_code_label *out_label, *top_label;
5789 rtx iter, tmp;
5790 machine_mode iter_mode = counter_mode (count);
5791 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5792 rtx piece_size = GEN_INT (piece_size_n);
5793 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5794 rtx size;
5795 int i;
5796
5797 top_label = gen_label_rtx ();
5798 out_label = gen_label_rtx ();
5799 iter = gen_reg_rtx (iter_mode);
5800
5801 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5802 NULL, 1, OPTAB_DIRECT);
5803 /* Those two should combine. */
5804 if (piece_size == const1_rtx)
5805 {
5806 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5807 true, out_label);
5808 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5809 }
5810 emit_move_insn (iter, const0_rtx);
5811
5812 emit_label (top_label);
5813
5814 tmp = convert_modes (Pmode, iter_mode, iter, true);
5815
5816 /* This assert could be relaxed - in that case we'd need to compute
5817 the smallest power of two containing PIECE_SIZE_N and pass it to
5818 offset_address. */
5819 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5820 destmem = offset_address (destmem, tmp, piece_size_n);
5821 destmem = adjust_address (destmem, mode, 0);
5822
5823 if (!issetmem)
5824 {
5825 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5826 srcmem = adjust_address (srcmem, mode, 0);
5827
5828 /* When unrolling for chips that reorder memory reads and writes,
5829 we can save registers by using a single temporary.
5830 Also, using 4 temporaries is overkill in 32-bit mode. */
5831 if (!TARGET_64BIT && 0)
5832 {
5833 for (i = 0; i < unroll; i++)
5834 {
5835 if (i)
5836 {
5837 destmem = adjust_address (copy_rtx (destmem), mode,
5838 GET_MODE_SIZE (mode));
5839 srcmem = adjust_address (copy_rtx (srcmem), mode,
5840 GET_MODE_SIZE (mode));
5841 }
5842 emit_move_insn (destmem, srcmem);
5843 }
5844 }
5845 else
5846 {
5847 rtx tmpreg[4];
5848 gcc_assert (unroll <= 4);
5849 for (i = 0; i < unroll; i++)
5850 {
5851 tmpreg[i] = gen_reg_rtx (mode);
5852 if (i)
5853 srcmem = adjust_address (copy_rtx (srcmem), mode,
5854 GET_MODE_SIZE (mode));
5855 emit_move_insn (tmpreg[i], srcmem);
5856 }
5857 for (i = 0; i < unroll; i++)
5858 {
5859 if (i)
5860 destmem = adjust_address (copy_rtx (destmem), mode,
5861 GET_MODE_SIZE (mode));
5862 emit_move_insn (destmem, tmpreg[i]);
5863 }
5864 }
5865 }
5866 else
5867 for (i = 0; i < unroll; i++)
5868 {
5869 if (i)
5870 destmem = adjust_address (copy_rtx (destmem), mode,
5871 GET_MODE_SIZE (mode));
5872 emit_move_insn (destmem, value);
5873 }
5874
5875 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5876 true, OPTAB_LIB_WIDEN);
5877 if (tmp != iter)
5878 emit_move_insn (iter, tmp);
5879
5880 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5881 true, top_label);
5882 if (expected_size != -1)
5883 {
5884 expected_size /= GET_MODE_SIZE (mode) * unroll;
5885 if (expected_size == 0)
5886 predict_jump (0);
5887 else if (expected_size > REG_BR_PROB_BASE)
5888 predict_jump (REG_BR_PROB_BASE - 1);
5889 else
5890 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5891 / expected_size);
5892 }
5893 else
5894 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5895 iter = ix86_zero_extend_to_Pmode (iter);
5896 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5897 true, OPTAB_LIB_WIDEN);
5898 if (tmp != destptr)
5899 emit_move_insn (destptr, tmp);
5900 if (!issetmem)
5901 {
5902 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5903 true, OPTAB_LIB_WIDEN);
5904 if (tmp != srcptr)
5905 emit_move_insn (srcptr, tmp);
5906 }
5907 emit_label (out_label);
5908 }
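
/* A C-level sketch of the code the expander above generates (illustrative
   only; the real output is RTL).  For MODE = DImode and UNROLL = 2 the copy
   variant behaves roughly like:

     size = count & ~(16 - 1);		// round down to whole 16-byte chunks
     for (iter = 0; iter < size; iter += 16)
       {
	 long long t0 = *(long long *) (src + iter);	// all loads first,
	 long long t1 = *(long long *) (src + iter + 8);
	 *(long long *) (dst + iter) = t0;		// then all stores
	 *(long long *) (dst + iter + 8) = t1;
       }
     dst += iter;	// pointers are advanced past the copied part;
     src += iter;	// the remainder is left for the epilogue.
*/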
5909
5910 /* Divide COUNTREG by SCALE. */
5911 static rtx
5912 scale_counter (rtx countreg, int scale)
5913 {
5914 rtx sc;
5915
5916 if (scale == 1)
5917 return countreg;
5918 if (CONST_INT_P (countreg))
5919 return GEN_INT (INTVAL (countreg) / scale);
5920 gcc_assert (REG_P (countreg));
5921
5922 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5923 GEN_INT (exact_log2 (scale)),
5924 NULL, 1, OPTAB_DIRECT);
5925 return sc;
5926 }
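
/* For example (illustrative): when expanding rep movq, SCALE is 8 and a
   register count is scaled with a single shift:

     countreg >>= 3;	// exact_log2 (8); constant counts are divided directly
*/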
5927
5928 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5929 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5930 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5931    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5932    ORIG_VALUE is the original value passed to memset to fill the memory with.
5933    Other arguments have the same meaning as for the previous function.  */
5934
5935 static void
5936 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5937 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5938 rtx count,
5939 machine_mode mode, bool issetmem)
5940 {
5941 rtx destexp;
5942 rtx srcexp;
5943 rtx countreg;
5944 HOST_WIDE_INT rounded_count;
5945
5946 /* If possible, it is shorter to use rep movs.
5947 TODO: Maybe it is better to move this logic to decide_alg. */
5948 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5949 && (!issetmem || orig_value == const0_rtx))
5950 mode = SImode;
5951
5952 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5953 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5954
5955 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5956 GET_MODE_SIZE (mode)));
5957 if (mode != QImode)
5958 {
5959 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5960 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5961 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5962 }
5963 else
5964 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5965 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5966 {
5967 rounded_count
5968 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5969 destmem = shallow_copy_rtx (destmem);
5970 set_mem_size (destmem, rounded_count);
5971 }
5972 else if (MEM_SIZE_KNOWN_P (destmem))
5973 clear_mem_size (destmem);
5974
5975 if (issetmem)
5976 {
5977 value = force_reg (mode, gen_lowpart (mode, value));
5978 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5979 }
5980 else
5981 {
5982 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5983 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5984 if (mode != QImode)
5985 {
5986 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5987 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5988 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5989 }
5990 else
5991 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
5992 if (CONST_INT_P (count))
5993 {
5994 rounded_count
5995 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5996 srcmem = shallow_copy_rtx (srcmem);
5997 set_mem_size (srcmem, rounded_count);
5998 }
5999 else
6000 {
6001 if (MEM_SIZE_KNOWN_P (srcmem))
6002 clear_mem_size (srcmem);
6003 }
6004 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6005 destexp, srcexp));
6006 }
6007 }
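
/* The emitted "rep" sequence corresponds roughly to the following C for the
   memcpy case with MODE = SImode (a sketch; the DESTEXP/SRCEXP expressions
   additionally expose the final pointer values to the RTL patterns):

     long n = count >> 2;	// scale_counter: bytes -> dwords
     while (n--)		// rep movsd
       {
	 *(int *) dst = *(int *) src;
	 dst += 4;
	 src += 4;
       }
*/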
6008
6009 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6010 DESTMEM.
6011    SRCMEM is passed by pointer so it can be updated on return.
6012    The return value is the updated DESTMEM.  */
6013 static rtx
6014 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6015 HOST_WIDE_INT size_to_move)
6016 {
6017 rtx dst = destmem, src = *srcmem, adjust, tempreg;
6018 enum insn_code code;
6019 machine_mode move_mode;
6020 int piece_size, i;
6021
6022   /* Find the widest mode in which we could perform moves.
6023      Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6024      it until a move of that size is supported.  */
6025 piece_size = 1 << floor_log2 (size_to_move);
6026 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6027 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6028 {
6029 gcc_assert (piece_size > 1);
6030 piece_size >>= 1;
6031 }
6032
6033 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6034 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6035 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6036 {
6037 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6038 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6039 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6040 {
6041 move_mode = word_mode;
6042 piece_size = GET_MODE_SIZE (move_mode);
6043 code = optab_handler (mov_optab, move_mode);
6044 }
6045 }
6046 gcc_assert (code != CODE_FOR_nothing);
6047
6048 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6049 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6050
6051   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
6052 gcc_assert (size_to_move % piece_size == 0);
6053 adjust = GEN_INT (piece_size);
6054 for (i = 0; i < size_to_move; i += piece_size)
6055 {
6056 /* We move from memory to memory, so we'll need to do it via
6057 a temporary register. */
6058 tempreg = gen_reg_rtx (move_mode);
6059 emit_insn (GEN_FCN (code) (tempreg, src));
6060 emit_insn (GEN_FCN (code) (dst, tempreg));
6061
6062 emit_move_insn (destptr,
6063 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6064 emit_move_insn (srcptr,
6065 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
6066
6067 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6068 piece_size);
6069 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6070 piece_size);
6071 }
6072
6073 /* Update DST and SRC rtx. */
6074 *srcmem = src;
6075 return dst;
6076 }
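
/* Plain-C analogue of the mode selection above (a sketch; "move_exists"
   stands in for the int_mode_for_size/optab_handler checks):

     int piece = 1 << floor_log2 (size_to_move);  // largest power of 2 <= size
     while (!move_exists (piece))		   // halve until supported
       piece >>= 1;

   E.g. SIZE_TO_MOVE = 8 on a 64-bit target settles on DImode and emits a
   single load/store pair through a temporary register.  */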
6077
6078 /* Helper function for the string operations below.  Test whether the bits of
6079    VARIABLE selected by VALUE are zero and if so, jump to the returned label.  */
6080
6081 static rtx_code_label *
6082 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6083 {
6084 rtx_code_label *label = gen_label_rtx ();
6085 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6086 if (GET_MODE (variable) == DImode)
6087 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6088 else
6089 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6090 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6091 1, label);
6092 if (epilogue)
6093 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6094 else
6095 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6096 return label;
6097 }
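
/* The emitted test behaves like this C sketch:

     if ((variable & value) == 0)	// the bit selected by VALUE is clear,
       goto label;			// i.e. nothing of this size to handle
*/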
6098
6099
6100 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6101
6102 static void
6103 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6104 rtx destptr, rtx srcptr, rtx count, int max_size)
6105 {
6106 rtx src, dest;
6107 if (CONST_INT_P (count))
6108 {
6109 HOST_WIDE_INT countval = INTVAL (count);
6110 HOST_WIDE_INT epilogue_size = countval % max_size;
6111 int i;
6112
6113      /* For now MAX_SIZE should be a power of 2.  This assert could be
6114 	relaxed, but it would require a somewhat more complicated epilogue
6115 	expansion.  */
6116 gcc_assert ((max_size & (max_size - 1)) == 0);
6117 for (i = max_size; i >= 1; i >>= 1)
6118 {
6119 if (epilogue_size & i)
6120 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6121 }
6122 return;
6123 }
6124 if (max_size > 8)
6125 {
6126 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6127 count, 1, OPTAB_DIRECT);
6128 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6129 count, QImode, 1, 4, false);
6130 return;
6131 }
6132
6133   /* When single stringop instructions are available, we can cheaply increase
6134      the dest and src pointers.  Otherwise we save code size by maintaining an
6135      offset (zero is readily available from the preceding rep operation) and
6136      using x86 addressing modes.  */
6137 if (TARGET_SINGLE_STRINGOP)
6138 {
6139 if (max_size > 4)
6140 {
6141 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6142 src = change_address (srcmem, SImode, srcptr);
6143 dest = change_address (destmem, SImode, destptr);
6144 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6145 emit_label (label);
6146 LABEL_NUSES (label) = 1;
6147 }
6148 if (max_size > 2)
6149 {
6150 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6151 src = change_address (srcmem, HImode, srcptr);
6152 dest = change_address (destmem, HImode, destptr);
6153 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6154 emit_label (label);
6155 LABEL_NUSES (label) = 1;
6156 }
6157 if (max_size > 1)
6158 {
6159 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6160 src = change_address (srcmem, QImode, srcptr);
6161 dest = change_address (destmem, QImode, destptr);
6162 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6163 emit_label (label);
6164 LABEL_NUSES (label) = 1;
6165 }
6166 }
6167 else
6168 {
6169 rtx offset = force_reg (Pmode, const0_rtx);
6170 rtx tmp;
6171
6172 if (max_size > 4)
6173 {
6174 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6175 src = change_address (srcmem, SImode, srcptr);
6176 dest = change_address (destmem, SImode, destptr);
6177 emit_move_insn (dest, src);
6178 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6179 true, OPTAB_LIB_WIDEN);
6180 if (tmp != offset)
6181 emit_move_insn (offset, tmp);
6182 emit_label (label);
6183 LABEL_NUSES (label) = 1;
6184 }
6185 if (max_size > 2)
6186 {
6187 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6188 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6189 src = change_address (srcmem, HImode, tmp);
6190 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6191 dest = change_address (destmem, HImode, tmp);
6192 emit_move_insn (dest, src);
6193 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6194 true, OPTAB_LIB_WIDEN);
6195 if (tmp != offset)
6196 emit_move_insn (offset, tmp);
6197 emit_label (label);
6198 LABEL_NUSES (label) = 1;
6199 }
6200 if (max_size > 1)
6201 {
6202 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6203 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6204 src = change_address (srcmem, QImode, tmp);
6205 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6206 dest = change_address (destmem, QImode, tmp);
6207 emit_move_insn (dest, src);
6208 emit_label (label);
6209 LABEL_NUSES (label) = 1;
6210 }
6211 }
6212 }
6213
6214 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6215 with value PROMOTED_VAL.
6216    DESTPTR is advanced as the stores are emitted.
6217    The return value is the updated DESTMEM.  */
6218 static rtx
6219 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6220 HOST_WIDE_INT size_to_move)
6221 {
6222 rtx dst = destmem, adjust;
6223 enum insn_code code;
6224 machine_mode move_mode;
6225 int piece_size, i;
6226
6227   /* Find the widest mode in which we could perform moves.
6228      Start with the mode of the promoted value (QImode if it has none) and
6229      narrow it if SIZE_TO_MOVE is smaller than that mode's size.  */
6230 move_mode = GET_MODE (promoted_val);
6231 if (move_mode == VOIDmode)
6232 move_mode = QImode;
6233 if (size_to_move < GET_MODE_SIZE (move_mode))
6234 {
6235 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6236 move_mode = int_mode_for_size (move_bits, 0).require ();
6237 promoted_val = gen_lowpart (move_mode, promoted_val);
6238 }
6239 piece_size = GET_MODE_SIZE (move_mode);
6240 code = optab_handler (mov_optab, move_mode);
6241 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6242
6243 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6244
6245   /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
6246 gcc_assert (size_to_move % piece_size == 0);
6247 adjust = GEN_INT (piece_size);
6248 for (i = 0; i < size_to_move; i += piece_size)
6249 {
6250 if (piece_size <= GET_MODE_SIZE (word_mode))
6251 {
6252 emit_insn (gen_strset (destptr, dst, promoted_val));
6253 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6254 piece_size);
6255 continue;
6256 }
6257
6258 emit_insn (GEN_FCN (code) (dst, promoted_val));
6259
6260 emit_move_insn (destptr,
6261 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6262
6263 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6264 piece_size);
6265 }
6266
6267 /* Update DST rtx. */
6268 return dst;
6269 }
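
/* Example of the narrowing above (illustrative): if PROMOTED_VAL is the DImode
   constant 0x4141414141414141 but SIZE_TO_MOVE is 4, gen_lowpart switches to
   SImode and the stored value is the low part:

     unsigned int low = (unsigned int) 0x4141414141414141ULL;	// 0x41414141
*/
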
6270 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6271 static void
6272 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6273 rtx count, int max_size)
6274 {
6275 count = expand_simple_binop (counter_mode (count), AND, count,
6276 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6277 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6278 gen_lowpart (QImode, value), count, QImode,
6279 1, max_size / 2, true);
6280 }
6281
6282 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6283 static void
6284 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6285 rtx count, int max_size)
6286 {
6287 rtx dest;
6288
6289 if (CONST_INT_P (count))
6290 {
6291 HOST_WIDE_INT countval = INTVAL (count);
6292 HOST_WIDE_INT epilogue_size = countval % max_size;
6293 int i;
6294
6295      /* For now MAX_SIZE should be a power of 2.  This assert could be
6296 	relaxed, but it would require a somewhat more complicated epilogue
6297 	expansion.  */
6298 gcc_assert ((max_size & (max_size - 1)) == 0);
6299 for (i = max_size; i >= 1; i >>= 1)
6300 {
6301 if (epilogue_size & i)
6302 {
6303 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6304 destmem = emit_memset (destmem, destptr, vec_value, i);
6305 else
6306 destmem = emit_memset (destmem, destptr, value, i);
6307 }
6308 }
6309 return;
6310 }
6311 if (max_size > 32)
6312 {
6313 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6314 return;
6315 }
6316 if (max_size > 16)
6317 {
6318 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6319 if (TARGET_64BIT)
6320 {
6321 dest = change_address (destmem, DImode, destptr);
6322 emit_insn (gen_strset (destptr, dest, value));
6323 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6324 emit_insn (gen_strset (destptr, dest, value));
6325 }
6326 else
6327 {
6328 dest = change_address (destmem, SImode, destptr);
6329 emit_insn (gen_strset (destptr, dest, value));
6330 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6331 emit_insn (gen_strset (destptr, dest, value));
6332 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6333 emit_insn (gen_strset (destptr, dest, value));
6334 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6335 emit_insn (gen_strset (destptr, dest, value));
6336 }
6337 emit_label (label);
6338 LABEL_NUSES (label) = 1;
6339 }
6340 if (max_size > 8)
6341 {
6342 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6343 if (TARGET_64BIT)
6344 {
6345 dest = change_address (destmem, DImode, destptr);
6346 emit_insn (gen_strset (destptr, dest, value));
6347 }
6348 else
6349 {
6350 dest = change_address (destmem, SImode, destptr);
6351 emit_insn (gen_strset (destptr, dest, value));
6352 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6353 emit_insn (gen_strset (destptr, dest, value));
6354 }
6355 emit_label (label);
6356 LABEL_NUSES (label) = 1;
6357 }
6358 if (max_size > 4)
6359 {
6360 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6361 dest = change_address (destmem, SImode, destptr);
6362 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6363 emit_label (label);
6364 LABEL_NUSES (label) = 1;
6365 }
6366 if (max_size > 2)
6367 {
6368 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6369 dest = change_address (destmem, HImode, destptr);
6370 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6371 emit_label (label);
6372 LABEL_NUSES (label) = 1;
6373 }
6374 if (max_size > 1)
6375 {
6376 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6377 dest = change_address (destmem, QImode, destptr);
6378 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6379 emit_label (label);
6380 LABEL_NUSES (label) = 1;
6381 }
6382 }
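
/* For a non-constant COUNT the code above amounts to a chain of bit tests over
   the residual bytes; a C sketch for the 64-bit case with MAX_SIZE = 16:

     if (count & 8) { *(long long *) p = v8; p += 8; }
     if (count & 4) { *(int *)       p = v4; p += 4; }
     if (count & 2) { *(short *)     p = v2; p += 2; }
     if (count & 1) { *(char *)      p = v1; p += 1; }

   where v8/v4/v2/v1 are the promoted value truncated to the matching width.  */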
6383
6384 /* Decrease COUNTREG by VALUE.  */
6385 static void
6386 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6387 {
6388 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6389 }
6390
6391 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
6392    enough of VALUE into DESTMEM, to align it to DESIRED_ALIGNMENT.  The
6393    original alignment is ALIGN.  Depending on ISSETMEM, either arguments
6394    SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.
6395    The return value is the updated DESTMEM.  */
6396
6397 static rtx
6398 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6399 rtx destptr, rtx srcptr, rtx value,
6400 rtx vec_value, rtx count, int align,
6401 int desired_alignment, bool issetmem)
6402 {
6403 int i;
6404 for (i = 1; i < desired_alignment; i <<= 1)
6405 {
6406 if (align <= i)
6407 {
6408 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6409 if (issetmem)
6410 {
6411 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6412 destmem = emit_memset (destmem, destptr, vec_value, i);
6413 else
6414 destmem = emit_memset (destmem, destptr, value, i);
6415 }
6416 else
6417 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6418 ix86_adjust_counter (count, i);
6419 emit_label (label);
6420 LABEL_NUSES (label) = 1;
6421 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6422 }
6423 }
6424 return destmem;
6425 }
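
/* C-level sketch of the alignment prologue emitted above, for the memcpy case
   with ALIGN = 1 and DESIRED_ALIGNMENT = 8 (illustrative):

     if ((uintptr_t) dst & 1)
       { memcpy (dst, src, 1); dst += 1; src += 1; count -= 1; }
     if ((uintptr_t) dst & 2)
       { memcpy (dst, src, 2); dst += 2; src += 2; count -= 2; }
     if ((uintptr_t) dst & 4)
       { memcpy (dst, src, 4); dst += 4; src += 4; count -= 4; }

   After the chain, dst is 8-byte aligned and count has been reduced
   accordingly.  */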
6426
6427 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
6428    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6429    and jump to DONE_LABEL.  */
6430 static void
6431 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6432 rtx destptr, rtx srcptr,
6433 rtx value, rtx vec_value,
6434 rtx count, int size,
6435 rtx done_label, bool issetmem)
6436 {
6437 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6438 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6439 rtx modesize;
6440 int n;
6441
6442 /* If we do not have vector value to copy, we must reduce size. */
6443 if (issetmem)
6444 {
6445 if (!vec_value)
6446 {
6447 if (GET_MODE (value) == VOIDmode && size > 8)
6448 mode = Pmode;
6449 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6450 mode = GET_MODE (value);
6451 }
6452 else
6453 mode = GET_MODE (vec_value), value = vec_value;
6454 }
6455 else
6456 {
6457 /* Choose appropriate vector mode. */
6458 if (size >= 32)
6459 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6460 else if (size >= 16)
6461 mode = TARGET_SSE ? V16QImode : DImode;
6462 srcmem = change_address (srcmem, mode, srcptr);
6463 }
6464 destmem = change_address (destmem, mode, destptr);
6465 modesize = GEN_INT (GET_MODE_SIZE (mode));
6466 gcc_assert (GET_MODE_SIZE (mode) <= size);
6467 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6468 {
6469 if (issetmem)
6470 emit_move_insn (destmem, gen_lowpart (mode, value));
6471 else
6472 {
6473 emit_move_insn (destmem, srcmem);
6474 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6475 }
6476 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6477 }
6478
6479 destmem = offset_address (destmem, count, 1);
6480 destmem = offset_address (destmem, GEN_INT (-2 * size),
6481 GET_MODE_SIZE (mode));
6482 if (!issetmem)
6483 {
6484 srcmem = offset_address (srcmem, count, 1);
6485 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6486 GET_MODE_SIZE (mode));
6487 }
6488 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6489 {
6490 if (issetmem)
6491 emit_move_insn (destmem, gen_lowpart (mode, value));
6492 else
6493 {
6494 emit_move_insn (destmem, srcmem);
6495 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6496 }
6497 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6498 }
6499 emit_jump_insn (gen_jump (done_label));
6500 emit_barrier ();
6501
6502 emit_label (label);
6503 LABEL_NUSES (label) = 1;
6504 }
6505
6506 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
6507    and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6508    bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so we can
6509    proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
6510    DONE_LABEL is a label after the whole copying sequence.  The label is created
6511    on demand if *DONE_LABEL is NULL.
6512    MIN_SIZE is the minimal size of the copied block.  This value gets adjusted
6513    for the new bounds after the initial copies.
6514
6515    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6516    DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
6517    we will dispatch to a library call for large blocks.
6518
6519 In pseudocode we do:
6520
6521 if (COUNT < SIZE)
6522 {
6523 Assume that SIZE is 4. Bigger sizes are handled analogously
6524 if (COUNT & 4)
6525 {
6526 copy 4 bytes from SRCPTR to DESTPTR
6527 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6528 goto done_label
6529 }
6530 if (!COUNT)
6531 goto done_label;
6532 copy 1 byte from SRCPTR to DESTPTR
6533 if (COUNT & 2)
6534 {
6535 copy 2 bytes from SRCPTR to DESTPTR
6536 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6537 }
6538 }
6539 else
6540 {
6541 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6542 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6543
6544        OLD_DESTPTR = DESTPTR;
6545        Align DESTPTR up to DESIRED_ALIGN
6546        SRCPTR += DESTPTR - OLD_DESTPTR
6547        COUNT -= DESTPTR - OLD_DESTPTR
6548 if (DYNAMIC_CHECK)
6549 Round COUNT down to multiple of SIZE
6550 << optional caller supplied zero size guard is here >>
6551 << optional caller supplied dynamic check is here >>
6552 << caller supplied main copy loop is here >>
6553 }
6554 done_label:
6555 */
6556 static void
6557 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6558 rtx *destptr, rtx *srcptr,
6559 machine_mode mode,
6560 rtx value, rtx vec_value,
6561 rtx *count,
6562 rtx_code_label **done_label,
6563 int size,
6564 int desired_align,
6565 int align,
6566 unsigned HOST_WIDE_INT *min_size,
6567 bool dynamic_check,
6568 bool issetmem)
6569 {
6570 rtx_code_label *loop_label = NULL, *label;
6571 int n;
6572 rtx modesize;
6573 int prolog_size = 0;
6574 rtx mode_value;
6575
6576   /* Choose the proper value to copy.  */
6577 if (issetmem && VECTOR_MODE_P (mode))
6578 mode_value = vec_value;
6579 else
6580 mode_value = value;
6581 gcc_assert (GET_MODE_SIZE (mode) <= size);
6582
6583 /* See if block is big or small, handle small blocks. */
6584 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6585 {
6586 int size2 = size;
6587 loop_label = gen_label_rtx ();
6588
6589 if (!*done_label)
6590 *done_label = gen_label_rtx ();
6591
6592 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6593 1, loop_label);
6594 size2 >>= 1;
6595
6596 /* Handle sizes > 3. */
6597 for (;size2 > 2; size2 >>= 1)
6598 expand_small_cpymem_or_setmem (destmem, srcmem,
6599 *destptr, *srcptr,
6600 value, vec_value,
6601 *count,
6602 size2, *done_label, issetmem);
6603 /* Nothing to copy? Jump to DONE_LABEL if so */
6604 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6605 1, *done_label);
6606
6607 /* Do a byte copy. */
6608 destmem = change_address (destmem, QImode, *destptr);
6609 if (issetmem)
6610 emit_move_insn (destmem, gen_lowpart (QImode, value));
6611 else
6612 {
6613 srcmem = change_address (srcmem, QImode, *srcptr);
6614 emit_move_insn (destmem, srcmem);
6615 }
6616
6617 /* Handle sizes 2 and 3. */
6618 label = ix86_expand_aligntest (*count, 2, false);
6619 destmem = change_address (destmem, HImode, *destptr);
6620 destmem = offset_address (destmem, *count, 1);
6621 destmem = offset_address (destmem, GEN_INT (-2), 2);
6622 if (issetmem)
6623 emit_move_insn (destmem, gen_lowpart (HImode, value));
6624 else
6625 {
6626 srcmem = change_address (srcmem, HImode, *srcptr);
6627 srcmem = offset_address (srcmem, *count, 1);
6628 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6629 emit_move_insn (destmem, srcmem);
6630 }
6631
6632 emit_label (label);
6633 LABEL_NUSES (label) = 1;
6634 emit_jump_insn (gen_jump (*done_label));
6635 emit_barrier ();
6636 }
6637 else
6638 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6639 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6640
6641 /* Start memcpy for COUNT >= SIZE. */
6642 if (loop_label)
6643 {
6644 emit_label (loop_label);
6645 LABEL_NUSES (loop_label) = 1;
6646 }
6647
6648 /* Copy first desired_align bytes. */
6649 if (!issetmem)
6650 srcmem = change_address (srcmem, mode, *srcptr);
6651 destmem = change_address (destmem, mode, *destptr);
6652 modesize = GEN_INT (GET_MODE_SIZE (mode));
6653 for (n = 0; prolog_size < desired_align - align; n++)
6654 {
6655 if (issetmem)
6656 emit_move_insn (destmem, mode_value);
6657 else
6658 {
6659 emit_move_insn (destmem, srcmem);
6660 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6661 }
6662 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6663 prolog_size += GET_MODE_SIZE (mode);
6664 }
6665
6666
6667 /* Copy last SIZE bytes. */
6668 destmem = offset_address (destmem, *count, 1);
6669 destmem = offset_address (destmem,
6670 GEN_INT (-size - prolog_size),
6671 1);
6672 if (issetmem)
6673 emit_move_insn (destmem, mode_value);
6674 else
6675 {
6676 srcmem = offset_address (srcmem, *count, 1);
6677 srcmem = offset_address (srcmem,
6678 GEN_INT (-size - prolog_size),
6679 1);
6680 emit_move_insn (destmem, srcmem);
6681 }
6682 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6683 {
6684 destmem = offset_address (destmem, modesize, 1);
6685 if (issetmem)
6686 emit_move_insn (destmem, mode_value);
6687 else
6688 {
6689 srcmem = offset_address (srcmem, modesize, 1);
6690 emit_move_insn (destmem, srcmem);
6691 }
6692 }
6693
6694 /* Align destination. */
6695 if (desired_align > 1 && desired_align > align)
6696 {
6697 rtx saveddest = *destptr;
6698
6699 gcc_assert (desired_align <= size);
6700       /* Align destptr up and place it in a new register.  */
6701 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6702 GEN_INT (prolog_size),
6703 NULL_RTX, 1, OPTAB_DIRECT);
6704 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6705 REG_POINTER (*destptr) = 1;
6706 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6707 GEN_INT (-desired_align),
6708 *destptr, 1, OPTAB_DIRECT);
6709 /* See how many bytes we skipped. */
6710 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6711 *destptr,
6712 saveddest, 1, OPTAB_DIRECT);
6713 /* Adjust srcptr and count. */
6714 if (!issetmem)
6715 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6716 saveddest, *srcptr, 1, OPTAB_DIRECT);
6717 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6718 saveddest, *count, 1, OPTAB_DIRECT);
6719 /* We copied at most size + prolog_size. */
6720 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6721 *min_size
6722 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6723 else
6724 *min_size = 0;
6725
6726       /* Our loops always round down the block size, but for dispatch to the
6727 	 library call we need the precise value.  */
6728 if (dynamic_check)
6729 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6730 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6731 }
6732 else
6733 {
6734 gcc_assert (prolog_size == 0);
6735 /* Decrease count, so we won't end up copying last word twice. */
6736 if (!CONST_INT_P (*count))
6737 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6738 constm1_rtx, *count, 1, OPTAB_DIRECT);
6739 else
6740 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6741 (unsigned HOST_WIDE_INT)size));
6742 if (*min_size)
6743 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6744 }
6745 }
6746
6747
6748 /* This function is like the previous one, except here we know how many bytes
6749 need to be copied. That allows us to update alignment not only of DST, which
6750 is returned, but also of SRC, which is passed as a pointer for that
6751 reason. */
6752 static rtx
6753 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6754 rtx srcreg, rtx value, rtx vec_value,
6755 int desired_align, int align_bytes,
6756 bool issetmem)
6757 {
6758 rtx src = NULL;
6759 rtx orig_dst = dst;
6760 rtx orig_src = NULL;
6761 int piece_size = 1;
6762 int copied_bytes = 0;
6763
6764 if (!issetmem)
6765 {
6766 gcc_assert (srcp != NULL);
6767 src = *srcp;
6768 orig_src = src;
6769 }
6770
6771 for (piece_size = 1;
6772 piece_size <= desired_align && copied_bytes < align_bytes;
6773 piece_size <<= 1)
6774 {
6775 if (align_bytes & piece_size)
6776 {
6777 if (issetmem)
6778 {
6779 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6780 dst = emit_memset (dst, destreg, vec_value, piece_size);
6781 else
6782 dst = emit_memset (dst, destreg, value, piece_size);
6783 }
6784 else
6785 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6786 copied_bytes += piece_size;
6787 }
6788 }
6789 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6790 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6791 if (MEM_SIZE_KNOWN_P (orig_dst))
6792 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6793
6794 if (!issetmem)
6795 {
6796 int src_align_bytes = get_mem_align_offset (src, desired_align
6797 * BITS_PER_UNIT);
6798 if (src_align_bytes >= 0)
6799 src_align_bytes = desired_align - src_align_bytes;
6800 if (src_align_bytes >= 0)
6801 {
6802 unsigned int src_align;
6803 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6804 {
6805 if ((src_align_bytes & (src_align - 1))
6806 == (align_bytes & (src_align - 1)))
6807 break;
6808 }
6809 if (src_align > (unsigned int) desired_align)
6810 src_align = desired_align;
6811 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6812 set_mem_align (src, src_align * BITS_PER_UNIT);
6813 }
6814 if (MEM_SIZE_KNOWN_P (orig_src))
6815 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6816 *srcp = src;
6817 }
6818
6819 return dst;
6820 }
6821
6822 /* Return true if ALG can be used in current context.
6823 Assume we expand memset if MEMSET is true. */
6824 static bool
6825 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6826 {
6827 if (alg == no_stringop)
6828 return false;
6829 if (alg == vector_loop)
6830 return TARGET_SSE || TARGET_AVX;
6831 /* Algorithms using the rep prefix want at least edi and ecx;
6832 additionally, memset wants eax and memcpy wants esi. Don't
6833 consider such algorithms if the user has appropriated those
6834 registers for their own purposes, or if we have a non-default
6835 address space, since some string insns cannot override the segment. */
6836 if (alg == rep_prefix_1_byte
6837 || alg == rep_prefix_4_byte
6838 || alg == rep_prefix_8_byte)
6839 {
6840 if (have_as)
6841 return false;
6842 if (fixed_regs[CX_REG]
6843 || fixed_regs[DI_REG]
6844 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6845 return false;
6846 }
6847 return true;
6848 }
6849
6850 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6851 static enum stringop_alg
6852 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6853 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6854 bool memset, bool zero_memset, bool have_as,
6855 int *dynamic_check, bool *noalign, bool recur)
6856 {
6857 const struct stringop_algs *algs;
6858 bool optimize_for_speed;
6859 int max = 0;
6860 const struct processor_costs *cost;
6861 int i;
6862 bool any_alg_usable_p = false;
6863
6864 *noalign = false;
6865 *dynamic_check = -1;
6866
6867 /* Even if the string operation call is cold, we still might spend a lot
6868 of time processing large blocks. */
6869 if (optimize_function_for_size_p (cfun)
6870 || (optimize_insn_for_size_p ()
6871 && (max_size < 256
6872 || (expected_size != -1 && expected_size < 256))))
6873 optimize_for_speed = false;
6874 else
6875 optimize_for_speed = true;
6876
6877 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6878 if (memset)
6879 algs = &cost->memset[TARGET_64BIT != 0];
6880 else
6881 algs = &cost->memcpy[TARGET_64BIT != 0];
6882
6883 /* See maximal size for user defined algorithm. */
6884 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6885 {
6886 enum stringop_alg candidate = algs->size[i].alg;
6887 bool usable = alg_usable_p (candidate, memset, have_as);
6888 any_alg_usable_p |= usable;
6889
6890 if (candidate != libcall && candidate && usable)
6891 max = algs->size[i].max;
6892 }
6893
6894   /* If the expected size is not known but the max size is small enough
6895      that the inline version is a win, set the expected size into
6896      the range.  */
6897 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6898 && expected_size == -1)
6899 expected_size = min_size / 2 + max_size / 2;
6900
6901 /* If user specified the algorithm, honor it if possible. */
6902 if (ix86_stringop_alg != no_stringop
6903 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6904 return ix86_stringop_alg;
6905 /* rep; movq or rep; movl is the smallest variant. */
6906 else if (!optimize_for_speed)
6907 {
6908 *noalign = true;
6909 if (!count || (count & 3) || (memset && !zero_memset))
6910 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6911 ? rep_prefix_1_byte : loop_1_byte;
6912 else
6913 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6914 ? rep_prefix_4_byte : loop;
6915 }
6916   /* Very tiny blocks are best handled via the loop; REP is expensive to
6917      set up.  */
6918 else if (expected_size != -1 && expected_size < 4)
6919 return loop_1_byte;
6920 else if (expected_size != -1)
6921 {
6922 enum stringop_alg alg = libcall;
6923 bool alg_noalign = false;
6924 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6925 {
6926 /* We get here if the algorithms that were not libcall-based
6927 were rep-prefix based and we are unable to use rep prefixes
6928 based on global register usage. Break out of the loop and
6929 use the heuristic below. */
6930 if (algs->size[i].max == 0)
6931 break;
6932 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6933 {
6934 enum stringop_alg candidate = algs->size[i].alg;
6935
6936 if (candidate != libcall
6937 && alg_usable_p (candidate, memset, have_as))
6938 {
6939 alg = candidate;
6940 alg_noalign = algs->size[i].noalign;
6941 }
6942 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6943 last non-libcall inline algorithm. */
6944 if (TARGET_INLINE_ALL_STRINGOPS)
6945 {
6946 /* When the current size is best to be copied by a libcall,
6947 but we are still forced to inline, run the heuristic below
6948 that will pick code for medium sized blocks. */
6949 if (alg != libcall)
6950 {
6951 *noalign = alg_noalign;
6952 return alg;
6953 }
6954 else if (!any_alg_usable_p)
6955 break;
6956 }
6957 else if (alg_usable_p (candidate, memset, have_as))
6958 {
6959 *noalign = algs->size[i].noalign;
6960 return candidate;
6961 }
6962 }
6963 }
6964 }
6965   /* When asked to inline the call anyway, try to pick a meaningful choice.
6966      We look for the maximal size of block that is faster to copy by hand and
6967      take blocks of at most that size, guessing that the average size will
6968      be roughly half of the block.
6969
6970 If this turns out to be bad, we might simply specify the preferred
6971 choice in ix86_costs. */
6972 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6973 && (algs->unknown_size == libcall
6974 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6975 {
6976 enum stringop_alg alg;
6977 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6978
6979 /* If there aren't any usable algorithms or if recursing already,
6980 then recursing on smaller sizes or same size isn't going to
6981 find anything. Just return the simple byte-at-a-time copy loop. */
6982 if (!any_alg_usable_p || recur)
6983 {
6984 /* Pick something reasonable. */
6985 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6986 *dynamic_check = 128;
6987 return loop_1_byte;
6988 }
6989 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6990 zero_memset, have_as, dynamic_check, noalign, true);
6991 gcc_assert (*dynamic_check == -1);
6992 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6993 *dynamic_check = max;
6994 else
6995 gcc_assert (alg != libcall);
6996 return alg;
6997 }
6998 return (alg_usable_p (algs->unknown_size, memset, have_as)
6999 ? algs->unknown_size : libcall);
7000 }
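
/* Example of the size-table walk above (illustrative numbers only, not taken
   from any real cost table).  Given

     static const struct stringop_algs example
       = {libcall, {{256, loop, false},
		    {8192, rep_prefix_8_byte, false},
		    {-1, libcall, false}}};

   an EXPECTED_SIZE of 100 selects "loop", 4000 selects rep_prefix_8_byte
   (provided rdi/rcx/rsi are not fixed and there is no non-default address
   space), and larger sizes fall back to the libcall.  */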
7001
7002 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7003 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7004 static int
7005 decide_alignment (int align,
7006 enum stringop_alg alg,
7007 int expected_size,
7008 machine_mode move_mode)
7009 {
7010 int desired_align = 0;
7011
7012 gcc_assert (alg != no_stringop);
7013
7014 if (alg == libcall)
7015 return 0;
7016 if (move_mode == VOIDmode)
7017 return 0;
7018
7019 desired_align = GET_MODE_SIZE (move_mode);
7020   /* PentiumPro has special logic triggering for 8-byte aligned blocks,
7021      copying a whole cacheline at once.  */
7022 if (TARGET_PENTIUMPRO
7023 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7024 desired_align = 8;
7025
7026 if (optimize_size)
7027 desired_align = 1;
7028 if (desired_align < align)
7029 desired_align = align;
7030 if (expected_size != -1 && expected_size < 4)
7031 desired_align = align;
7032
7033 return desired_align;
7034 }
7035
7036
7037 /* Helper function for memset.  For a QImode value 0xXY produce
7038    0xXYXYXYXY of the width specified by MODE.  This is essentially
7039    a multiplication by 0x01010101, but we can do slightly better than
7040    synth_mult by unwinding the sequence by hand on CPUs with a
7041    slow multiply.  */
7042 static rtx
7043 promote_duplicated_reg (machine_mode mode, rtx val)
7044 {
7045 machine_mode valmode = GET_MODE (val);
7046 rtx tmp;
7047 int nops = mode == DImode ? 3 : 2;
7048
7049 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7050 if (val == const0_rtx)
7051 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7052 if (CONST_INT_P (val))
7053 {
7054 HOST_WIDE_INT v = INTVAL (val) & 255;
7055
7056 v |= v << 8;
7057 v |= v << 16;
7058 if (mode == DImode)
7059 v |= (v << 16) << 16;
7060 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7061 }
7062
7063 if (valmode == VOIDmode)
7064 valmode = QImode;
7065 if (valmode != QImode)
7066 val = gen_lowpart (QImode, val);
7067 if (mode == QImode)
7068 return val;
7069 if (!TARGET_PARTIAL_REG_STALL)
7070 nops--;
7071 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7072 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7073 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7074 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7075 {
7076 rtx reg = convert_modes (mode, QImode, val, true);
7077 tmp = promote_duplicated_reg (mode, const1_rtx);
7078 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7079 OPTAB_DIRECT);
7080 }
7081 else
7082 {
7083 rtx reg = convert_modes (mode, QImode, val, true);
7084
7085 if (!TARGET_PARTIAL_REG_STALL)
7086 if (mode == SImode)
7087 emit_insn (gen_insvsi_1 (reg, reg));
7088 else
7089 emit_insn (gen_insvdi_1 (reg, reg));
7090 else
7091 {
7092 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7093 NULL, 1, OPTAB_DIRECT);
7094 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7095 OPTAB_DIRECT);
7096 }
7097 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7098 NULL, 1, OPTAB_DIRECT);
7099 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7100 if (mode == SImode)
7101 return reg;
7102 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7103 NULL, 1, OPTAB_DIRECT);
7104 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7105 return reg;
7106 }
7107 }
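
/* Worked example of the shift/or expansion above (illustrative): promoting the
   byte 0x5A to DImode without a multiply computes

     unsigned long long v = 0x5a;
     v |= v << 8;	// 0x5a5a
     v |= v << 16;	// 0x5a5a5a5a
     v |= v << 32;	// 0x5a5a5a5a5a5a5a5a

   which is the same value as 0x5a * 0x0101010101010101 produced by the
   multiply variant.  */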
7108
7109 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7110    will be needed by the main loop copying SIZE_NEEDED chunks and by the
7111    prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
7112 static rtx
7113 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7114 int align)
7115 {
7116 rtx promoted_val;
7117
7118 if (TARGET_64BIT
7119 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7120 promoted_val = promote_duplicated_reg (DImode, val);
7121 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7122 promoted_val = promote_duplicated_reg (SImode, val);
7123 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7124 promoted_val = promote_duplicated_reg (HImode, val);
7125 else
7126 promoted_val = val;
7127
7128 return promoted_val;
7129 }
7130
7131 /* Copy the address to a Pmode register. This is used for x32 to
7132 truncate DImode TLS address to a SImode register. */
7133
7134 static rtx
7135 ix86_copy_addr_to_reg (rtx addr)
7136 {
7137 rtx reg;
7138 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7139 {
7140 reg = copy_addr_to_reg (addr);
7141 REG_POINTER (reg) = 1;
7142 return reg;
7143 }
7144 else
7145 {
7146 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7147 reg = copy_to_mode_reg (DImode, addr);
7148 REG_POINTER (reg) = 1;
7149 return gen_rtx_SUBREG (SImode, reg, 0);
7150 }
7151 }
7152
7153 /* Expand a string move (memcpy) or store (memset) operation.  Use i386 string
7154 operations when profitable. The code depends upon architecture, block size
7155 and alignment, but always has one of the following overall structures:
7156
7157 Aligned move sequence:
7158
7159 1) Prologue guard: Conditional that jumps up to epilogues for small
7160 blocks that can be handled by epilogue alone. This is faster
7161       but also needed for correctness, since the prologue assumes the block
7162 is larger than the desired alignment.
7163
7164 Optional dynamic check for size and libcall for large
7165 blocks is emitted here too, with -minline-stringops-dynamically.
7166
7167 2) Prologue: copy first few bytes in order to get destination
7168 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7169 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7170 copied. We emit either a jump tree on power of two sized
7171 blocks, or a byte loop.
7172
7173 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7174 with specified algorithm.
7175
7176 4) Epilogue: code copying tail of the block that is too small to be
7177 handled by main body (or up to size guarded by prologue guard).
7178
7179 Misaligned move sequence
7180
7181    1) Misaligned move prologue/epilogue containing:
7182 a) Prologue handling small memory blocks and jumping to done_label
7183 (skipped if blocks are known to be large enough)
7184        b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
7185 	  is needed, done by a single possibly misaligned move
7186 (skipped if alignment is not needed)
7187 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7188
7189 2) Zero size guard dispatching to done_label, if needed
7190
7191 3) dispatch to library call, if needed,
7192
7193    4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7194       with the specified algorithm.  */
7195 bool
7196 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7197 rtx align_exp, rtx expected_align_exp,
7198 rtx expected_size_exp, rtx min_size_exp,
7199 rtx max_size_exp, rtx probable_max_size_exp,
7200 bool issetmem)
7201 {
7202 rtx destreg;
7203 rtx srcreg = NULL;
7204 rtx_code_label *label = NULL;
7205 rtx tmp;
7206 rtx_code_label *jump_around_label = NULL;
7207 HOST_WIDE_INT align = 1;
7208 unsigned HOST_WIDE_INT count = 0;
7209 HOST_WIDE_INT expected_size = -1;
7210 int size_needed = 0, epilogue_size_needed;
7211 int desired_align = 0, align_bytes = 0;
7212 enum stringop_alg alg;
7213 rtx promoted_val = NULL;
7214 rtx vec_promoted_val = NULL;
7215 bool force_loopy_epilogue = false;
7216 int dynamic_check;
7217 bool need_zero_guard = false;
7218 bool noalign;
7219 machine_mode move_mode = VOIDmode;
7220 machine_mode wider_mode;
7221 int unroll_factor = 1;
7222 /* TODO: Once value ranges are available, fill in proper data. */
7223 unsigned HOST_WIDE_INT min_size = 0;
7224 unsigned HOST_WIDE_INT max_size = -1;
7225 unsigned HOST_WIDE_INT probable_max_size = -1;
7226 bool misaligned_prologue_used = false;
7227 bool have_as;
7228
7229 if (CONST_INT_P (align_exp))
7230 align = INTVAL (align_exp);
7231   /* i386 can do misaligned access at reasonably increased cost.  */
7232 if (CONST_INT_P (expected_align_exp)
7233 && INTVAL (expected_align_exp) > align)
7234 align = INTVAL (expected_align_exp);
7235 /* ALIGN is the minimum of destination and source alignment, but we care here
7236 just about destination alignment. */
7237 else if (!issetmem
7238 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7239 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7240
7241 if (CONST_INT_P (count_exp))
7242 {
7243 min_size = max_size = probable_max_size = count = expected_size
7244 = INTVAL (count_exp);
7245 /* When COUNT is 0, there is nothing to do. */
7246 if (!count)
7247 return true;
7248 }
7249 else
7250 {
7251 if (min_size_exp)
7252 min_size = INTVAL (min_size_exp);
7253 if (max_size_exp)
7254 max_size = INTVAL (max_size_exp);
7255 if (probable_max_size_exp)
7256 probable_max_size = INTVAL (probable_max_size_exp);
7257 if (CONST_INT_P (expected_size_exp))
7258 expected_size = INTVAL (expected_size_exp);
7259 }
7260
7261 /* Make sure we don't need to care about overflow later on. */
7262 if (count > (HOST_WIDE_INT_1U << 30))
7263 return false;
7264
7265 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7266 if (!issetmem)
7267 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7268
7269 /* Step 0: Decide on preferred algorithm, desired alignment and
7270 size of chunks to be copied by main loop. */
7271 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7272 issetmem,
7273 issetmem && val_exp == const0_rtx, have_as,
7274 &dynamic_check, &noalign, false);
7275
7276 if (dump_file)
7277 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7278 stringop_alg_names[alg]);
7279
7280 if (alg == libcall)
7281 return false;
7282 gcc_assert (alg != no_stringop);
7283
7284   /* For now the vector version of memset is generated only for memory zeroing,
7285      as creating the promoted vector value is very cheap in this case.  */
7286 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7287 alg = unrolled_loop;
7288
7289 if (!count)
7290 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7291 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7292 if (!issetmem)
7293 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7294
7295 unroll_factor = 1;
7296 move_mode = word_mode;
7297 switch (alg)
7298 {
7299 case libcall:
7300 case no_stringop:
7301 case last_alg:
7302 gcc_unreachable ();
7303 case loop_1_byte:
7304 need_zero_guard = true;
7305 move_mode = QImode;
7306 break;
7307 case loop:
7308 need_zero_guard = true;
7309 break;
7310 case unrolled_loop:
7311 need_zero_guard = true;
7312 unroll_factor = (TARGET_64BIT ? 4 : 2);
7313 break;
7314 case vector_loop:
7315 need_zero_guard = true;
7316 unroll_factor = 4;
7317 /* Find the widest supported mode. */
7318 move_mode = word_mode;
7319 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7320 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7321 move_mode = wider_mode;
7322
7323 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7324 move_mode = TImode;
7325
7326 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7327 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7328 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7329 {
7330 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7331 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7332 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7333 move_mode = word_mode;
7334 }
7335 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7336 break;
7337 case rep_prefix_8_byte:
7338 move_mode = DImode;
7339 break;
7340 case rep_prefix_4_byte:
7341 move_mode = SImode;
7342 break;
7343 case rep_prefix_1_byte:
7344 move_mode = QImode;
7345 break;
7346 }
7347 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7348 epilogue_size_needed = size_needed;
7349
7350   /* If we are going to make any library calls conditionally, make sure any
7351      pending stack adjustment happens before the first conditional branch,
7352      otherwise it will be emitted before the library call only and won't
7353      happen on the other branches.  */
7354 if (dynamic_check != -1)
7355 do_pending_stack_adjust ();
7356
7357 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7358 if (!TARGET_ALIGN_STRINGOPS || noalign)
7359 align = desired_align;
7360
7361 /* Step 1: Prologue guard. */
7362
7363 /* Alignment code needs count to be in register. */
7364 if (CONST_INT_P (count_exp) && desired_align > align)
7365 {
7366 if (INTVAL (count_exp) > desired_align
7367 && INTVAL (count_exp) > size_needed)
7368 {
7369 align_bytes
7370 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7371 if (align_bytes <= 0)
7372 align_bytes = 0;
7373 else
7374 align_bytes = desired_align - align_bytes;
7375 }
7376 if (align_bytes == 0)
7377 count_exp = force_reg (counter_mode (count_exp), count_exp);
7378 }
7379 gcc_assert (desired_align >= 1 && align >= 1);
7380
7381 /* Misaligned move sequences handle both prologue and epilogue at once.
7382      Default code generation results in smaller code for large alignments
7383      and also avoids redundant work when sizes are known precisely.  */
7384 misaligned_prologue_used
7385 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7386 && MAX (desired_align, epilogue_size_needed) <= 32
7387 && desired_align <= epilogue_size_needed
7388 && ((desired_align > align && !align_bytes)
7389 || (!count && epilogue_size_needed > 1)));
7390
7391   /* Do the cheap promotion to allow better CSE across the
7392      main loop and epilogue (i.e. one load of the big constant in
7393      front of all the code).
7394      For now the misaligned move sequences do not have a fast path
7395      without broadcasting.  */
7396 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7397 {
7398 if (alg == vector_loop)
7399 {
7400 gcc_assert (val_exp == const0_rtx);
7401 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7402 promoted_val = promote_duplicated_reg_to_size (val_exp,
7403 GET_MODE_SIZE (word_mode),
7404 desired_align, align);
7405 }
7406 else
7407 {
7408 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7409 desired_align, align);
7410 }
7411 }
7412   /* Misaligned move sequences handle both prologues and epilogues at once.
7413      Default code generation results in smaller code for large alignments and
7414      also avoids redundant work when sizes are known precisely.  */
7415 if (misaligned_prologue_used)
7416 {
7417       /* The misaligned move prologue handled small blocks by itself.  */
7418 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7419 (dst, src, &destreg, &srcreg,
7420 move_mode, promoted_val, vec_promoted_val,
7421 &count_exp,
7422 &jump_around_label,
7423 desired_align < align
7424 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7425 desired_align, align, &min_size, dynamic_check, issetmem);
7426 if (!issetmem)
7427 src = change_address (src, BLKmode, srcreg);
7428 dst = change_address (dst, BLKmode, destreg);
7429 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7430 epilogue_size_needed = 0;
7431 if (need_zero_guard
7432 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7433 {
7434 /* It is possible that we copied enough so the main loop will not
7435 execute. */
7436 gcc_assert (size_needed > 1);
7437 if (jump_around_label == NULL_RTX)
7438 jump_around_label = gen_label_rtx ();
7439 emit_cmp_and_jump_insns (count_exp,
7440 GEN_INT (size_needed),
7441 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7442 if (expected_size == -1
7443 || expected_size < (desired_align - align) / 2 + size_needed)
7444 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7445 else
7446 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7447 }
7448 }
7449 /* Ensure that alignment prologue won't copy past end of block. */
7450 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7451 {
7452 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7453 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7454 	 Make sure it is a power of 2.  */
7455 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
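      /* Illustrative note (not part of the original comment): 1 << (floor_log2 (x) + 1)
	 is the smallest power of two strictly greater than X, so e.g. a value
	 of 5 becomes 8 and a value of 16 becomes 32.  */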
7456
7457       /* To improve performance of small blocks, we jump around the VAL
7458 	 promoting code.  This means that if the promoted VAL is not constant,
7459 	 we might not use it in the epilogue and have to use the byte
7460 	 loop variant.  */
7461 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7462 force_loopy_epilogue = true;
7463 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7464 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7465 {
7466 	  /* If the main algorithm works on QImode, no epilogue is needed.
7467 	     For small sizes just don't align anything.  */
7468 if (size_needed == 1)
7469 desired_align = align;
7470 else
7471 goto epilogue;
7472 }
7473 else if (!count
7474 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7475 {
7476 label = gen_label_rtx ();
7477 emit_cmp_and_jump_insns (count_exp,
7478 GEN_INT (epilogue_size_needed),
7479 LTU, 0, counter_mode (count_exp), 1, label);
7480 if (expected_size == -1 || expected_size < epilogue_size_needed)
7481 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7482 else
7483 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7484 }
7485 }
7486
7487   /* Emit code to decide at runtime whether a library call or inline code
7488      should be used.  */
7489 if (dynamic_check != -1)
7490 {
7491 if (!issetmem && CONST_INT_P (count_exp))
7492 {
7493 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7494 {
7495 emit_block_copy_via_libcall (dst, src, count_exp);
7496 count_exp = const0_rtx;
7497 goto epilogue;
7498 }
7499 }
7500 else
7501 {
7502 rtx_code_label *hot_label = gen_label_rtx ();
7503 if (jump_around_label == NULL_RTX)
7504 jump_around_label = gen_label_rtx ();
7505 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7506 LEU, 0, counter_mode (count_exp),
7507 1, hot_label);
7508 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7509 if (issetmem)
7510 set_storage_via_libcall (dst, count_exp, val_exp);
7511 else
7512 emit_block_copy_via_libcall (dst, src, count_exp);
7513 emit_jump (jump_around_label);
7514 emit_label (hot_label);
7515 }
7516 }
7517
7518 /* Step 2: Alignment prologue. */
7519   /* Do the expensive promotion once we have branched off the small blocks.  */
7520 if (issetmem && !promoted_val)
7521 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7522 desired_align, align);
7523
7524 if (desired_align > align && !misaligned_prologue_used)
7525 {
7526 if (align_bytes == 0)
7527 {
7528 	  /* Except for the first move in the prologue, we no longer know
7529 	     the constant offset in the aliasing info.  It doesn't seem worth
7530 	     the pain to maintain it for the first move, so throw away
7531 	     the info early.  */
7532 dst = change_address (dst, BLKmode, destreg);
7533 if (!issetmem)
7534 src = change_address (src, BLKmode, srcreg);
7535 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7536 promoted_val, vec_promoted_val,
7537 count_exp, align, desired_align,
7538 issetmem);
7539 /* At most desired_align - align bytes are copied. */
7540 if (min_size < (unsigned)(desired_align - align))
7541 min_size = 0;
7542 else
7543 min_size -= desired_align - align;
7544 }
7545 else
7546 {
7547 /* If we know how many bytes need to be stored before dst is
7548 sufficiently aligned, maintain aliasing info accurately. */
7549 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7550 srcreg,
7551 promoted_val,
7552 vec_promoted_val,
7553 desired_align,
7554 align_bytes,
7555 issetmem);
7556
7557 count_exp = plus_constant (counter_mode (count_exp),
7558 count_exp, -align_bytes);
7559 count -= align_bytes;
7560 min_size -= align_bytes;
7561 max_size -= align_bytes;
7562 }
7563 if (need_zero_guard
7564 && min_size < (unsigned HOST_WIDE_INT) size_needed
7565 && (count < (unsigned HOST_WIDE_INT) size_needed
7566 || (align_bytes == 0
7567 && count < ((unsigned HOST_WIDE_INT) size_needed
7568 + desired_align - align))))
7569 {
7570 /* It is possible that we copied enough so the main loop will not
7571 execute. */
7572 gcc_assert (size_needed > 1);
7573 if (label == NULL_RTX)
7574 label = gen_label_rtx ();
7575 emit_cmp_and_jump_insns (count_exp,
7576 GEN_INT (size_needed),
7577 LTU, 0, counter_mode (count_exp), 1, label);
7578 if (expected_size == -1
7579 || expected_size < (desired_align - align) / 2 + size_needed)
7580 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7581 else
7582 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7583 }
7584 }
7585 if (label && size_needed == 1)
7586 {
7587 emit_label (label);
7588 LABEL_NUSES (label) = 1;
7589 label = NULL;
7590 epilogue_size_needed = 1;
7591 if (issetmem)
7592 promoted_val = val_exp;
7593 }
7594 else if (label == NULL_RTX && !misaligned_prologue_used)
7595 epilogue_size_needed = size_needed;
7596
7597 /* Step 3: Main loop. */
7598
7599 switch (alg)
7600 {
7601 case libcall:
7602 case no_stringop:
7603 case last_alg:
7604 gcc_unreachable ();
7605 case loop_1_byte:
7606 case loop:
7607 case unrolled_loop:
7608 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7609 count_exp, move_mode, unroll_factor,
7610 expected_size, issetmem);
7611 break;
7612 case vector_loop:
7613 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7614 vec_promoted_val, count_exp, move_mode,
7615 unroll_factor, expected_size, issetmem);
7616 break;
7617 case rep_prefix_8_byte:
7618 case rep_prefix_4_byte:
7619 case rep_prefix_1_byte:
7620 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7621 val_exp, count_exp, move_mode, issetmem);
7622 break;
7623 }
7624   /* Properly adjust the offsets of the src and dest memory for aliasing.  */
7625 if (CONST_INT_P (count_exp))
7626 {
7627 if (!issetmem)
7628 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7629 (count / size_needed) * size_needed);
7630 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7631 (count / size_needed) * size_needed);
7632 }
7633 else
7634 {
7635 if (!issetmem)
7636 src = change_address (src, BLKmode, srcreg);
7637 dst = change_address (dst, BLKmode, destreg);
7638 }
7639
7640 /* Step 4: Epilogue to copy the remaining bytes. */
7641 epilogue:
7642 if (label)
7643 {
7644       /* When the main loop is done, COUNT_EXP might hold the original count,
7645 	 while we want to copy only COUNT_EXP % SIZE_NEEDED bytes.
7646 	 Epilogue code will actually copy COUNT_EXP % EPILOGUE_SIZE_NEEDED
7647 	 bytes.  Compensate if needed.  */
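      /* As a rough illustration: with SIZE_NEEDED == 16 the AND below masks
	 COUNT_EXP with 15, leaving only the tail bytes that the main loop
	 did not handle.  */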
7648
7649 if (size_needed < epilogue_size_needed)
7650 {
7651 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7652 GEN_INT (size_needed - 1), count_exp, 1,
7653 OPTAB_DIRECT);
7654 if (tmp != count_exp)
7655 emit_move_insn (count_exp, tmp);
7656 }
7657 emit_label (label);
7658 LABEL_NUSES (label) = 1;
7659 }
7660
7661 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7662 {
7663 if (force_loopy_epilogue)
7664 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7665 epilogue_size_needed);
7666 else
7667 {
7668 if (issetmem)
7669 expand_setmem_epilogue (dst, destreg, promoted_val,
7670 vec_promoted_val, count_exp,
7671 epilogue_size_needed);
7672 else
7673 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7674 epilogue_size_needed);
7675 }
7676 }
7677 if (jump_around_label)
7678 emit_label (jump_around_label);
7679 return true;
7680 }
7681
7682
7683 /* Expand the appropriate insns for doing strlen if not just doing
7684    repnz; scasb
7685 
7686    out = result, initialized with the start address
7687    align_rtx = alignment of the address.
7688    scratch = scratch register, initialized with the start address when
7689 	not aligned, otherwise undefined
7690 
7691    This is just the body.  It needs the initializations mentioned above and
7692    some address computation at the end.  These things are done in i386.md.  */
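/* A rough sketch of the strategy used below (descriptive only): first check
   up to three leading bytes one at a time until OUT is 4-byte aligned, then
   scan four bytes per iteration using a zero-byte mask trick, and finally
   locate the exact zero byte within the last word.  */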
7693
7694 static void
7695 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7696 {
7697 int align;
7698 rtx tmp;
7699 rtx_code_label *align_2_label = NULL;
7700 rtx_code_label *align_3_label = NULL;
7701 rtx_code_label *align_4_label = gen_label_rtx ();
7702 rtx_code_label *end_0_label = gen_label_rtx ();
7703 rtx mem;
7704 rtx tmpreg = gen_reg_rtx (SImode);
7705 rtx scratch = gen_reg_rtx (SImode);
7706 rtx cmp;
7707
7708 align = 0;
7709 if (CONST_INT_P (align_rtx))
7710 align = INTVAL (align_rtx);
7711
7712 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7713
7714 /* Is there a known alignment and is it less than 4? */
7715 if (align < 4)
7716 {
7717 rtx scratch1 = gen_reg_rtx (Pmode);
7718 emit_move_insn (scratch1, out);
7719 /* Is there a known alignment and is it not 2? */
7720 if (align != 2)
7721 {
7722 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7723 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7724
7725 /* Leave just the 3 lower bits. */
7726 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7727 NULL_RTX, 0, OPTAB_WIDEN);
7728
7729 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7730 Pmode, 1, align_4_label);
7731 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7732 Pmode, 1, align_2_label);
7733 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7734 Pmode, 1, align_3_label);
7735 }
7736 else
7737 {
7738 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
7739 	     check if it is aligned to a 4-byte boundary.  */
7740
7741 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7742 NULL_RTX, 0, OPTAB_WIDEN);
7743
7744 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7745 Pmode, 1, align_4_label);
7746 }
7747
7748 mem = change_address (src, QImode, out);
7749
7750 /* Now compare the bytes. */
7751
7752       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7753 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7754 QImode, 1, end_0_label);
7755
7756 /* Increment the address. */
7757 emit_insn (gen_add2_insn (out, const1_rtx));
7758
7759       /* Not needed with an alignment of 2.  */
7760 if (align != 2)
7761 {
7762 emit_label (align_2_label);
7763
7764 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7765 end_0_label);
7766
7767 emit_insn (gen_add2_insn (out, const1_rtx));
7768
7769 emit_label (align_3_label);
7770 }
7771
7772 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7773 end_0_label);
7774
7775 emit_insn (gen_add2_insn (out, const1_rtx));
7776 }
7777
7778   /* Generate a loop to check 4 bytes at a time.  It is not a good idea
7779      to align this loop.  It only makes the code bigger and does not
7780      help to speed it up.  */
7781 emit_label (align_4_label);
7782
7783 mem = change_address (src, SImode, out);
7784 emit_move_insn (scratch, mem);
7785 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7786
7787   /* This formula yields a nonzero result iff one of the bytes is zero.
7788      This saves three branches inside the loop and many cycles.  */
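  /* Concretely, the insns below compute
       (x - 0x01010101) & ~x & 0x80808080
     which is nonzero exactly when some byte of X is zero.  A worked
     example (numbers for illustration only): x = 0x12003456 gives
     0x10ff3355 & 0xedffcba9 & 0x80808080 = 0x00800000, flagging the
     zero byte, while x = 0x12343456 (no zero byte) gives 0.  */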
7789
7790 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7791 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7792 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7793 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7794 gen_int_mode (0x80808080, SImode)));
7795 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7796 align_4_label);
7797
7798 if (TARGET_CMOVE)
7799 {
7800 rtx reg = gen_reg_rtx (SImode);
7801 rtx reg2 = gen_reg_rtx (Pmode);
7802 emit_move_insn (reg, tmpreg);
7803 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7804
7805 /* If zero is not in the first two bytes, move two bytes forward. */
7806 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7807 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7808 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7809 emit_insn (gen_rtx_SET (tmpreg,
7810 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7811 reg,
7812 tmpreg)));
7813       /* Emit the lea manually to avoid clobbering the flags.  */
7814 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
7815
7816 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7817 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7818 emit_insn (gen_rtx_SET (out,
7819 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7820 reg2,
7821 out)));
7822 }
7823 else
7824 {
7825 rtx_code_label *end_2_label = gen_label_rtx ();
7826 /* Is zero in the first two bytes? */
7827
7828 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7829 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7830 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7831 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7832 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7833 pc_rtx);
7834 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7835 JUMP_LABEL (tmp) = end_2_label;
7836
7837 /* Not in the first two. Move two bytes forward. */
7838 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7839 emit_insn (gen_add2_insn (out, const2_rtx));
7840
7841 emit_label (end_2_label);
7842
7843 }
7844
7845   /* Avoid a branch when fixing up the byte position.  */
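  /* Roughly what happens here (descriptive note, not original): doubling the
     low byte of TMPREG moves its bit 7 -- set iff the lower byte of the
     remaining pair was the zero byte -- into the carry flag, and the
     subtract-with-borrow then subtracts either 3 or 4 from OUT, undoing the
     earlier increments so that OUT ends up pointing at the zero byte.  */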
7846 tmpreg = gen_lowpart (QImode, tmpreg);
7847 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7848 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7849 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7850 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7851
7852 emit_label (end_0_label);
7853 }
7854
7855 /* Expand strlen. */
7856
7857 bool
7858 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7859 {
7860 if (TARGET_UNROLL_STRLEN
7861 && TARGET_INLINE_ALL_STRINGOPS
7862 && eoschar == const0_rtx
7863 && optimize > 1)
7864 {
7865       /* The generic case of the strlen expander is long.  Avoid expanding
7866 	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
7867 rtx addr = force_reg (Pmode, XEXP (src, 0));
7868       /* It seems that some optimizers do not combine a call like
7869 	 foo(strlen(bar), strlen(bar));
7870 	 when the move and the subtraction are done here.  They do calculate
7871 	 the length just once when these instructions are done inside
7872 	 output_strlen_unroll().  But since &bar[strlen(bar)] is often used
7873 	 and this uses one fewer register for the lifetime of
7874 	 output_strlen_unroll(), this is better.  */
7875
7876 emit_move_insn (out, addr);
7877
7878 ix86_expand_strlensi_unroll_1 (out, src, align);
7879
7880 /* strlensi_unroll_1 returns the address of the zero at the end of
7881 the string, like memchr(), so compute the length by subtracting
7882 the start address. */
7883 emit_insn (gen_sub2_insn (out, addr));
7884 return true;
7885 }
7886 else
7887 return false;
7888 }
7889
7890 /* For a given symbol (function), construct code to compute the address of
7891    its PLT entry in the large x86-64 PIC model.  */
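/* A minimal sketch of the resulting sequence (assuming the usual large-PIC
   setup; illustrative only):

     movabs $symbol@PLTOFF, %tmp
     add    %pic_reg, %tmp

   i.e. the PLT offset is materialized as a 64-bit immediate and added to
   the PIC base register.  */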
7892
7893 static rtx
7894 construct_plt_address (rtx symbol)
7895 {
7896 rtx tmp, unspec;
7897
7898 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7899 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7900 gcc_assert (Pmode == DImode);
7901
7902 tmp = gen_reg_rtx (Pmode);
7903 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7904
7905 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7906 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7907 return tmp;
7908 }
7909
7910 /* Additional registers that are clobbered by SYSV calls. */
7911
7912 static int const x86_64_ms_sysv_extra_clobbered_registers
7913 [NUM_X86_64_MS_CLOBBERED_REGS] =
7914 {
7915 SI_REG, DI_REG,
7916 XMM6_REG, XMM7_REG,
7917 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7918 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7919 };
7920
7921 rtx_insn *
7922 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7923 rtx callarg2,
7924 rtx pop, bool sibcall)
7925 {
7926 rtx vec[3];
7927 rtx use = NULL, call;
7928 unsigned int vec_len = 0;
7929 tree fndecl;
7930
7931 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7932 {
7933 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7934 if (fndecl
7935 && (lookup_attribute ("interrupt",
7936 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7937 error ("interrupt service routine cannot be called directly");
7938 }
7939 else
7940 fndecl = NULL_TREE;
7941
7942 if (pop == const0_rtx)
7943 pop = NULL;
7944 gcc_assert (!TARGET_64BIT || !pop);
7945
7946 rtx addr = XEXP (fnaddr, 0);
7947 if (TARGET_MACHO && !TARGET_64BIT)
7948 {
7949 #if TARGET_MACHO
7950 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7951 fnaddr = machopic_indirect_call_target (fnaddr);
7952 #endif
7953 }
7954 else
7955 {
7956 /* Static functions and indirect calls don't need the pic register. Also,
7957 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7958 it an indirect call. */
7959 if (flag_pic
7960 && GET_CODE (addr) == SYMBOL_REF
7961 && !SYMBOL_REF_LOCAL_P (addr))
7962 {
7963 if (flag_plt
7964 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7965 || !lookup_attribute ("noplt",
7966 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7967 {
7968 if (!TARGET_64BIT
7969 || (ix86_cmodel == CM_LARGE_PIC
7970 && DEFAULT_ABI != MS_ABI))
7971 {
7972 use_reg (&use, gen_rtx_REG (Pmode,
7973 REAL_PIC_OFFSET_TABLE_REGNUM));
7974 if (ix86_use_pseudo_pic_reg ())
7975 emit_move_insn (gen_rtx_REG (Pmode,
7976 REAL_PIC_OFFSET_TABLE_REGNUM),
7977 pic_offset_table_rtx);
7978 }
7979 }
7980 else if (!TARGET_PECOFF && !TARGET_MACHO)
7981 {
7982 if (TARGET_64BIT
7983 && ix86_cmodel == CM_LARGE_PIC
7984 && DEFAULT_ABI != MS_ABI)
7985 {
7986 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
7987 UNSPEC_GOT);
7988 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7989 fnaddr = force_reg (Pmode, fnaddr);
7990 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
7991 }
7992 else if (TARGET_64BIT)
7993 {
7994 fnaddr = gen_rtx_UNSPEC (Pmode,
7995 gen_rtvec (1, addr),
7996 UNSPEC_GOTPCREL);
7997 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7998 }
7999 else
8000 {
8001 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8002 UNSPEC_GOT);
8003 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8004 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8005 fnaddr);
8006 }
8007 fnaddr = gen_const_mem (Pmode, fnaddr);
8008 	  /* Pmode may not be the same as word_mode for x32, which
8009 	     doesn't support an indirect branch via a 32-bit memory slot.
8010 	     Since the x32 GOT slot is 64 bit with zero upper 32 bits,
8011 	     an indirect branch via the x32 GOT slot is OK.  */
8012 if (GET_MODE (fnaddr) != word_mode)
8013 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8014 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8015 }
8016 }
8017 }
8018
8019 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8020 parameters passed in vector registers. */
8021 if (TARGET_64BIT
8022 && (INTVAL (callarg2) > 0
8023 || (INTVAL (callarg2) == 0
8024 && (TARGET_SSE || !flag_skip_rax_setup))))
8025 {
8026 rtx al = gen_rtx_REG (QImode, AX_REG);
8027 emit_move_insn (al, callarg2);
8028 use_reg (&use, al);
8029 }
8030
8031 if (ix86_cmodel == CM_LARGE_PIC
8032 && !TARGET_PECOFF
8033 && MEM_P (fnaddr)
8034 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8035 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8036 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8037   /* Since the x32 GOT slot is 64 bit with zero upper 32 bits, an indirect
8038      branch via the x32 GOT slot is OK.  */
8039 else if (!(TARGET_X32
8040 && MEM_P (fnaddr)
8041 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8042 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8043 && (sibcall
8044 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8045 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8046 {
8047 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8048 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8049 }
8050
8051 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8052
8053 if (retval)
8054 call = gen_rtx_SET (retval, call);
8055 vec[vec_len++] = call;
8056
8057 if (pop)
8058 {
8059 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8060 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8061 vec[vec_len++] = pop;
8062 }
8063
8064 if (cfun->machine->no_caller_saved_registers
8065 && (!fndecl
8066 || (!TREE_THIS_VOLATILE (fndecl)
8067 && !lookup_attribute ("no_caller_saved_registers",
8068 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8069 {
8070 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8071 bool is_64bit_ms_abi = (TARGET_64BIT
8072 && ix86_function_abi (fndecl) == MS_ABI);
8073 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8074
8075       /* If there are no caller-saved registers, add all registers
8076 	 that are clobbered by the call, since the call returns.  */
8077 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8078 if (!fixed_regs[i]
8079 && (ix86_call_used_regs[i] == 1
8080 || (ix86_call_used_regs[i] & c_mask))
8081 && !STACK_REGNO_P (i)
8082 && !MMX_REGNO_P (i))
8083 clobber_reg (&use,
8084 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8085 }
8086 else if (TARGET_64BIT_MS_ABI
8087 && (!callarg2 || INTVAL (callarg2) != -2))
8088 {
8089 unsigned i;
8090
8091 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8092 {
8093 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8094 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8095
8096 clobber_reg (&use, gen_rtx_REG (mode, regno));
8097 }
8098
8099 /* Set here, but it may get cleared later. */
8100 if (TARGET_CALL_MS2SYSV_XLOGUES)
8101 {
8102 if (!TARGET_SSE)
8103 ;
8104
8105 /* Don't break hot-patched functions. */
8106 else if (ix86_function_ms_hook_prologue (current_function_decl))
8107 ;
8108
8109 /* TODO: Cases not yet examined. */
8110 else if (flag_split_stack)
8111 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8112
8113 else
8114 {
8115 gcc_assert (!reload_completed);
8116 cfun->machine->call_ms2sysv = true;
8117 }
8118 }
8119 }
8120
8121 if (TARGET_MACHO && TARGET_64BIT && !sibcall
8122 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
8123 || !fndecl || TREE_PUBLIC (fndecl)))
8124 {
8125       /* We allow public functions defined in a TU to bind locally for PIC
8126 	 code (the default) on 64-bit Mach-O.
8127 	 If such functions are not inlined, we cannot tell at compile time if
8128 	 they will be called via the lazy symbol resolver (this can depend on
8129 	 options given at link time).  Therefore, we must assume that the lazy
8130 	 resolver could be used, which clobbers R11 and R10.  */
8131 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
8132 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
8133 }
8134
8135 if (vec_len > 1)
8136 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8137 rtx_insn *call_insn = emit_call_insn (call);
8138 if (use)
8139 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8140
8141 return call_insn;
8142 }
8143
8144 /* Split a simple return that pops POPC bytes from the stack into an
8145    indirect branch with a stack adjustment.  */
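/* A rough sketch of the emitted sequence (illustrative only):

     pop  %ecx         # return address into ECX
     add  $POPC, %esp  # drop the POPC bytes of stack arguments
     jmp  *%ecx        # return

   plus the CFA notes needed to keep unwinding correct.  */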
8146
8147 void
8148 ix86_split_simple_return_pop_internal (rtx popc)
8149 {
8150 struct machine_function *m = cfun->machine;
8151 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8152 rtx_insn *insn;
8153
8154   /* There is no "pascal" calling convention in any 64-bit ABI.  */
8155 gcc_assert (!TARGET_64BIT);
8156
8157 insn = emit_insn (gen_pop (ecx));
8158 m->fs.cfa_offset -= UNITS_PER_WORD;
8159 m->fs.sp_offset -= UNITS_PER_WORD;
8160
8161 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8162 x = gen_rtx_SET (stack_pointer_rtx, x);
8163 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8164 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8165 RTX_FRAME_RELATED_P (insn) = 1;
8166
8167 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8168 x = gen_rtx_SET (stack_pointer_rtx, x);
8169 insn = emit_insn (x);
8170 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8171 RTX_FRAME_RELATED_P (insn) = 1;
8172
8173   /* Now the return address is in ECX.  */
8174 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8175 }
8176
8177 /* Errors in the source file can cause expand_expr to return const0_rtx
8178 where we expect a vector. To avoid crashing, use one of the vector
8179 clear instructions. */
8180
8181 static rtx
8182 safe_vector_operand (rtx x, machine_mode mode)
8183 {
8184 if (x == const0_rtx)
8185 x = CONST0_RTX (mode);
8186 return x;
8187 }
8188
8189 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8190
8191 static rtx
8192 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8193 {
8194 rtx pat;
8195 tree arg0 = CALL_EXPR_ARG (exp, 0);
8196 tree arg1 = CALL_EXPR_ARG (exp, 1);
8197 rtx op0 = expand_normal (arg0);
8198 rtx op1 = expand_normal (arg1);
8199 machine_mode tmode = insn_data[icode].operand[0].mode;
8200 machine_mode mode0 = insn_data[icode].operand[1].mode;
8201 machine_mode mode1 = insn_data[icode].operand[2].mode;
8202
8203 if (VECTOR_MODE_P (mode0))
8204 op0 = safe_vector_operand (op0, mode0);
8205 if (VECTOR_MODE_P (mode1))
8206 op1 = safe_vector_operand (op1, mode1);
8207
8208 if (optimize || !target
8209 || GET_MODE (target) != tmode
8210 || !insn_data[icode].operand[0].predicate (target, tmode))
8211 target = gen_reg_rtx (tmode);
8212
8213 if (GET_MODE (op1) == SImode && mode1 == TImode)
8214 {
8215 rtx x = gen_reg_rtx (V4SImode);
8216 emit_insn (gen_sse2_loadd (x, op1));
8217 op1 = gen_lowpart (TImode, x);
8218 }
8219
8220 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8221 op0 = copy_to_mode_reg (mode0, op0);
8222 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8223 op1 = copy_to_mode_reg (mode1, op1);
8224
8225 pat = GEN_FCN (icode) (target, op0, op1);
8226 if (! pat)
8227 return 0;
8228
8229 emit_insn (pat);
8230
8231 return target;
8232 }
8233
8234 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8235
8236 static rtx
8237 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8238 enum ix86_builtin_func_type m_type,
8239 enum rtx_code sub_code)
8240 {
8241 rtx pat;
8242 int i;
8243 int nargs;
8244 bool comparison_p = false;
8245 bool tf_p = false;
8246 bool last_arg_constant = false;
8247 int num_memory = 0;
8248 struct {
8249 rtx op;
8250 machine_mode mode;
8251 } args[4];
8252
8253 machine_mode tmode = insn_data[icode].operand[0].mode;
8254
8255 switch (m_type)
8256 {
8257 case MULTI_ARG_4_DF2_DI_I:
8258 case MULTI_ARG_4_DF2_DI_I1:
8259 case MULTI_ARG_4_SF2_SI_I:
8260 case MULTI_ARG_4_SF2_SI_I1:
8261 nargs = 4;
8262 last_arg_constant = true;
8263 break;
8264
8265 case MULTI_ARG_3_SF:
8266 case MULTI_ARG_3_DF:
8267 case MULTI_ARG_3_SF2:
8268 case MULTI_ARG_3_DF2:
8269 case MULTI_ARG_3_DI:
8270 case MULTI_ARG_3_SI:
8271 case MULTI_ARG_3_SI_DI:
8272 case MULTI_ARG_3_HI:
8273 case MULTI_ARG_3_HI_SI:
8274 case MULTI_ARG_3_QI:
8275 case MULTI_ARG_3_DI2:
8276 case MULTI_ARG_3_SI2:
8277 case MULTI_ARG_3_HI2:
8278 case MULTI_ARG_3_QI2:
8279 nargs = 3;
8280 break;
8281
8282 case MULTI_ARG_2_SF:
8283 case MULTI_ARG_2_DF:
8284 case MULTI_ARG_2_DI:
8285 case MULTI_ARG_2_SI:
8286 case MULTI_ARG_2_HI:
8287 case MULTI_ARG_2_QI:
8288 nargs = 2;
8289 break;
8290
8291 case MULTI_ARG_2_DI_IMM:
8292 case MULTI_ARG_2_SI_IMM:
8293 case MULTI_ARG_2_HI_IMM:
8294 case MULTI_ARG_2_QI_IMM:
8295 nargs = 2;
8296 last_arg_constant = true;
8297 break;
8298
8299 case MULTI_ARG_1_SF:
8300 case MULTI_ARG_1_DF:
8301 case MULTI_ARG_1_SF2:
8302 case MULTI_ARG_1_DF2:
8303 case MULTI_ARG_1_DI:
8304 case MULTI_ARG_1_SI:
8305 case MULTI_ARG_1_HI:
8306 case MULTI_ARG_1_QI:
8307 case MULTI_ARG_1_SI_DI:
8308 case MULTI_ARG_1_HI_DI:
8309 case MULTI_ARG_1_HI_SI:
8310 case MULTI_ARG_1_QI_DI:
8311 case MULTI_ARG_1_QI_SI:
8312 case MULTI_ARG_1_QI_HI:
8313 nargs = 1;
8314 break;
8315
8316 case MULTI_ARG_2_DI_CMP:
8317 case MULTI_ARG_2_SI_CMP:
8318 case MULTI_ARG_2_HI_CMP:
8319 case MULTI_ARG_2_QI_CMP:
8320 nargs = 2;
8321 comparison_p = true;
8322 break;
8323
8324 case MULTI_ARG_2_SF_TF:
8325 case MULTI_ARG_2_DF_TF:
8326 case MULTI_ARG_2_DI_TF:
8327 case MULTI_ARG_2_SI_TF:
8328 case MULTI_ARG_2_HI_TF:
8329 case MULTI_ARG_2_QI_TF:
8330 nargs = 2;
8331 tf_p = true;
8332 break;
8333
8334 default:
8335 gcc_unreachable ();
8336 }
8337
8338 if (optimize || !target
8339 || GET_MODE (target) != tmode
8340 || !insn_data[icode].operand[0].predicate (target, tmode))
8341 target = gen_reg_rtx (tmode);
8342 else if (memory_operand (target, tmode))
8343 num_memory++;
8344
8345 gcc_assert (nargs <= 4);
8346
8347 for (i = 0; i < nargs; i++)
8348 {
8349 tree arg = CALL_EXPR_ARG (exp, i);
8350 rtx op = expand_normal (arg);
8351 int adjust = (comparison_p) ? 1 : 0;
8352 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8353
8354 if (last_arg_constant && i == nargs - 1)
8355 {
8356 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8357 {
8358 enum insn_code new_icode = icode;
8359 switch (icode)
8360 {
8361 case CODE_FOR_xop_vpermil2v2df3:
8362 case CODE_FOR_xop_vpermil2v4sf3:
8363 case CODE_FOR_xop_vpermil2v4df3:
8364 case CODE_FOR_xop_vpermil2v8sf3:
8365 error ("the last argument must be a 2-bit immediate");
8366 return gen_reg_rtx (tmode);
8367 case CODE_FOR_xop_rotlv2di3:
8368 new_icode = CODE_FOR_rotlv2di3;
8369 goto xop_rotl;
8370 case CODE_FOR_xop_rotlv4si3:
8371 new_icode = CODE_FOR_rotlv4si3;
8372 goto xop_rotl;
8373 case CODE_FOR_xop_rotlv8hi3:
8374 new_icode = CODE_FOR_rotlv8hi3;
8375 goto xop_rotl;
8376 case CODE_FOR_xop_rotlv16qi3:
8377 new_icode = CODE_FOR_rotlv16qi3;
8378 xop_rotl:
8379 if (CONST_INT_P (op))
8380 {
8381 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8382 op = GEN_INT (INTVAL (op) & mask);
8383 gcc_checking_assert
8384 (insn_data[icode].operand[i + 1].predicate (op, mode));
8385 }
8386 else
8387 {
8388 gcc_checking_assert
8389 (nargs == 2
8390 && insn_data[new_icode].operand[0].mode == tmode
8391 && insn_data[new_icode].operand[1].mode == tmode
8392 && insn_data[new_icode].operand[2].mode == mode
8393 && insn_data[new_icode].operand[0].predicate
8394 == insn_data[icode].operand[0].predicate
8395 && insn_data[new_icode].operand[1].predicate
8396 == insn_data[icode].operand[1].predicate);
8397 icode = new_icode;
8398 goto non_constant;
8399 }
8400 break;
8401 default:
8402 gcc_unreachable ();
8403 }
8404 }
8405 }
8406 else
8407 {
8408 non_constant:
8409 if (VECTOR_MODE_P (mode))
8410 op = safe_vector_operand (op, mode);
8411
8412 /* If we aren't optimizing, only allow one memory operand to be
8413 generated. */
8414 if (memory_operand (op, mode))
8415 num_memory++;
8416
8417 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8418
8419 if (optimize
8420 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8421 || num_memory > 1)
8422 op = force_reg (mode, op);
8423 }
8424
8425 args[i].op = op;
8426 args[i].mode = mode;
8427 }
8428
8429 switch (nargs)
8430 {
8431 case 1:
8432 pat = GEN_FCN (icode) (target, args[0].op);
8433 break;
8434
8435 case 2:
8436 if (tf_p)
8437 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8438 GEN_INT ((int)sub_code));
8439 else if (! comparison_p)
8440 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8441 else
8442 {
8443 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8444 args[0].op,
8445 args[1].op);
8446
8447 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8448 }
8449 break;
8450
8451 case 3:
8452 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8453 break;
8454
8455 case 4:
8456 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8457 break;
8458
8459 default:
8460 gcc_unreachable ();
8461 }
8462
8463 if (! pat)
8464 return 0;
8465
8466 emit_insn (pat);
8467 return target;
8468 }
8469
8470 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8471 insns with vec_merge. */
8472
8473 static rtx
8474 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8475 rtx target)
8476 {
8477 rtx pat;
8478 tree arg0 = CALL_EXPR_ARG (exp, 0);
8479 rtx op1, op0 = expand_normal (arg0);
8480 machine_mode tmode = insn_data[icode].operand[0].mode;
8481 machine_mode mode0 = insn_data[icode].operand[1].mode;
8482
8483 if (optimize || !target
8484 || GET_MODE (target) != tmode
8485 || !insn_data[icode].operand[0].predicate (target, tmode))
8486 target = gen_reg_rtx (tmode);
8487
8488 if (VECTOR_MODE_P (mode0))
8489 op0 = safe_vector_operand (op0, mode0);
8490
8491 if ((optimize && !register_operand (op0, mode0))
8492 || !insn_data[icode].operand[1].predicate (op0, mode0))
8493 op0 = copy_to_mode_reg (mode0, op0);
8494
8495 op1 = op0;
8496 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8497 op1 = copy_to_mode_reg (mode0, op1);
8498
8499 pat = GEN_FCN (icode) (target, op0, op1);
8500 if (! pat)
8501 return 0;
8502 emit_insn (pat);
8503 return target;
8504 }
8505
8506 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8507
8508 static rtx
8509 ix86_expand_sse_compare (const struct builtin_description *d,
8510 tree exp, rtx target, bool swap)
8511 {
8512 rtx pat;
8513 tree arg0 = CALL_EXPR_ARG (exp, 0);
8514 tree arg1 = CALL_EXPR_ARG (exp, 1);
8515 rtx op0 = expand_normal (arg0);
8516 rtx op1 = expand_normal (arg1);
8517 rtx op2;
8518 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8519 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8520 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8521 enum rtx_code comparison = d->comparison;
8522
8523 if (VECTOR_MODE_P (mode0))
8524 op0 = safe_vector_operand (op0, mode0);
8525 if (VECTOR_MODE_P (mode1))
8526 op1 = safe_vector_operand (op1, mode1);
8527
8528 /* Swap operands if we have a comparison that isn't available in
8529 hardware. */
8530 if (swap)
8531 std::swap (op0, op1);
8532
8533 if (optimize || !target
8534 || GET_MODE (target) != tmode
8535 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8536 target = gen_reg_rtx (tmode);
8537
8538 if ((optimize && !register_operand (op0, mode0))
8539 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8540 op0 = copy_to_mode_reg (mode0, op0);
8541 if ((optimize && !register_operand (op1, mode1))
8542 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8543 op1 = copy_to_mode_reg (mode1, op1);
8544
8545 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8546 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8547 if (! pat)
8548 return 0;
8549 emit_insn (pat);
8550 return target;
8551 }
8552
8553 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8554
8555 static rtx
8556 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8557 rtx target)
8558 {
8559 rtx pat;
8560 tree arg0 = CALL_EXPR_ARG (exp, 0);
8561 tree arg1 = CALL_EXPR_ARG (exp, 1);
8562 rtx op0 = expand_normal (arg0);
8563 rtx op1 = expand_normal (arg1);
8564 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8565 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8566 enum rtx_code comparison = d->comparison;
8567
8568 if (VECTOR_MODE_P (mode0))
8569 op0 = safe_vector_operand (op0, mode0);
8570 if (VECTOR_MODE_P (mode1))
8571 op1 = safe_vector_operand (op1, mode1);
8572
8573 /* Swap operands if we have a comparison that isn't available in
8574 hardware. */
8575 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8576 std::swap (op0, op1);
8577
8578 target = gen_reg_rtx (SImode);
8579 emit_move_insn (target, const0_rtx);
8580 target = gen_rtx_SUBREG (QImode, target, 0);
8581
8582 if ((optimize && !register_operand (op0, mode0))
8583 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8584 op0 = copy_to_mode_reg (mode0, op0);
8585 if ((optimize && !register_operand (op1, mode1))
8586 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8587 op1 = copy_to_mode_reg (mode1, op1);
8588
8589 pat = GEN_FCN (d->icode) (op0, op1);
8590 if (! pat)
8591 return 0;
8592 emit_insn (pat);
8593 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8594 gen_rtx_fmt_ee (comparison, QImode,
8595 SET_DEST (pat),
8596 const0_rtx)));
8597
8598 return SUBREG_REG (target);
8599 }
8600
8601 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8602
8603 static rtx
8604 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8605 rtx target)
8606 {
8607 rtx pat;
8608 tree arg0 = CALL_EXPR_ARG (exp, 0);
8609 rtx op1, op0 = expand_normal (arg0);
8610 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8611 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8612
8613 if (optimize || target == 0
8614 || GET_MODE (target) != tmode
8615 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8616 target = gen_reg_rtx (tmode);
8617
8618 if (VECTOR_MODE_P (mode0))
8619 op0 = safe_vector_operand (op0, mode0);
8620
8621 if ((optimize && !register_operand (op0, mode0))
8622 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8623 op0 = copy_to_mode_reg (mode0, op0);
8624
8625 op1 = GEN_INT (d->comparison);
8626
8627 pat = GEN_FCN (d->icode) (target, op0, op1);
8628 if (! pat)
8629 return 0;
8630 emit_insn (pat);
8631 return target;
8632 }
8633
8634 static rtx
8635 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8636 tree exp, rtx target)
8637 {
8638 rtx pat;
8639 tree arg0 = CALL_EXPR_ARG (exp, 0);
8640 tree arg1 = CALL_EXPR_ARG (exp, 1);
8641 rtx op0 = expand_normal (arg0);
8642 rtx op1 = expand_normal (arg1);
8643 rtx op2;
8644 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8645 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8646 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8647
8648 if (optimize || target == 0
8649 || GET_MODE (target) != tmode
8650 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8651 target = gen_reg_rtx (tmode);
8652
8653 op0 = safe_vector_operand (op0, mode0);
8654 op1 = safe_vector_operand (op1, mode1);
8655
8656 if ((optimize && !register_operand (op0, mode0))
8657 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8658 op0 = copy_to_mode_reg (mode0, op0);
8659 if ((optimize && !register_operand (op1, mode1))
8660 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8661 op1 = copy_to_mode_reg (mode1, op1);
8662
8663 op2 = GEN_INT (d->comparison);
8664
8665 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8666 if (! pat)
8667 return 0;
8668 emit_insn (pat);
8669 return target;
8670 }
8671
8672 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8673
8674 static rtx
8675 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8676 rtx target)
8677 {
8678 rtx pat;
8679 tree arg0 = CALL_EXPR_ARG (exp, 0);
8680 tree arg1 = CALL_EXPR_ARG (exp, 1);
8681 rtx op0 = expand_normal (arg0);
8682 rtx op1 = expand_normal (arg1);
8683 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8684 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8685 enum rtx_code comparison = d->comparison;
8686
8687 if (VECTOR_MODE_P (mode0))
8688 op0 = safe_vector_operand (op0, mode0);
8689 if (VECTOR_MODE_P (mode1))
8690 op1 = safe_vector_operand (op1, mode1);
8691
8692 target = gen_reg_rtx (SImode);
8693 emit_move_insn (target, const0_rtx);
8694 target = gen_rtx_SUBREG (QImode, target, 0);
8695
8696 if ((optimize && !register_operand (op0, mode0))
8697 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8698 op0 = copy_to_mode_reg (mode0, op0);
8699 if ((optimize && !register_operand (op1, mode1))
8700 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8701 op1 = copy_to_mode_reg (mode1, op1);
8702
8703 pat = GEN_FCN (d->icode) (op0, op1);
8704 if (! pat)
8705 return 0;
8706 emit_insn (pat);
8707 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8708 gen_rtx_fmt_ee (comparison, QImode,
8709 SET_DEST (pat),
8710 const0_rtx)));
8711
8712 return SUBREG_REG (target);
8713 }
8714
8715 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8716
8717 static rtx
8718 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8719 tree exp, rtx target)
8720 {
8721 rtx pat;
8722 tree arg0 = CALL_EXPR_ARG (exp, 0);
8723 tree arg1 = CALL_EXPR_ARG (exp, 1);
8724 tree arg2 = CALL_EXPR_ARG (exp, 2);
8725 tree arg3 = CALL_EXPR_ARG (exp, 3);
8726 tree arg4 = CALL_EXPR_ARG (exp, 4);
8727 rtx scratch0, scratch1;
8728 rtx op0 = expand_normal (arg0);
8729 rtx op1 = expand_normal (arg1);
8730 rtx op2 = expand_normal (arg2);
8731 rtx op3 = expand_normal (arg3);
8732 rtx op4 = expand_normal (arg4);
8733 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8734
8735 tmode0 = insn_data[d->icode].operand[0].mode;
8736 tmode1 = insn_data[d->icode].operand[1].mode;
8737 modev2 = insn_data[d->icode].operand[2].mode;
8738 modei3 = insn_data[d->icode].operand[3].mode;
8739 modev4 = insn_data[d->icode].operand[4].mode;
8740 modei5 = insn_data[d->icode].operand[5].mode;
8741 modeimm = insn_data[d->icode].operand[6].mode;
8742
8743 if (VECTOR_MODE_P (modev2))
8744 op0 = safe_vector_operand (op0, modev2);
8745 if (VECTOR_MODE_P (modev4))
8746 op2 = safe_vector_operand (op2, modev4);
8747
8748 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8749 op0 = copy_to_mode_reg (modev2, op0);
8750 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8751 op1 = copy_to_mode_reg (modei3, op1);
8752 if ((optimize && !register_operand (op2, modev4))
8753 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8754 op2 = copy_to_mode_reg (modev4, op2);
8755 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8756 op3 = copy_to_mode_reg (modei5, op3);
8757
8758 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8759 {
8760 error ("the fifth argument must be an 8-bit immediate");
8761 return const0_rtx;
8762 }
8763
8764 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8765 {
8766 if (optimize || !target
8767 || GET_MODE (target) != tmode0
8768 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8769 target = gen_reg_rtx (tmode0);
8770
8771 scratch1 = gen_reg_rtx (tmode1);
8772
8773 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8774 }
8775 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8776 {
8777 if (optimize || !target
8778 || GET_MODE (target) != tmode1
8779 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8780 target = gen_reg_rtx (tmode1);
8781
8782 scratch0 = gen_reg_rtx (tmode0);
8783
8784 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8785 }
8786 else
8787 {
8788 gcc_assert (d->flag);
8789
8790 scratch0 = gen_reg_rtx (tmode0);
8791 scratch1 = gen_reg_rtx (tmode1);
8792
8793 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8794 }
8795
8796 if (! pat)
8797 return 0;
8798
8799 emit_insn (pat);
8800
8801 if (d->flag)
8802 {
8803 target = gen_reg_rtx (SImode);
8804 emit_move_insn (target, const0_rtx);
8805 target = gen_rtx_SUBREG (QImode, target, 0);
8806
8807 emit_insn
8808 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8809 gen_rtx_fmt_ee (EQ, QImode,
8810 gen_rtx_REG ((machine_mode) d->flag,
8811 FLAGS_REG),
8812 const0_rtx)));
8813 return SUBREG_REG (target);
8814 }
8815 else
8816 return target;
8817 }
8818
8819
8820 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8821
8822 static rtx
8823 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8824 tree exp, rtx target)
8825 {
8826 rtx pat;
8827 tree arg0 = CALL_EXPR_ARG (exp, 0);
8828 tree arg1 = CALL_EXPR_ARG (exp, 1);
8829 tree arg2 = CALL_EXPR_ARG (exp, 2);
8830 rtx scratch0, scratch1;
8831 rtx op0 = expand_normal (arg0);
8832 rtx op1 = expand_normal (arg1);
8833 rtx op2 = expand_normal (arg2);
8834 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8835
8836 tmode0 = insn_data[d->icode].operand[0].mode;
8837 tmode1 = insn_data[d->icode].operand[1].mode;
8838 modev2 = insn_data[d->icode].operand[2].mode;
8839 modev3 = insn_data[d->icode].operand[3].mode;
8840 modeimm = insn_data[d->icode].operand[4].mode;
8841
8842 if (VECTOR_MODE_P (modev2))
8843 op0 = safe_vector_operand (op0, modev2);
8844 if (VECTOR_MODE_P (modev3))
8845 op1 = safe_vector_operand (op1, modev3);
8846
8847 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8848 op0 = copy_to_mode_reg (modev2, op0);
8849 if ((optimize && !register_operand (op1, modev3))
8850 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8851 op1 = copy_to_mode_reg (modev3, op1);
8852
8853 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8854 {
8855 error ("the third argument must be an 8-bit immediate");
8856 return const0_rtx;
8857 }
8858
8859 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8860 {
8861 if (optimize || !target
8862 || GET_MODE (target) != tmode0
8863 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8864 target = gen_reg_rtx (tmode0);
8865
8866 scratch1 = gen_reg_rtx (tmode1);
8867
8868 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8869 }
8870 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8871 {
8872 if (optimize || !target
8873 || GET_MODE (target) != tmode1
8874 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8875 target = gen_reg_rtx (tmode1);
8876
8877 scratch0 = gen_reg_rtx (tmode0);
8878
8879 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8880 }
8881 else
8882 {
8883 gcc_assert (d->flag);
8884
8885 scratch0 = gen_reg_rtx (tmode0);
8886 scratch1 = gen_reg_rtx (tmode1);
8887
8888 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8889 }
8890
8891 if (! pat)
8892 return 0;
8893
8894 emit_insn (pat);
8895
8896 if (d->flag)
8897 {
8898 target = gen_reg_rtx (SImode);
8899 emit_move_insn (target, const0_rtx);
8900 target = gen_rtx_SUBREG (QImode, target, 0);
8901
8902 emit_insn
8903 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8904 gen_rtx_fmt_ee (EQ, QImode,
8905 gen_rtx_REG ((machine_mode) d->flag,
8906 FLAGS_REG),
8907 const0_rtx)));
8908 return SUBREG_REG (target);
8909 }
8910 else
8911 return target;
8912 }
8913
8914 /* Fix up modeless constants to fit the required mode.  */
8915
8916 static rtx
8917 fixup_modeless_constant (rtx x, machine_mode mode)
8918 {
8919 if (GET_MODE (x) == VOIDmode)
8920 x = convert_to_mode (mode, x, 1);
8921 return x;
8922 }
8923
8924 /* Subroutine of ix86_expand_builtin to take care of insns with
8925    a variable number of operands.  */
8926
8927 static rtx
8928 ix86_expand_args_builtin (const struct builtin_description *d,
8929 tree exp, rtx target)
8930 {
8931 rtx pat, real_target;
8932 unsigned int i, nargs;
8933 unsigned int nargs_constant = 0;
8934 unsigned int mask_pos = 0;
8935 int num_memory = 0;
8936 struct
8937 {
8938 rtx op;
8939 machine_mode mode;
8940 } args[6];
8941 bool second_arg_count = false;
8942 enum insn_code icode = d->icode;
8943 const struct insn_data_d *insn_p = &insn_data[icode];
8944 machine_mode tmode = insn_p->operand[0].mode;
8945 machine_mode rmode = VOIDmode;
8946 bool swap = false;
8947 enum rtx_code comparison = d->comparison;
8948
8949 switch ((enum ix86_builtin_func_type) d->flag)
8950 {
8951 case V2DF_FTYPE_V2DF_ROUND:
8952 case V4DF_FTYPE_V4DF_ROUND:
8953 case V8DF_FTYPE_V8DF_ROUND:
8954 case V4SF_FTYPE_V4SF_ROUND:
8955 case V8SF_FTYPE_V8SF_ROUND:
8956 case V16SF_FTYPE_V16SF_ROUND:
8957 case V4SI_FTYPE_V4SF_ROUND:
8958 case V8SI_FTYPE_V8SF_ROUND:
8959 case V16SI_FTYPE_V16SF_ROUND:
8960 return ix86_expand_sse_round (d, exp, target);
8961 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8962 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8963 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8964 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8965 case INT_FTYPE_V8SF_V8SF_PTEST:
8966 case INT_FTYPE_V4DI_V4DI_PTEST:
8967 case INT_FTYPE_V4DF_V4DF_PTEST:
8968 case INT_FTYPE_V4SF_V4SF_PTEST:
8969 case INT_FTYPE_V2DI_V2DI_PTEST:
8970 case INT_FTYPE_V2DF_V2DF_PTEST:
8971 return ix86_expand_sse_ptest (d, exp, target);
8972 case FLOAT128_FTYPE_FLOAT128:
8973 case FLOAT_FTYPE_FLOAT:
8974 case INT_FTYPE_INT:
8975 case UINT_FTYPE_UINT:
8976 case UINT16_FTYPE_UINT16:
8977 case UINT64_FTYPE_INT:
8978 case UINT64_FTYPE_UINT64:
8979 case INT64_FTYPE_INT64:
8980 case INT64_FTYPE_V4SF:
8981 case INT64_FTYPE_V2DF:
8982 case INT_FTYPE_V16QI:
8983 case INT_FTYPE_V8QI:
8984 case INT_FTYPE_V8SF:
8985 case INT_FTYPE_V4DF:
8986 case INT_FTYPE_V4SF:
8987 case INT_FTYPE_V2DF:
8988 case INT_FTYPE_V32QI:
8989 case V16QI_FTYPE_V16QI:
8990 case V8SI_FTYPE_V8SF:
8991 case V8SI_FTYPE_V4SI:
8992 case V8HI_FTYPE_V8HI:
8993 case V8HI_FTYPE_V16QI:
8994 case V8QI_FTYPE_V8QI:
8995 case V8SF_FTYPE_V8SF:
8996 case V8SF_FTYPE_V8SI:
8997 case V8SF_FTYPE_V4SF:
8998 case V8SF_FTYPE_V8HI:
8999 case V4SI_FTYPE_V4SI:
9000 case V4SI_FTYPE_V16QI:
9001 case V4SI_FTYPE_V4SF:
9002 case V4SI_FTYPE_V8SI:
9003 case V4SI_FTYPE_V8HI:
9004 case V4SI_FTYPE_V4DF:
9005 case V4SI_FTYPE_V2DF:
9006 case V4HI_FTYPE_V4HI:
9007 case V4DF_FTYPE_V4DF:
9008 case V4DF_FTYPE_V4SI:
9009 case V4DF_FTYPE_V4SF:
9010 case V4DF_FTYPE_V2DF:
9011 case V4SF_FTYPE_V4SF:
9012 case V4SF_FTYPE_V4SI:
9013 case V4SF_FTYPE_V8SF:
9014 case V4SF_FTYPE_V4DF:
9015 case V4SF_FTYPE_V8HI:
9016 case V4SF_FTYPE_V2DF:
9017 case V2DI_FTYPE_V2DI:
9018 case V2DI_FTYPE_V16QI:
9019 case V2DI_FTYPE_V8HI:
9020 case V2DI_FTYPE_V4SI:
9021 case V2DF_FTYPE_V2DF:
9022 case V2DF_FTYPE_V4SI:
9023 case V2DF_FTYPE_V4DF:
9024 case V2DF_FTYPE_V4SF:
9025 case V2DF_FTYPE_V2SI:
9026 case V2SI_FTYPE_V2SI:
9027 case V2SI_FTYPE_V4SF:
9028 case V2SI_FTYPE_V2SF:
9029 case V2SI_FTYPE_V2DF:
9030 case V2SF_FTYPE_V2SF:
9031 case V2SF_FTYPE_V2SI:
9032 case V32QI_FTYPE_V32QI:
9033 case V32QI_FTYPE_V16QI:
9034 case V16HI_FTYPE_V16HI:
9035 case V16HI_FTYPE_V8HI:
9036 case V8SI_FTYPE_V8SI:
9037 case V16HI_FTYPE_V16QI:
9038 case V8SI_FTYPE_V16QI:
9039 case V4DI_FTYPE_V16QI:
9040 case V8SI_FTYPE_V8HI:
9041 case V4DI_FTYPE_V8HI:
9042 case V4DI_FTYPE_V4SI:
9043 case V4DI_FTYPE_V2DI:
9044 case UQI_FTYPE_UQI:
9045 case UHI_FTYPE_UHI:
9046 case USI_FTYPE_USI:
9047 case USI_FTYPE_UQI:
9048 case USI_FTYPE_UHI:
9049 case UDI_FTYPE_UDI:
9050 case UHI_FTYPE_V16QI:
9051 case USI_FTYPE_V32QI:
9052 case UDI_FTYPE_V64QI:
9053 case V16QI_FTYPE_UHI:
9054 case V32QI_FTYPE_USI:
9055 case V64QI_FTYPE_UDI:
9056 case V8HI_FTYPE_UQI:
9057 case V16HI_FTYPE_UHI:
9058 case V32HI_FTYPE_USI:
9059 case V4SI_FTYPE_UQI:
9060 case V8SI_FTYPE_UQI:
9061 case V4SI_FTYPE_UHI:
9062 case V8SI_FTYPE_UHI:
9063 case UQI_FTYPE_V8HI:
9064 case UHI_FTYPE_V16HI:
9065 case USI_FTYPE_V32HI:
9066 case UQI_FTYPE_V4SI:
9067 case UQI_FTYPE_V8SI:
9068 case UHI_FTYPE_V16SI:
9069 case UQI_FTYPE_V2DI:
9070 case UQI_FTYPE_V4DI:
9071 case UQI_FTYPE_V8DI:
9072 case V16SI_FTYPE_UHI:
9073 case V2DI_FTYPE_UQI:
9074 case V4DI_FTYPE_UQI:
9075 case V16SI_FTYPE_INT:
9076 case V16SF_FTYPE_V8SF:
9077 case V16SI_FTYPE_V8SI:
9078 case V16SF_FTYPE_V4SF:
9079 case V16SI_FTYPE_V4SI:
9080 case V16SI_FTYPE_V16SF:
9081 case V16SI_FTYPE_V16SI:
9082 case V64QI_FTYPE_V64QI:
9083 case V32HI_FTYPE_V32HI:
9084 case V16SF_FTYPE_V16SF:
9085 case V8DI_FTYPE_UQI:
9086 case V8DI_FTYPE_V8DI:
9087 case V8DF_FTYPE_V4DF:
9088 case V8DF_FTYPE_V2DF:
9089 case V8DF_FTYPE_V8DF:
9090 case V4DI_FTYPE_V4DI:
9091 case V16HI_FTYPE_V16SF:
9092 case V8HI_FTYPE_V8SF:
9093 case V8HI_FTYPE_V4SF:
9094 nargs = 1;
9095 break;
9096 case V4SF_FTYPE_V4SF_VEC_MERGE:
9097 case V2DF_FTYPE_V2DF_VEC_MERGE:
9098 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9099 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9100 case V16QI_FTYPE_V16QI_V16QI:
9101 case V16QI_FTYPE_V8HI_V8HI:
9102 case V16SF_FTYPE_V16SF_V16SF:
9103 case V8QI_FTYPE_V8QI_V8QI:
9104 case V8QI_FTYPE_V4HI_V4HI:
9105 case V8HI_FTYPE_V8HI_V8HI:
9106 case V8HI_FTYPE_V16QI_V16QI:
9107 case V8HI_FTYPE_V4SI_V4SI:
9108 case V8SF_FTYPE_V8SF_V8SF:
9109 case V8SF_FTYPE_V8SF_V8SI:
9110 case V8DF_FTYPE_V8DF_V8DF:
9111 case V4SI_FTYPE_V4SI_V4SI:
9112 case V4SI_FTYPE_V8HI_V8HI:
9113 case V4SI_FTYPE_V2DF_V2DF:
9114 case V4HI_FTYPE_V4HI_V4HI:
9115 case V4HI_FTYPE_V8QI_V8QI:
9116 case V4HI_FTYPE_V2SI_V2SI:
9117 case V4DF_FTYPE_V4DF_V4DF:
9118 case V4DF_FTYPE_V4DF_V4DI:
9119 case V4SF_FTYPE_V4SF_V4SF:
9120 case V4SF_FTYPE_V4SF_V4SI:
9121 case V4SF_FTYPE_V4SF_V2SI:
9122 case V4SF_FTYPE_V4SF_V2DF:
9123 case V4SF_FTYPE_V4SF_UINT:
9124 case V4SF_FTYPE_V4SF_DI:
9125 case V4SF_FTYPE_V4SF_SI:
9126 case V2DI_FTYPE_V2DI_V2DI:
9127 case V2DI_FTYPE_V16QI_V16QI:
9128 case V2DI_FTYPE_V4SI_V4SI:
9129 case V2DI_FTYPE_V2DI_V16QI:
9130 case V2SI_FTYPE_V2SI_V2SI:
9131 case V2SI_FTYPE_V4HI_V4HI:
9132 case V2SI_FTYPE_V2SF_V2SF:
9133 case V2DF_FTYPE_V2DF_V2DF:
9134 case V2DF_FTYPE_V2DF_V4SF:
9135 case V2DF_FTYPE_V2DF_V2DI:
9136 case V2DF_FTYPE_V2DF_DI:
9137 case V2DF_FTYPE_V2DF_SI:
9138 case V2DF_FTYPE_V2DF_UINT:
9139 case V2SF_FTYPE_V2SF_V2SF:
9140 case V1DI_FTYPE_V1DI_V1DI:
9141 case V1DI_FTYPE_V8QI_V8QI:
9142 case V1DI_FTYPE_V2SI_V2SI:
9143 case V32QI_FTYPE_V16HI_V16HI:
9144 case V16HI_FTYPE_V8SI_V8SI:
9145 case V64QI_FTYPE_V64QI_V64QI:
9146 case V32QI_FTYPE_V32QI_V32QI:
9147 case V16HI_FTYPE_V32QI_V32QI:
9148 case V16HI_FTYPE_V16HI_V16HI:
9149 case V8SI_FTYPE_V4DF_V4DF:
9150 case V8SI_FTYPE_V8SI_V8SI:
9151 case V8SI_FTYPE_V16HI_V16HI:
9152 case V4DI_FTYPE_V4DI_V4DI:
9153 case V4DI_FTYPE_V8SI_V8SI:
9154 case V8DI_FTYPE_V64QI_V64QI:
9155 if (comparison == UNKNOWN)
9156 return ix86_expand_binop_builtin (icode, exp, target);
9157 nargs = 2;
9158 break;
9159 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9160 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9161 gcc_assert (comparison != UNKNOWN);
9162 nargs = 2;
9163 swap = true;
9164 break;
9165 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9166 case V16HI_FTYPE_V16HI_SI_COUNT:
9167 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9168 case V8SI_FTYPE_V8SI_SI_COUNT:
9169 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9170 case V4DI_FTYPE_V4DI_INT_COUNT:
9171 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9172 case V8HI_FTYPE_V8HI_SI_COUNT:
9173 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9174 case V4SI_FTYPE_V4SI_SI_COUNT:
9175 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9176 case V4HI_FTYPE_V4HI_SI_COUNT:
9177 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9178 case V2DI_FTYPE_V2DI_SI_COUNT:
9179 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9180 case V2SI_FTYPE_V2SI_SI_COUNT:
9181 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9182 case V1DI_FTYPE_V1DI_SI_COUNT:
9183 nargs = 2;
9184 second_arg_count = true;
9185 break;
9186 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9187 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9188 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9189 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9190 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9191 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9192 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9193 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9194 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9195 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9196 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9197 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9198 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9199 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9200 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9201 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9202 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9203 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9204 nargs = 4;
9205 second_arg_count = true;
9206 break;
9207 case UINT64_FTYPE_UINT64_UINT64:
9208 case UINT_FTYPE_UINT_UINT:
9209 case UINT_FTYPE_UINT_USHORT:
9210 case UINT_FTYPE_UINT_UCHAR:
9211 case UINT16_FTYPE_UINT16_INT:
9212 case UINT8_FTYPE_UINT8_INT:
9213 case UQI_FTYPE_UQI_UQI:
9214 case UHI_FTYPE_UHI_UHI:
9215 case USI_FTYPE_USI_USI:
9216 case UDI_FTYPE_UDI_UDI:
9217 case V16SI_FTYPE_V8DF_V8DF:
9218 case V32HI_FTYPE_V16SF_V16SF:
9219 case V16HI_FTYPE_V8SF_V8SF:
9220 case V8HI_FTYPE_V4SF_V4SF:
9221 case V16HI_FTYPE_V16SF_UHI:
9222 case V8HI_FTYPE_V8SF_UQI:
9223 case V8HI_FTYPE_V4SF_UQI:
9224 nargs = 2;
9225 break;
9226 case V2DI_FTYPE_V2DI_INT_CONVERT:
9227 nargs = 2;
9228 rmode = V1TImode;
9229 nargs_constant = 1;
9230 break;
9231 case V4DI_FTYPE_V4DI_INT_CONVERT:
9232 nargs = 2;
9233 rmode = V2TImode;
9234 nargs_constant = 1;
9235 break;
9236 case V8DI_FTYPE_V8DI_INT_CONVERT:
9237 nargs = 2;
9238 rmode = V4TImode;
9239 nargs_constant = 1;
9240 break;
9241 case V8HI_FTYPE_V8HI_INT:
9242 case V8HI_FTYPE_V8SF_INT:
9243 case V16HI_FTYPE_V16SF_INT:
9244 case V8HI_FTYPE_V4SF_INT:
9245 case V8SF_FTYPE_V8SF_INT:
9246 case V4SF_FTYPE_V16SF_INT:
9247 case V16SF_FTYPE_V16SF_INT:
9248 case V4SI_FTYPE_V4SI_INT:
9249 case V4SI_FTYPE_V8SI_INT:
9250 case V4HI_FTYPE_V4HI_INT:
9251 case V4DF_FTYPE_V4DF_INT:
9252 case V4DF_FTYPE_V8DF_INT:
9253 case V4SF_FTYPE_V4SF_INT:
9254 case V4SF_FTYPE_V8SF_INT:
9255 case V2DI_FTYPE_V2DI_INT:
9256 case V2DF_FTYPE_V2DF_INT:
9257 case V2DF_FTYPE_V4DF_INT:
9258 case V16HI_FTYPE_V16HI_INT:
9259 case V8SI_FTYPE_V8SI_INT:
9260 case V16SI_FTYPE_V16SI_INT:
9261 case V4SI_FTYPE_V16SI_INT:
9262 case V4DI_FTYPE_V4DI_INT:
9263 case V2DI_FTYPE_V4DI_INT:
9264 case V4DI_FTYPE_V8DI_INT:
9265 case UQI_FTYPE_UQI_UQI_CONST:
9266 case UHI_FTYPE_UHI_UQI:
9267 case USI_FTYPE_USI_UQI:
9268 case UDI_FTYPE_UDI_UQI:
9269 nargs = 2;
9270 nargs_constant = 1;
9271 break;
9272 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9273 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9274 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9275 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9276 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9277 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9278 case UHI_FTYPE_V16SI_V16SI_UHI:
9279 case UQI_FTYPE_V8DI_V8DI_UQI:
9280 case V16HI_FTYPE_V16SI_V16HI_UHI:
9281 case V16QI_FTYPE_V16SI_V16QI_UHI:
9282 case V16QI_FTYPE_V8DI_V16QI_UQI:
9283 case V16SF_FTYPE_V16SF_V16SF_UHI:
9284 case V16SF_FTYPE_V4SF_V16SF_UHI:
9285 case V16SI_FTYPE_SI_V16SI_UHI:
9286 case V16SI_FTYPE_V16HI_V16SI_UHI:
9287 case V16SI_FTYPE_V16QI_V16SI_UHI:
9288 case V8SF_FTYPE_V4SF_V8SF_UQI:
9289 case V4DF_FTYPE_V2DF_V4DF_UQI:
9290 case V8SI_FTYPE_V4SI_V8SI_UQI:
9291 case V8SI_FTYPE_SI_V8SI_UQI:
9292 case V4SI_FTYPE_V4SI_V4SI_UQI:
9293 case V4SI_FTYPE_SI_V4SI_UQI:
9294 case V4DI_FTYPE_V2DI_V4DI_UQI:
9295 case V4DI_FTYPE_DI_V4DI_UQI:
9296 case V2DI_FTYPE_V2DI_V2DI_UQI:
9297 case V2DI_FTYPE_DI_V2DI_UQI:
9298 case V64QI_FTYPE_V64QI_V64QI_UDI:
9299 case V64QI_FTYPE_V16QI_V64QI_UDI:
9300 case V64QI_FTYPE_QI_V64QI_UDI:
9301 case V32QI_FTYPE_V32QI_V32QI_USI:
9302 case V32QI_FTYPE_V16QI_V32QI_USI:
9303 case V32QI_FTYPE_QI_V32QI_USI:
9304 case V16QI_FTYPE_V16QI_V16QI_UHI:
9305 case V16QI_FTYPE_QI_V16QI_UHI:
9306 case V32HI_FTYPE_V8HI_V32HI_USI:
9307 case V32HI_FTYPE_HI_V32HI_USI:
9308 case V16HI_FTYPE_V8HI_V16HI_UHI:
9309 case V16HI_FTYPE_HI_V16HI_UHI:
9310 case V8HI_FTYPE_V8HI_V8HI_UQI:
9311 case V8HI_FTYPE_HI_V8HI_UQI:
9312 case V8SF_FTYPE_V8HI_V8SF_UQI:
9313 case V4SF_FTYPE_V8HI_V4SF_UQI:
9314 case V8SI_FTYPE_V8SF_V8SI_UQI:
9315 case V4SI_FTYPE_V4SF_V4SI_UQI:
9316 case V4DI_FTYPE_V4SF_V4DI_UQI:
9317 case V2DI_FTYPE_V4SF_V2DI_UQI:
9318 case V4SF_FTYPE_V4DI_V4SF_UQI:
9319 case V4SF_FTYPE_V2DI_V4SF_UQI:
9320 case V4DF_FTYPE_V4DI_V4DF_UQI:
9321 case V2DF_FTYPE_V2DI_V2DF_UQI:
9322 case V16QI_FTYPE_V8HI_V16QI_UQI:
9323 case V16QI_FTYPE_V16HI_V16QI_UHI:
9324 case V16QI_FTYPE_V4SI_V16QI_UQI:
9325 case V16QI_FTYPE_V8SI_V16QI_UQI:
9326 case V8HI_FTYPE_V4SI_V8HI_UQI:
9327 case V8HI_FTYPE_V8SI_V8HI_UQI:
9328 case V16QI_FTYPE_V2DI_V16QI_UQI:
9329 case V16QI_FTYPE_V4DI_V16QI_UQI:
9330 case V8HI_FTYPE_V2DI_V8HI_UQI:
9331 case V8HI_FTYPE_V4DI_V8HI_UQI:
9332 case V4SI_FTYPE_V2DI_V4SI_UQI:
9333 case V4SI_FTYPE_V4DI_V4SI_UQI:
9334 case V32QI_FTYPE_V32HI_V32QI_USI:
9335 case UHI_FTYPE_V16QI_V16QI_UHI:
9336 case USI_FTYPE_V32QI_V32QI_USI:
9337 case UDI_FTYPE_V64QI_V64QI_UDI:
9338 case UQI_FTYPE_V8HI_V8HI_UQI:
9339 case UHI_FTYPE_V16HI_V16HI_UHI:
9340 case USI_FTYPE_V32HI_V32HI_USI:
9341 case UQI_FTYPE_V4SI_V4SI_UQI:
9342 case UQI_FTYPE_V8SI_V8SI_UQI:
9343 case UQI_FTYPE_V2DI_V2DI_UQI:
9344 case UQI_FTYPE_V4DI_V4DI_UQI:
9345 case V4SF_FTYPE_V2DF_V4SF_UQI:
9346 case V4SF_FTYPE_V4DF_V4SF_UQI:
9347 case V16SI_FTYPE_V16SI_V16SI_UHI:
9348 case V16SI_FTYPE_V4SI_V16SI_UHI:
9349 case V2DI_FTYPE_V4SI_V2DI_UQI:
9350 case V2DI_FTYPE_V8HI_V2DI_UQI:
9351 case V2DI_FTYPE_V16QI_V2DI_UQI:
9352 case V4DI_FTYPE_V4DI_V4DI_UQI:
9353 case V4DI_FTYPE_V4SI_V4DI_UQI:
9354 case V4DI_FTYPE_V8HI_V4DI_UQI:
9355 case V4DI_FTYPE_V16QI_V4DI_UQI:
9356 case V4DI_FTYPE_V4DF_V4DI_UQI:
9357 case V2DI_FTYPE_V2DF_V2DI_UQI:
9358 case V4SI_FTYPE_V4DF_V4SI_UQI:
9359 case V4SI_FTYPE_V2DF_V4SI_UQI:
9360 case V4SI_FTYPE_V8HI_V4SI_UQI:
9361 case V4SI_FTYPE_V16QI_V4SI_UQI:
9362 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9363 case V8DF_FTYPE_V2DF_V8DF_UQI:
9364 case V8DF_FTYPE_V4DF_V8DF_UQI:
9365 case V8DF_FTYPE_V8DF_V8DF_UQI:
9366 case V8SF_FTYPE_V8SF_V8SF_UQI:
9367 case V8SF_FTYPE_V8SI_V8SF_UQI:
9368 case V4DF_FTYPE_V4DF_V4DF_UQI:
9369 case V4SF_FTYPE_V4SF_V4SF_UQI:
9370 case V2DF_FTYPE_V2DF_V2DF_UQI:
9371 case V2DF_FTYPE_V4SF_V2DF_UQI:
9372 case V2DF_FTYPE_V4SI_V2DF_UQI:
9373 case V4SF_FTYPE_V4SI_V4SF_UQI:
9374 case V4DF_FTYPE_V4SF_V4DF_UQI:
9375 case V4DF_FTYPE_V4SI_V4DF_UQI:
9376 case V8SI_FTYPE_V8SI_V8SI_UQI:
9377 case V8SI_FTYPE_V8HI_V8SI_UQI:
9378 case V8SI_FTYPE_V16QI_V8SI_UQI:
9379 case V8DF_FTYPE_V8SI_V8DF_UQI:
9380 case V8DI_FTYPE_DI_V8DI_UQI:
9381 case V16SF_FTYPE_V8SF_V16SF_UHI:
9382 case V16SI_FTYPE_V8SI_V16SI_UHI:
9383 case V16HI_FTYPE_V16HI_V16HI_UHI:
9384 case V8HI_FTYPE_V16QI_V8HI_UQI:
9385 case V16HI_FTYPE_V16QI_V16HI_UHI:
9386 case V32HI_FTYPE_V32HI_V32HI_USI:
9387 case V32HI_FTYPE_V32QI_V32HI_USI:
9388 case V8DI_FTYPE_V16QI_V8DI_UQI:
9389 case V8DI_FTYPE_V2DI_V8DI_UQI:
9390 case V8DI_FTYPE_V4DI_V8DI_UQI:
9391 case V8DI_FTYPE_V8DI_V8DI_UQI:
9392 case V8DI_FTYPE_V8HI_V8DI_UQI:
9393 case V8DI_FTYPE_V8SI_V8DI_UQI:
9394 case V8HI_FTYPE_V8DI_V8HI_UQI:
9395 case V8SI_FTYPE_V8DI_V8SI_UQI:
9396 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9397 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9398 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9399 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9400 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9401 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9402 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9403 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9404 case V32HI_FTYPE_V16SF_V16SF_USI:
9405 case V16HI_FTYPE_V8SF_V8SF_UHI:
9406 case V8HI_FTYPE_V4SF_V4SF_UQI:
9407 case V16HI_FTYPE_V16SF_V16HI_UHI:
9408 case V8HI_FTYPE_V8SF_V8HI_UQI:
9409 case V8HI_FTYPE_V4SF_V8HI_UQI:
9410 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9411 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9412 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9413 nargs = 3;
9414 break;
9415 case V32QI_FTYPE_V32QI_V32QI_INT:
9416 case V16HI_FTYPE_V16HI_V16HI_INT:
9417 case V16QI_FTYPE_V16QI_V16QI_INT:
9418 case V4DI_FTYPE_V4DI_V4DI_INT:
9419 case V8HI_FTYPE_V8HI_V8HI_INT:
9420 case V8SI_FTYPE_V8SI_V8SI_INT:
9421 case V8SI_FTYPE_V8SI_V4SI_INT:
9422 case V8SF_FTYPE_V8SF_V8SF_INT:
9423 case V8SF_FTYPE_V8SF_V4SF_INT:
9424 case V4SI_FTYPE_V4SI_V4SI_INT:
9425 case V4DF_FTYPE_V4DF_V4DF_INT:
9426 case V16SF_FTYPE_V16SF_V16SF_INT:
9427 case V16SF_FTYPE_V16SF_V4SF_INT:
9428 case V16SI_FTYPE_V16SI_V4SI_INT:
9429 case V4DF_FTYPE_V4DF_V2DF_INT:
9430 case V4SF_FTYPE_V4SF_V4SF_INT:
9431 case V2DI_FTYPE_V2DI_V2DI_INT:
9432 case V4DI_FTYPE_V4DI_V2DI_INT:
9433 case V2DF_FTYPE_V2DF_V2DF_INT:
9434 case UQI_FTYPE_V8DI_V8UDI_INT:
9435 case UQI_FTYPE_V8DF_V8DF_INT:
9436 case UQI_FTYPE_V2DF_V2DF_INT:
9437 case UQI_FTYPE_V4SF_V4SF_INT:
9438 case UHI_FTYPE_V16SI_V16SI_INT:
9439 case UHI_FTYPE_V16SF_V16SF_INT:
9440 case V64QI_FTYPE_V64QI_V64QI_INT:
9441 case V32HI_FTYPE_V32HI_V32HI_INT:
9442 case V16SI_FTYPE_V16SI_V16SI_INT:
9443 case V8DI_FTYPE_V8DI_V8DI_INT:
9444 nargs = 3;
9445 nargs_constant = 1;
9446 break;
9447 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9448 nargs = 3;
9449 rmode = V4DImode;
9450 nargs_constant = 1;
9451 break;
9452 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9453 nargs = 3;
9454 rmode = V2DImode;
9455 nargs_constant = 1;
9456 break;
9457 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9458 nargs = 3;
9459 rmode = DImode;
9460 nargs_constant = 1;
9461 break;
9462 case V2DI_FTYPE_V2DI_UINT_UINT:
9463 nargs = 3;
9464 nargs_constant = 2;
9465 break;
9466 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9467 nargs = 3;
9468 rmode = V8DImode;
9469 nargs_constant = 1;
9470 break;
9471 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9472 nargs = 5;
9473 rmode = V8DImode;
9474 mask_pos = 2;
9475 nargs_constant = 1;
9476 break;
9477 case QI_FTYPE_V8DF_INT_UQI:
9478 case QI_FTYPE_V4DF_INT_UQI:
9479 case QI_FTYPE_V2DF_INT_UQI:
9480 case HI_FTYPE_V16SF_INT_UHI:
9481 case QI_FTYPE_V8SF_INT_UQI:
9482 case QI_FTYPE_V4SF_INT_UQI:
9483 case V4SI_FTYPE_V4SI_V4SI_UHI:
9484 case V8SI_FTYPE_V8SI_V8SI_UHI:
9485 nargs = 3;
9486 mask_pos = 1;
9487 nargs_constant = 1;
9488 break;
9489 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9490 nargs = 5;
9491 rmode = V4DImode;
9492 mask_pos = 2;
9493 nargs_constant = 1;
9494 break;
9495 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9496 nargs = 5;
9497 rmode = V2DImode;
9498 mask_pos = 2;
9499 nargs_constant = 1;
9500 break;
9501 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9502 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9503 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9504 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9505 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9506 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9507 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9508 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9509 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9510 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9511 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9512 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9513 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9514 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9515 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9516 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9517 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9518 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9519 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9520 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9521 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9522 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9523 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9524 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9525 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9526 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9527 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9528 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9529 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9530 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9531 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9532 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9533 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9534 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9535 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9536 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9537 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9538 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9539 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9540 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9541 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9542 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9543 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9544 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9545 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9546 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9547 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9548 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9549 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9550 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9551 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9552 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9553 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9554 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9555 nargs = 4;
9556 break;
9557 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9558 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9559 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9560 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9561 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9562 nargs = 4;
9563 nargs_constant = 1;
9564 break;
9565 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9566 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9567 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9568 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9569 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9570 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9571 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9572 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9573 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9574 case USI_FTYPE_V32QI_V32QI_INT_USI:
9575 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9576 case USI_FTYPE_V32HI_V32HI_INT_USI:
9577 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9578 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9579 nargs = 4;
9580 mask_pos = 1;
9581 nargs_constant = 1;
9582 break;
9583 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9584 nargs = 4;
9585 nargs_constant = 2;
9586 break;
9587 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9588 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9589 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9590 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9591 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9592 nargs = 4;
9593 break;
9594 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9595 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9596 mask_pos = 1;
9597 nargs = 4;
9598 nargs_constant = 1;
9599 break;
9600 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9601 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9602 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9603 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9604 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9605 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9606 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9607 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9608 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9609 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9610 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9611 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9612 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9613 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9614 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9615 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9616 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9617 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9618 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9619 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9620 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9621 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9622 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9623 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9624 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9625 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9626 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9627 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9628 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9629 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9630 nargs = 4;
9631 mask_pos = 2;
9632 nargs_constant = 1;
9633 break;
9634 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9635 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9636 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9637 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9638 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9639 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9640 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9641 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9642 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9643 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9644 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9645 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9646 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9647 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9648 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9649 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9650 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9651 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9652 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9653 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9654 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9655 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9656 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9657 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9658 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9659 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9660 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9661 nargs = 5;
9662 mask_pos = 2;
9663 nargs_constant = 1;
9664 break;
9665 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9666 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9667 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9668 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9669 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9670 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9671 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9672 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9673 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9674 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9675 nargs = 5;
9676 mask_pos = 1;
9677 nargs_constant = 1;
9678 break;
9679 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9680 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9681 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9682 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9683 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9684 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9685 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9686 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9687 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9688 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9689 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9690 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9691 nargs = 5;
9692 mask_pos = 1;
9693 nargs_constant = 2;
9694 break;
9695
9696 default:
9697 gcc_unreachable ();
9698 }
9699
9700 gcc_assert (nargs <= ARRAY_SIZE (args));
9701
9702 if (comparison != UNKNOWN)
9703 {
9704 gcc_assert (nargs == 2);
9705 return ix86_expand_sse_compare (d, exp, target, swap);
9706 }
9707
9708 if (rmode == VOIDmode || rmode == tmode)
9709 {
9710 if (optimize
9711 || target == 0
9712 || GET_MODE (target) != tmode
9713 || !insn_p->operand[0].predicate (target, tmode))
9714 target = gen_reg_rtx (tmode);
9715 else if (memory_operand (target, tmode))
9716 num_memory++;
9717 real_target = target;
9718 }
9719 else
9720 {
9721 real_target = gen_reg_rtx (tmode);
9722 target = lowpart_subreg (rmode, real_target, tmode);
9723 }
9724
9725 for (i = 0; i < nargs; i++)
9726 {
9727 tree arg = CALL_EXPR_ARG (exp, i);
9728 rtx op = expand_normal (arg);
9729 machine_mode mode = insn_p->operand[i + 1].mode;
9730 bool match = insn_p->operand[i + 1].predicate (op, mode);
9731
9732 if (second_arg_count && i == 1)
9733 {
9734 	      /* SIMD shift insns take either an 8-bit immediate or a
9735 		 register as the count, but the builtin functions take an
9736 		 int as the count.  If the count operand doesn't match,
9737 		 put it in a register.  The instructions use a 64-bit
9738 		 count; if op is only 32-bit, zero-extend it, since
9739 		 negative shift counts are undefined behavior and
9740 		 zero-extension is more efficient.  */
9741 if (!match)
9742 {
9743 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9744 op = convert_modes (mode, GET_MODE (op), op, 1);
9745 else
9746 op = lowpart_subreg (mode, op, GET_MODE (op));
9747 if (!insn_p->operand[i + 1].predicate (op, mode))
9748 op = copy_to_reg (op);
9749 }
9750 }
9751 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9752 (!mask_pos && (nargs - i) <= nargs_constant))
9753 {
9754 if (!match)
9755 switch (icode)
9756 {
9757 case CODE_FOR_avx_vinsertf128v4di:
9758 case CODE_FOR_avx_vextractf128v4di:
9759 		  error ("the last argument must be a 1-bit immediate");
9760 return const0_rtx;
9761
9762 case CODE_FOR_avx512f_cmpv8di3_mask:
9763 case CODE_FOR_avx512f_cmpv16si3_mask:
9764 case CODE_FOR_avx512f_ucmpv8di3_mask:
9765 case CODE_FOR_avx512f_ucmpv16si3_mask:
9766 case CODE_FOR_avx512vl_cmpv4di3_mask:
9767 case CODE_FOR_avx512vl_cmpv8si3_mask:
9768 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9769 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9770 case CODE_FOR_avx512vl_cmpv2di3_mask:
9771 case CODE_FOR_avx512vl_cmpv4si3_mask:
9772 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9773 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9774 error ("the last argument must be a 3-bit immediate");
9775 return const0_rtx;
9776
9777 case CODE_FOR_sse4_1_roundsd:
9778 case CODE_FOR_sse4_1_roundss:
9779
9780 case CODE_FOR_sse4_1_roundpd:
9781 case CODE_FOR_sse4_1_roundps:
9782 case CODE_FOR_avx_roundpd256:
9783 case CODE_FOR_avx_roundps256:
9784
9785 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9786 case CODE_FOR_sse4_1_roundps_sfix:
9787 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9788 case CODE_FOR_avx_roundps_sfix256:
9789
9790 case CODE_FOR_sse4_1_blendps:
9791 case CODE_FOR_avx_blendpd256:
9792 case CODE_FOR_avx_vpermilv4df:
9793 case CODE_FOR_avx_vpermilv4df_mask:
9794 case CODE_FOR_avx512f_getmantv8df_mask:
9795 case CODE_FOR_avx512f_getmantv16sf_mask:
9796 case CODE_FOR_avx512vl_getmantv8sf_mask:
9797 case CODE_FOR_avx512vl_getmantv4df_mask:
9798 case CODE_FOR_avx512vl_getmantv4sf_mask:
9799 case CODE_FOR_avx512vl_getmantv2df_mask:
9800 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9801 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9802 case CODE_FOR_avx512dq_rangepv4df_mask:
9803 case CODE_FOR_avx512dq_rangepv8sf_mask:
9804 case CODE_FOR_avx512dq_rangepv2df_mask:
9805 case CODE_FOR_avx512dq_rangepv4sf_mask:
9806 case CODE_FOR_avx_shufpd256_mask:
9807 error ("the last argument must be a 4-bit immediate");
9808 return const0_rtx;
9809
9810 case CODE_FOR_sha1rnds4:
9811 case CODE_FOR_sse4_1_blendpd:
9812 case CODE_FOR_avx_vpermilv2df:
9813 case CODE_FOR_avx_vpermilv2df_mask:
9814 case CODE_FOR_xop_vpermil2v2df3:
9815 case CODE_FOR_xop_vpermil2v4sf3:
9816 case CODE_FOR_xop_vpermil2v4df3:
9817 case CODE_FOR_xop_vpermil2v8sf3:
9818 case CODE_FOR_avx512f_vinsertf32x4_mask:
9819 case CODE_FOR_avx512f_vinserti32x4_mask:
9820 case CODE_FOR_avx512f_vextractf32x4_mask:
9821 case CODE_FOR_avx512f_vextracti32x4_mask:
9822 case CODE_FOR_sse2_shufpd:
9823 case CODE_FOR_sse2_shufpd_mask:
9824 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9825 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9826 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9827 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9828 error ("the last argument must be a 2-bit immediate");
9829 return const0_rtx;
9830
9831 case CODE_FOR_avx_vextractf128v4df:
9832 case CODE_FOR_avx_vextractf128v8sf:
9833 case CODE_FOR_avx_vextractf128v8si:
9834 case CODE_FOR_avx_vinsertf128v4df:
9835 case CODE_FOR_avx_vinsertf128v8sf:
9836 case CODE_FOR_avx_vinsertf128v8si:
9837 case CODE_FOR_avx512f_vinsertf64x4_mask:
9838 case CODE_FOR_avx512f_vinserti64x4_mask:
9839 case CODE_FOR_avx512f_vextractf64x4_mask:
9840 case CODE_FOR_avx512f_vextracti64x4_mask:
9841 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9842 case CODE_FOR_avx512dq_vinserti32x8_mask:
9843 case CODE_FOR_avx512vl_vinsertv4df:
9844 case CODE_FOR_avx512vl_vinsertv4di:
9845 case CODE_FOR_avx512vl_vinsertv8sf:
9846 case CODE_FOR_avx512vl_vinsertv8si:
9847 error ("the last argument must be a 1-bit immediate");
9848 return const0_rtx;
9849
9850 case CODE_FOR_avx_vmcmpv2df3:
9851 case CODE_FOR_avx_vmcmpv4sf3:
9852 case CODE_FOR_avx_cmpv2df3:
9853 case CODE_FOR_avx_cmpv4sf3:
9854 case CODE_FOR_avx_cmpv4df3:
9855 case CODE_FOR_avx_cmpv8sf3:
9856 case CODE_FOR_avx512f_cmpv8df3_mask:
9857 case CODE_FOR_avx512f_cmpv16sf3_mask:
9858 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9859 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9860 error ("the last argument must be a 5-bit immediate");
9861 return const0_rtx;
9862
9863 default:
9864 switch (nargs_constant)
9865 {
9866 case 2:
9867 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9868 (!mask_pos && (nargs - i) == nargs_constant))
9869 {
9870 error ("the next to last argument must be an 8-bit immediate");
9871 break;
9872 }
9873 /* FALLTHRU */
9874 case 1:
9875 error ("the last argument must be an 8-bit immediate");
9876 break;
9877 default:
9878 gcc_unreachable ();
9879 }
9880 return const0_rtx;
9881 }
9882 }
9883 else
9884 {
9885 if (VECTOR_MODE_P (mode))
9886 op = safe_vector_operand (op, mode);
9887
9888 /* If we aren't optimizing, only allow one memory operand to
9889 be generated. */
9890 if (memory_operand (op, mode))
9891 num_memory++;
9892
9893 op = fixup_modeless_constant (op, mode);
9894
9895 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9896 {
9897 if (optimize || !match || num_memory > 1)
9898 op = copy_to_mode_reg (mode, op);
9899 }
9900 else
9901 {
9902 op = copy_to_reg (op);
9903 op = lowpart_subreg (mode, op, GET_MODE (op));
9904 }
9905 }
9906
9907 args[i].op = op;
9908 args[i].mode = mode;
9909 }
9910
9911 switch (nargs)
9912 {
9913 case 1:
9914 pat = GEN_FCN (icode) (real_target, args[0].op);
9915 break;
9916 case 2:
9917 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9918 break;
9919 case 3:
9920 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9921 args[2].op);
9922 break;
9923 case 4:
9924 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9925 args[2].op, args[3].op);
9926 break;
9927 case 5:
9928 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9929 args[2].op, args[3].op, args[4].op);
9930 break;
9931 case 6:
9932 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9933 args[2].op, args[3].op, args[4].op,
9934 args[5].op);
9935 break;
9936 default:
9937 gcc_unreachable ();
9938 }
9939
9940 if (! pat)
9941 return 0;
9942
9943 emit_insn (pat);
9944 return target;
9945 }
9946
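/* As a concrete illustration of the expander above (the builtin named
   here is only an example): __builtin_ia32_paddd256, whose signature is
   V8SI_FTYPE_V8SI_V8SI, takes the plain two-operand path: nargs is 2,
   operands that fail the insn predicates are copied into registers, and
   the single insn produced by GEN_FCN (icode) is emitted.  */
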
9947 /* Transform a pattern of the following layout:
9948      (set A
9949        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9950    into:
9951      (set A B)
9952    i.e. drop the UNSPEC wrapper and its rounding operand C.  */
9953
9954 static rtx
9955 ix86_erase_embedded_rounding (rtx pat)
9956 {
9957 if (GET_CODE (pat) == INSN)
9958 pat = PATTERN (pat);
9959
9960 gcc_assert (GET_CODE (pat) == SET);
9961 rtx src = SET_SRC (pat);
9962 gcc_assert (XVECLEN (src, 0) == 2);
9963 rtx p0 = XVECEXP (src, 0, 0);
9964 gcc_assert (GET_CODE (src) == UNSPEC
9965 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9966 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9967 return res;
9968 }
9969
9970 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9971 with rounding. */
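
/* For example (a sketch; the intrinsic shown below lives in
   avx512fintrin.h):

     int k = _mm_comi_round_sd (a, b, _CMP_GE_OS, _MM_FROUND_NO_EXC);

   reaches this expander through the INT_FTYPE_V2DF_V2DF_INT_INT case of
   ix86_expand_round_builtin and is emitted as a COMI/UCOMI comparison
   followed by a setcc-style test of the resulting flags.  */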
9972 static rtx
9973 ix86_expand_sse_comi_round (const struct builtin_description *d,
9974 tree exp, rtx target)
9975 {
9976 rtx pat, set_dst;
9977 tree arg0 = CALL_EXPR_ARG (exp, 0);
9978 tree arg1 = CALL_EXPR_ARG (exp, 1);
9979 tree arg2 = CALL_EXPR_ARG (exp, 2);
9980 tree arg3 = CALL_EXPR_ARG (exp, 3);
9981 rtx op0 = expand_normal (arg0);
9982 rtx op1 = expand_normal (arg1);
9983 rtx op2 = expand_normal (arg2);
9984 rtx op3 = expand_normal (arg3);
9985 enum insn_code icode = d->icode;
9986 const struct insn_data_d *insn_p = &insn_data[icode];
9987 machine_mode mode0 = insn_p->operand[0].mode;
9988 machine_mode mode1 = insn_p->operand[1].mode;
9989
9990 /* See avxintrin.h for values. */
9991 static const enum rtx_code comparisons[32] =
9992 {
9993 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9994 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9995 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9996 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
9997 };
9998 static const bool ordereds[32] =
9999 {
10000 true, true, true, false, false, false, false, true,
10001 false, false, false, true, true, true, true, false,
10002 true, true, true, false, false, false, false, true,
10003 false, false, false, true, true, true, true, false
10004 };
10005 static const bool non_signalings[32] =
10006 {
10007 true, false, false, true, true, false, false, true,
10008 true, false, false, true, true, false, false, true,
10009 false, true, true, false, false, true, true, false,
10010 false, true, true, false, false, true, true, false
10011 };
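
  /* Decoding example (an illustration only; the predicate values are
     those of avxintrin.h): _CMP_GE_OS is 13, so comparisons[13] is GE,
     ordereds[13] is true and non_signalings[13] is false, i.e. an
     ordered, signaling comparison that the switch below leaves to be
     handled as a CCFPmode COMI.  */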
10012
10013 if (!CONST_INT_P (op2))
10014 {
10015       error ("the third argument must be a comparison constant");
10016 return const0_rtx;
10017 }
10018 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10019 {
10020 error ("incorrect comparison mode");
10021 return const0_rtx;
10022 }
10023
10024 if (!insn_p->operand[2].predicate (op3, SImode))
10025 {
10026 error ("incorrect rounding operand");
10027 return const0_rtx;
10028 }
10029
10030 if (VECTOR_MODE_P (mode0))
10031 op0 = safe_vector_operand (op0, mode0);
10032 if (VECTOR_MODE_P (mode1))
10033 op1 = safe_vector_operand (op1, mode1);
10034
10035 enum rtx_code comparison = comparisons[INTVAL (op2)];
10036 bool ordered = ordereds[INTVAL (op2)];
10037 bool non_signaling = non_signalings[INTVAL (op2)];
10038 rtx const_val = const0_rtx;
10039
10040 bool check_unordered = false;
10041 machine_mode mode = CCFPmode;
10042 switch (comparison)
10043 {
10044 case ORDERED:
10045 if (!ordered)
10046 {
10047 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10048 if (!non_signaling)
10049 ordered = true;
10050 mode = CCSmode;
10051 }
10052 else
10053 {
10054 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10055 if (non_signaling)
10056 ordered = false;
10057 mode = CCPmode;
10058 }
10059 comparison = NE;
10060 break;
10061 case UNORDERED:
10062 if (ordered)
10063 {
10064 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10065 if (non_signaling)
10066 ordered = false;
10067 mode = CCSmode;
10068 }
10069 else
10070 {
10071 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10072 if (!non_signaling)
10073 ordered = true;
10074 mode = CCPmode;
10075 }
10076 comparison = EQ;
10077 break;
10078
10079 case LE: /* -> GE */
10080 case LT: /* -> GT */
10081 case UNGE: /* -> UNLE */
10082 case UNGT: /* -> UNLT */
10083 std::swap (op0, op1);
10084 comparison = swap_condition (comparison);
10085 /* FALLTHRU */
10086 case GT:
10087 case GE:
10088 case UNEQ:
10089 case UNLT:
10090 case UNLE:
10091 case LTGT:
10092 /* These are supported by CCFPmode. NB: Use ordered/signaling
10093 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10094 with NAN operands. */
10095 if (ordered == non_signaling)
10096 ordered = !ordered;
10097 break;
10098 case EQ:
10099 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10100 _CMP_EQ_OQ/_CMP_EQ_OS. */
10101 check_unordered = true;
10102 mode = CCZmode;
10103 break;
10104 case NE:
10105 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10106 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10107 gcc_assert (!ordered);
10108 check_unordered = true;
10109 mode = CCZmode;
10110 const_val = const1_rtx;
10111 break;
10112 default:
10113 gcc_unreachable ();
10114 }
10115
10116 target = gen_reg_rtx (SImode);
10117 emit_move_insn (target, const_val);
10118 target = gen_rtx_SUBREG (QImode, target, 0);
10119
10120 if ((optimize && !register_operand (op0, mode0))
10121 || !insn_p->operand[0].predicate (op0, mode0))
10122 op0 = copy_to_mode_reg (mode0, op0);
10123 if ((optimize && !register_operand (op1, mode1))
10124 || !insn_p->operand[1].predicate (op1, mode1))
10125 op1 = copy_to_mode_reg (mode1, op1);
10126
10127 /*
10128 1. COMI: ordered and signaling.
10129 2. UCOMI: unordered and non-signaling.
10130 */
10131 if (non_signaling)
10132 icode = (icode == CODE_FOR_sse_comi_round
10133 ? CODE_FOR_sse_ucomi_round
10134 : CODE_FOR_sse2_ucomi_round);
10135
10136 pat = GEN_FCN (icode) (op0, op1, op3);
10137 if (! pat)
10138 return 0;
10139
10140 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10141 if (INTVAL (op3) == NO_ROUND)
10142 {
10143 pat = ix86_erase_embedded_rounding (pat);
10144 if (! pat)
10145 return 0;
10146
10147 set_dst = SET_DEST (pat);
10148 }
10149 else
10150 {
10151 gcc_assert (GET_CODE (pat) == SET);
10152 set_dst = SET_DEST (pat);
10153 }
10154
10155 emit_insn (pat);
10156
10157 rtx_code_label *label = NULL;
10158
10159   /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10160      sufficient with NAN operands.  */
10161 if (check_unordered)
10162 {
10163 gcc_assert (comparison == EQ || comparison == NE);
10164
10165 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10166 label = gen_label_rtx ();
10167 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10168 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10169 gen_rtx_LABEL_REF (VOIDmode, label),
10170 pc_rtx);
10171 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10172 }
10173
10174   /* NB: The comparison sets CCFPmode, but we check a different CCmode
10175      that is a subset of CCFPmode.  */
10176 if (GET_MODE (set_dst) != mode)
10177 {
10178 gcc_assert (mode == CCAmode || mode == CCCmode
10179 || mode == CCOmode || mode == CCPmode
10180 || mode == CCSmode || mode == CCZmode);
10181 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10182 }
10183
10184 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10185 gen_rtx_fmt_ee (comparison, QImode,
10186 set_dst,
10187 const0_rtx)));
10188
10189 if (label)
10190 emit_label (label);
10191
10192 return SUBREG_REG (target);
10193 }
10194
10195 static rtx
10196 ix86_expand_round_builtin (const struct builtin_description *d,
10197 tree exp, rtx target)
10198 {
10199 rtx pat;
10200 unsigned int i, nargs;
10201 struct
10202 {
10203 rtx op;
10204 machine_mode mode;
10205 } args[6];
10206 enum insn_code icode = d->icode;
10207 const struct insn_data_d *insn_p = &insn_data[icode];
10208 machine_mode tmode = insn_p->operand[0].mode;
10209 unsigned int nargs_constant = 0;
10210 unsigned int redundant_embed_rnd = 0;
10211
10212 switch ((enum ix86_builtin_func_type) d->flag)
10213 {
10214 case UINT64_FTYPE_V2DF_INT:
10215 case UINT64_FTYPE_V4SF_INT:
10216 case UINT_FTYPE_V2DF_INT:
10217 case UINT_FTYPE_V4SF_INT:
10218 case INT64_FTYPE_V2DF_INT:
10219 case INT64_FTYPE_V4SF_INT:
10220 case INT_FTYPE_V2DF_INT:
10221 case INT_FTYPE_V4SF_INT:
10222 nargs = 2;
10223 break;
10224 case V4SF_FTYPE_V4SF_UINT_INT:
10225 case V4SF_FTYPE_V4SF_UINT64_INT:
10226 case V2DF_FTYPE_V2DF_UINT64_INT:
10227 case V4SF_FTYPE_V4SF_INT_INT:
10228 case V4SF_FTYPE_V4SF_INT64_INT:
10229 case V2DF_FTYPE_V2DF_INT64_INT:
10230 case V4SF_FTYPE_V4SF_V4SF_INT:
10231 case V2DF_FTYPE_V2DF_V2DF_INT:
10232 case V4SF_FTYPE_V4SF_V2DF_INT:
10233 case V2DF_FTYPE_V2DF_V4SF_INT:
10234 nargs = 3;
10235 break;
10236 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10237 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10238 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10239 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10240 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10241 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10242 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10243 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10244 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10245 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10246 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10247 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10248 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10249 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10250 nargs = 4;
10251 break;
10252 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10253 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10254 nargs_constant = 2;
10255 nargs = 4;
10256 break;
10257 case INT_FTYPE_V4SF_V4SF_INT_INT:
10258 case INT_FTYPE_V2DF_V2DF_INT_INT:
10259 return ix86_expand_sse_comi_round (d, exp, target);
10260 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10261 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10262 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10263 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10264 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10265 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10266 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10267 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10268 nargs = 5;
10269 break;
10270 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10271 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10272 nargs_constant = 4;
10273 nargs = 5;
10274 break;
10275 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10276 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10277 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10278 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10279 nargs_constant = 3;
10280 nargs = 5;
10281 break;
10282 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10283 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10284 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10285 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10286 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10287 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10288 nargs = 6;
10289 nargs_constant = 4;
10290 break;
10291 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10292 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10293 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10294 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10295 nargs = 6;
10296 nargs_constant = 3;
10297 break;
10298 default:
10299 gcc_unreachable ();
10300 }
10301 gcc_assert (nargs <= ARRAY_SIZE (args));
10302
10303 if (optimize
10304 || target == 0
10305 || GET_MODE (target) != tmode
10306 || !insn_p->operand[0].predicate (target, tmode))
10307 target = gen_reg_rtx (tmode);
10308
10309 for (i = 0; i < nargs; i++)
10310 {
10311 tree arg = CALL_EXPR_ARG (exp, i);
10312 rtx op = expand_normal (arg);
10313 machine_mode mode = insn_p->operand[i + 1].mode;
10314 bool match = insn_p->operand[i + 1].predicate (op, mode);
10315
10316 if (i == nargs - nargs_constant)
10317 {
10318 if (!match)
10319 {
10320 switch (icode)
10321 {
10322 case CODE_FOR_avx512f_getmantv8df_mask_round:
10323 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10324 case CODE_FOR_avx512f_vgetmantv2df_round:
10325 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10326 case CODE_FOR_avx512f_vgetmantv4sf_round:
10327 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10328 error ("the immediate argument must be a 4-bit immediate");
10329 return const0_rtx;
10330 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10331 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10332 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10333 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10334 error ("the immediate argument must be a 5-bit immediate");
10335 return const0_rtx;
10336 default:
10337 error ("the immediate argument must be an 8-bit immediate");
10338 return const0_rtx;
10339 }
10340 }
10341 }
10342       else if (i == nargs - 1)
10343 {
10344 if (!insn_p->operand[nargs].predicate (op, SImode))
10345 {
10346 error ("incorrect rounding operand");
10347 return const0_rtx;
10348 }
10349
10350 	  /* If there is no rounding, use the normal version of the pattern.  */
10351 if (INTVAL (op) == NO_ROUND)
10352 redundant_embed_rnd = 1;
10353 }
10354 else
10355 {
10356 if (VECTOR_MODE_P (mode))
10357 op = safe_vector_operand (op, mode);
10358
10359 op = fixup_modeless_constant (op, mode);
10360
10361 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10362 {
10363 if (optimize || !match)
10364 op = copy_to_mode_reg (mode, op);
10365 }
10366 else
10367 {
10368 op = copy_to_reg (op);
10369 op = lowpart_subreg (mode, op, GET_MODE (op));
10370 }
10371 }
10372
10373 args[i].op = op;
10374 args[i].mode = mode;
10375 }
10376
10377 switch (nargs)
10378 {
10379 case 1:
10380 pat = GEN_FCN (icode) (target, args[0].op);
10381 break;
10382 case 2:
10383 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10384 break;
10385 case 3:
10386 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10387 args[2].op);
10388 break;
10389 case 4:
10390 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10391 args[2].op, args[3].op);
10392 break;
10393 case 5:
10394 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10395 args[2].op, args[3].op, args[4].op);
10396 break;
10397 case 6:
10398 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10399 args[2].op, args[3].op, args[4].op,
10400 args[5].op);
10401 break;
10402 default:
10403 gcc_unreachable ();
10404 }
10405
10406 if (!pat)
10407 return 0;
10408
10409 if (redundant_embed_rnd)
10410 pat = ix86_erase_embedded_rounding (pat);
10411
10412 emit_insn (pat);
10413 return target;
10414 }
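
/* An illustration of the expander above (the intrinsic named here is an
   example from avx512fintrin.h): _mm_add_round_sd (a, b,
   _MM_FROUND_CUR_DIRECTION) arrives with a V2DF_FTYPE_V2DF_V2DF_INT
   signature; since the rounding argument equals NO_ROUND,
   redundant_embed_rnd is set and ix86_erase_embedded_rounding strips the
   UNSPEC_EMBEDDED_ROUNDING wrapper before the insn is emitted.  */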
10415
10416 /* Subroutine of ix86_expand_builtin to take care of special insns
10417 with variable number of operands. */
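
/* As one example of what lands here (the intrinsic mapping is taken from
   emmintrin.h): _mm_stream_si128 uses __builtin_ia32_movntdq, a
   VOID_FTYPE_PV2DI_V2DI store whose icode is CODE_FOR_sse2_movntv2di, so
   aligned_mem is set and the MEM target is given the full
   GET_MODE_ALIGNMENT of its mode.  */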
10418
10419 static rtx
10420 ix86_expand_special_args_builtin (const struct builtin_description *d,
10421 tree exp, rtx target)
10422 {
10423 tree arg;
10424 rtx pat, op;
10425 unsigned int i, nargs, arg_adjust, memory;
10426 bool aligned_mem = false;
10427 struct
10428 {
10429 rtx op;
10430 machine_mode mode;
10431 } args[3];
10432 enum insn_code icode = d->icode;
10433 bool last_arg_constant = false;
10434 const struct insn_data_d *insn_p = &insn_data[icode];
10435 machine_mode tmode = insn_p->operand[0].mode;
10436 enum { load, store } klass;
10437
10438 switch ((enum ix86_builtin_func_type) d->flag)
10439 {
10440 case VOID_FTYPE_VOID:
10441 emit_insn (GEN_FCN (icode) (target));
10442 return 0;
10443 case VOID_FTYPE_UINT64:
10444 case VOID_FTYPE_UNSIGNED:
10445 nargs = 0;
10446 klass = store;
10447 memory = 0;
10448 break;
10449
10450 case INT_FTYPE_VOID:
10451 case USHORT_FTYPE_VOID:
10452 case UINT64_FTYPE_VOID:
10453 case UINT_FTYPE_VOID:
10454 case UNSIGNED_FTYPE_VOID:
10455 nargs = 0;
10456 klass = load;
10457 memory = 0;
10458 break;
10459 case UINT64_FTYPE_PUNSIGNED:
10460 case V2DI_FTYPE_PV2DI:
10461 case V4DI_FTYPE_PV4DI:
10462 case V32QI_FTYPE_PCCHAR:
10463 case V16QI_FTYPE_PCCHAR:
10464 case V8SF_FTYPE_PCV4SF:
10465 case V8SF_FTYPE_PCFLOAT:
10466 case V4SF_FTYPE_PCFLOAT:
10467 case V4DF_FTYPE_PCV2DF:
10468 case V4DF_FTYPE_PCDOUBLE:
10469 case V2DF_FTYPE_PCDOUBLE:
10470 case VOID_FTYPE_PVOID:
10471 case V8DI_FTYPE_PV8DI:
10472 nargs = 1;
10473 klass = load;
10474 memory = 0;
10475 switch (icode)
10476 {
10477 case CODE_FOR_sse4_1_movntdqa:
10478 case CODE_FOR_avx2_movntdqa:
10479 case CODE_FOR_avx512f_movntdqa:
10480 aligned_mem = true;
10481 break;
10482 default:
10483 break;
10484 }
10485 break;
10486 case VOID_FTYPE_PV2SF_V4SF:
10487 case VOID_FTYPE_PV8DI_V8DI:
10488 case VOID_FTYPE_PV4DI_V4DI:
10489 case VOID_FTYPE_PV2DI_V2DI:
10490 case VOID_FTYPE_PCHAR_V32QI:
10491 case VOID_FTYPE_PCHAR_V16QI:
10492 case VOID_FTYPE_PFLOAT_V16SF:
10493 case VOID_FTYPE_PFLOAT_V8SF:
10494 case VOID_FTYPE_PFLOAT_V4SF:
10495 case VOID_FTYPE_PDOUBLE_V8DF:
10496 case VOID_FTYPE_PDOUBLE_V4DF:
10497 case VOID_FTYPE_PDOUBLE_V2DF:
10498 case VOID_FTYPE_PLONGLONG_LONGLONG:
10499 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10500 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10501 case VOID_FTYPE_PINT_INT:
10502 nargs = 1;
10503 klass = store;
10504 /* Reserve memory operand for target. */
10505 memory = ARRAY_SIZE (args);
10506 switch (icode)
10507 {
10508 /* These builtins and instructions require the memory
10509 to be properly aligned. */
10510 case CODE_FOR_avx_movntv4di:
10511 case CODE_FOR_sse2_movntv2di:
10512 case CODE_FOR_avx_movntv8sf:
10513 case CODE_FOR_sse_movntv4sf:
10514 case CODE_FOR_sse4a_vmmovntv4sf:
10515 case CODE_FOR_avx_movntv4df:
10516 case CODE_FOR_sse2_movntv2df:
10517 case CODE_FOR_sse4a_vmmovntv2df:
10518 case CODE_FOR_sse2_movntidi:
10519 case CODE_FOR_sse_movntq:
10520 case CODE_FOR_sse2_movntisi:
10521 case CODE_FOR_avx512f_movntv16sf:
10522 case CODE_FOR_avx512f_movntv8df:
10523 case CODE_FOR_avx512f_movntv8di:
10524 aligned_mem = true;
10525 break;
10526 default:
10527 break;
10528 }
10529 break;
10530 case VOID_FTYPE_PVOID_PCVOID:
10531 nargs = 1;
10532 klass = store;
10533 memory = 0;
10534
10535 break;
10536 case V4SF_FTYPE_V4SF_PCV2SF:
10537 case V2DF_FTYPE_V2DF_PCDOUBLE:
10538 nargs = 2;
10539 klass = load;
10540 memory = 1;
10541 break;
10542 case V8SF_FTYPE_PCV8SF_V8SI:
10543 case V4DF_FTYPE_PCV4DF_V4DI:
10544 case V4SF_FTYPE_PCV4SF_V4SI:
10545 case V2DF_FTYPE_PCV2DF_V2DI:
10546 case V8SI_FTYPE_PCV8SI_V8SI:
10547 case V4DI_FTYPE_PCV4DI_V4DI:
10548 case V4SI_FTYPE_PCV4SI_V4SI:
10549 case V2DI_FTYPE_PCV2DI_V2DI:
10550 case VOID_FTYPE_INT_INT64:
10551 nargs = 2;
10552 klass = load;
10553 memory = 0;
10554 break;
10555 case VOID_FTYPE_PV8DF_V8DF_UQI:
10556 case VOID_FTYPE_PV4DF_V4DF_UQI:
10557 case VOID_FTYPE_PV2DF_V2DF_UQI:
10558 case VOID_FTYPE_PV16SF_V16SF_UHI:
10559 case VOID_FTYPE_PV8SF_V8SF_UQI:
10560 case VOID_FTYPE_PV4SF_V4SF_UQI:
10561 case VOID_FTYPE_PV8DI_V8DI_UQI:
10562 case VOID_FTYPE_PV4DI_V4DI_UQI:
10563 case VOID_FTYPE_PV2DI_V2DI_UQI:
10564 case VOID_FTYPE_PV16SI_V16SI_UHI:
10565 case VOID_FTYPE_PV8SI_V8SI_UQI:
10566 case VOID_FTYPE_PV4SI_V4SI_UQI:
10567 case VOID_FTYPE_PV64QI_V64QI_UDI:
10568 case VOID_FTYPE_PV32HI_V32HI_USI:
10569 case VOID_FTYPE_PV32QI_V32QI_USI:
10570 case VOID_FTYPE_PV16QI_V16QI_UHI:
10571 case VOID_FTYPE_PV16HI_V16HI_UHI:
10572 case VOID_FTYPE_PV8HI_V8HI_UQI:
10573 switch (icode)
10574 {
10575 /* These builtins and instructions require the memory
10576 to be properly aligned. */
10577 case CODE_FOR_avx512f_storev16sf_mask:
10578 case CODE_FOR_avx512f_storev16si_mask:
10579 case CODE_FOR_avx512f_storev8df_mask:
10580 case CODE_FOR_avx512f_storev8di_mask:
10581 case CODE_FOR_avx512vl_storev8sf_mask:
10582 case CODE_FOR_avx512vl_storev8si_mask:
10583 case CODE_FOR_avx512vl_storev4df_mask:
10584 case CODE_FOR_avx512vl_storev4di_mask:
10585 case CODE_FOR_avx512vl_storev4sf_mask:
10586 case CODE_FOR_avx512vl_storev4si_mask:
10587 case CODE_FOR_avx512vl_storev2df_mask:
10588 case CODE_FOR_avx512vl_storev2di_mask:
10589 aligned_mem = true;
10590 break;
10591 default:
10592 break;
10593 }
10594 /* FALLTHRU */
10595 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10596 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10597 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10598 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10599 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10600 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10601 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10602 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10603 case VOID_FTYPE_PV8SI_V8DI_UQI:
10604 case VOID_FTYPE_PV8HI_V8DI_UQI:
10605 case VOID_FTYPE_PV16HI_V16SI_UHI:
10606 case VOID_FTYPE_PV16QI_V8DI_UQI:
10607 case VOID_FTYPE_PV16QI_V16SI_UHI:
10608 case VOID_FTYPE_PV4SI_V4DI_UQI:
10609 case VOID_FTYPE_PV4SI_V2DI_UQI:
10610 case VOID_FTYPE_PV8HI_V4DI_UQI:
10611 case VOID_FTYPE_PV8HI_V2DI_UQI:
10612 case VOID_FTYPE_PV8HI_V8SI_UQI:
10613 case VOID_FTYPE_PV8HI_V4SI_UQI:
10614 case VOID_FTYPE_PV16QI_V4DI_UQI:
10615 case VOID_FTYPE_PV16QI_V2DI_UQI:
10616 case VOID_FTYPE_PV16QI_V8SI_UQI:
10617 case VOID_FTYPE_PV16QI_V4SI_UQI:
10618 case VOID_FTYPE_PCHAR_V64QI_UDI:
10619 case VOID_FTYPE_PCHAR_V32QI_USI:
10620 case VOID_FTYPE_PCHAR_V16QI_UHI:
10621 case VOID_FTYPE_PSHORT_V32HI_USI:
10622 case VOID_FTYPE_PSHORT_V16HI_UHI:
10623 case VOID_FTYPE_PSHORT_V8HI_UQI:
10624 case VOID_FTYPE_PINT_V16SI_UHI:
10625 case VOID_FTYPE_PINT_V8SI_UQI:
10626 case VOID_FTYPE_PINT_V4SI_UQI:
10627 case VOID_FTYPE_PINT64_V8DI_UQI:
10628 case VOID_FTYPE_PINT64_V4DI_UQI:
10629 case VOID_FTYPE_PINT64_V2DI_UQI:
10630 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10631 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10632 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10633 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10634 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10635 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10636 case VOID_FTYPE_PV32QI_V32HI_USI:
10637 case VOID_FTYPE_PV16QI_V16HI_UHI:
10638 case VOID_FTYPE_PV8QI_V8HI_UQI:
10639 nargs = 2;
10640 klass = store;
10641 /* Reserve memory operand for target. */
10642 memory = ARRAY_SIZE (args);
10643 break;
10644 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10645 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10646 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10647 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10648 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10649 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10650 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10651 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10652 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10653 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10654 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10655 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10656 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10657 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10658 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10659 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10660 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10661 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10662 switch (icode)
10663 {
10664 /* These builtins and instructions require the memory
10665 to be properly aligned. */
10666 case CODE_FOR_avx512f_loadv16sf_mask:
10667 case CODE_FOR_avx512f_loadv16si_mask:
10668 case CODE_FOR_avx512f_loadv8df_mask:
10669 case CODE_FOR_avx512f_loadv8di_mask:
10670 case CODE_FOR_avx512vl_loadv8sf_mask:
10671 case CODE_FOR_avx512vl_loadv8si_mask:
10672 case CODE_FOR_avx512vl_loadv4df_mask:
10673 case CODE_FOR_avx512vl_loadv4di_mask:
10674 case CODE_FOR_avx512vl_loadv4sf_mask:
10675 case CODE_FOR_avx512vl_loadv4si_mask:
10676 case CODE_FOR_avx512vl_loadv2df_mask:
10677 case CODE_FOR_avx512vl_loadv2di_mask:
10678 case CODE_FOR_avx512bw_loadv64qi_mask:
10679 case CODE_FOR_avx512vl_loadv32qi_mask:
10680 case CODE_FOR_avx512vl_loadv16qi_mask:
10681 case CODE_FOR_avx512bw_loadv32hi_mask:
10682 case CODE_FOR_avx512vl_loadv16hi_mask:
10683 case CODE_FOR_avx512vl_loadv8hi_mask:
10684 aligned_mem = true;
10685 break;
10686 default:
10687 break;
10688 }
10689 /* FALLTHRU */
10690 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10691 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10692 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10693 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10694 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10695 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10696 case V16SI_FTYPE_PCINT_V16SI_UHI:
10697 case V8SI_FTYPE_PCINT_V8SI_UQI:
10698 case V4SI_FTYPE_PCINT_V4SI_UQI:
10699 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10700 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10701 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10702 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10703 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10704 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10705 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10706 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10707 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10708 nargs = 3;
10709 klass = load;
10710 memory = 0;
10711 break;
10712 case VOID_FTYPE_UINT_UINT_UINT:
10713 case VOID_FTYPE_UINT64_UINT_UINT:
10714 case UCHAR_FTYPE_UINT_UINT_UINT:
10715 case UCHAR_FTYPE_UINT64_UINT_UINT:
10716 nargs = 3;
10717 klass = load;
10718 memory = ARRAY_SIZE (args);
10719 last_arg_constant = true;
10720 break;
10721 default:
10722 gcc_unreachable ();
10723 }
10724
10725 gcc_assert (nargs <= ARRAY_SIZE (args));
10726
10727 if (klass == store)
10728 {
10729 arg = CALL_EXPR_ARG (exp, 0);
10730 op = expand_normal (arg);
10731 gcc_assert (target == 0);
10732 if (memory)
10733 {
10734 op = ix86_zero_extend_to_Pmode (op);
10735 target = gen_rtx_MEM (tmode, op);
10736 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10737 on it. Try to improve it using get_pointer_alignment,
10738 and if the special builtin is one that requires strict
10739 	     mode alignment, also from its GET_MODE_ALIGNMENT.
10740 Failure to do so could lead to ix86_legitimate_combined_insn
10741 rejecting all changes to such insns. */
10742 unsigned int align = get_pointer_alignment (arg);
10743 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10744 align = GET_MODE_ALIGNMENT (tmode);
10745 if (MEM_ALIGN (target) < align)
10746 set_mem_align (target, align);
10747 }
10748 else
10749 target = force_reg (tmode, op);
10750 arg_adjust = 1;
10751 }
10752 else
10753 {
10754 arg_adjust = 0;
10755 if (optimize
10756 || target == 0
10757 || !register_operand (target, tmode)
10758 || GET_MODE (target) != tmode)
10759 target = gen_reg_rtx (tmode);
10760 }
10761
10762 for (i = 0; i < nargs; i++)
10763 {
10764 machine_mode mode = insn_p->operand[i + 1].mode;
10765 bool match;
10766
10767 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10768 op = expand_normal (arg);
10769 match = insn_p->operand[i + 1].predicate (op, mode);
10770
10771 if (last_arg_constant && (i + 1) == nargs)
10772 {
10773 if (!match)
10774 {
10775 if (icode == CODE_FOR_lwp_lwpvalsi3
10776 || icode == CODE_FOR_lwp_lwpinssi3
10777 || icode == CODE_FOR_lwp_lwpvaldi3
10778 || icode == CODE_FOR_lwp_lwpinsdi3)
10779 error ("the last argument must be a 32-bit immediate");
10780 else
10781 error ("the last argument must be an 8-bit immediate");
10782 return const0_rtx;
10783 }
10784 }
10785 else
10786 {
10787 if (i == memory)
10788 {
10789 /* This must be the memory operand. */
10790 op = ix86_zero_extend_to_Pmode (op);
10791 op = gen_rtx_MEM (mode, op);
10792 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10793 on it. Try to improve it using get_pointer_alignment,
10794 and if the special builtin is one that requires strict
10795 		 mode alignment, also from its GET_MODE_ALIGNMENT.
10796 Failure to do so could lead to ix86_legitimate_combined_insn
10797 rejecting all changes to such insns. */
10798 unsigned int align = get_pointer_alignment (arg);
10799 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10800 align = GET_MODE_ALIGNMENT (mode);
10801 if (MEM_ALIGN (op) < align)
10802 set_mem_align (op, align);
10803 }
10804 else
10805 {
10806 	      /* This must be a register.  */
10807 if (VECTOR_MODE_P (mode))
10808 op = safe_vector_operand (op, mode);
10809
10810 op = fixup_modeless_constant (op, mode);
10811
10812 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10813 op = copy_to_mode_reg (mode, op);
10814 else
10815 {
10816 op = copy_to_reg (op);
10817 op = lowpart_subreg (mode, op, GET_MODE (op));
10818 }
10819 }
10820 }
10821
10822 args[i].op = op;
10823 args[i].mode = mode;
10824 }
10825
10826 switch (nargs)
10827 {
10828 case 0:
10829 pat = GEN_FCN (icode) (target);
10830 break;
10831 case 1:
10832 pat = GEN_FCN (icode) (target, args[0].op);
10833 break;
10834 case 2:
10835 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10836 break;
10837 case 3:
10838 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10839 break;
10840 default:
10841 gcc_unreachable ();
10842 }
10843
10844 if (! pat)
10845 return 0;
10846 emit_insn (pat);
10847 return klass == store ? 0 : target;
10848 }
10849
10850 /* Return the integer constant in ARG. Constrain it to be in the range
10851 of the subparts of VEC_TYPE; issue an error if not. */
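
/* For a V4SF vector type, for instance, TYPE_VECTOR_SUBPARTS is 4, so any
   selector outside [0, 3] is rejected with the error below.  */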
10852
10853 static int
10854 get_element_number (tree vec_type, tree arg)
10855 {
10856 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10857
10858 if (!tree_fits_uhwi_p (arg)
10859 || (elt = tree_to_uhwi (arg), elt > max))
10860 {
10861 error ("selector must be an integer constant in the range "
10862 "[0, %wi]", max);
10863 return 0;
10864 }
10865
10866 return elt;
10867 }
10868
10869 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10870 ix86_expand_vector_init. We DO have language-level syntax for this, in
10871 the form of (type){ init-list }. Except that since we can't place emms
10872 instructions from inside the compiler, we can't allow the use of MMX
10873 registers unless the user explicitly asks for it. So we do *not* define
10874 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10875    we have builtins invoked by mmintrin.h that give us license to emit
10876 these sorts of instructions. */
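
/* As an illustration (the mapping is the one used by mmintrin.h):
   _mm_set_pi32 is implemented with __builtin_ia32_vec_init_v2si, which
   ix86_expand_builtin routes to this function.  */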
10877
10878 static rtx
10879 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10880 {
10881 machine_mode tmode = TYPE_MODE (type);
10882 machine_mode inner_mode = GET_MODE_INNER (tmode);
10883 int i, n_elt = GET_MODE_NUNITS (tmode);
10884 rtvec v = rtvec_alloc (n_elt);
10885
10886 gcc_assert (VECTOR_MODE_P (tmode));
10887 gcc_assert (call_expr_nargs (exp) == n_elt);
10888
10889 for (i = 0; i < n_elt; ++i)
10890 {
10891 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10892 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10893 }
10894
10895 if (!target || !register_operand (target, tmode))
10896 target = gen_reg_rtx (tmode);
10897
10898 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10899 return target;
10900 }
10901
10902 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10903 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10904 had a language-level syntax for referencing vector elements. */
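
/* For example (mapping as in xmmintrin.h): _mm_extract_pi16 uses
   __builtin_ia32_vec_ext_v4hi, which ends up here and is expanded via
   ix86_expand_vector_extract.  */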
10905
10906 static rtx
10907 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10908 {
10909 machine_mode tmode, mode0;
10910 tree arg0, arg1;
10911 int elt;
10912 rtx op0;
10913
10914 arg0 = CALL_EXPR_ARG (exp, 0);
10915 arg1 = CALL_EXPR_ARG (exp, 1);
10916
10917 op0 = expand_normal (arg0);
10918 elt = get_element_number (TREE_TYPE (arg0), arg1);
10919
10920 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10921 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10922 gcc_assert (VECTOR_MODE_P (mode0));
10923
10924 op0 = force_reg (mode0, op0);
10925
10926 if (optimize || !target || !register_operand (target, tmode))
10927 target = gen_reg_rtx (tmode);
10928
10929 ix86_expand_vector_extract (true, target, op0, elt);
10930
10931 return target;
10932 }
10933
10934 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10935 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10936 a language-level syntax for referencing vector elements. */
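
/* For example (mapping as in xmmintrin.h): _mm_insert_pi16 uses
   __builtin_ia32_vec_set_v4hi, which is expanded here on a copy of its
   first operand.  */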
10937
10938 static rtx
10939 ix86_expand_vec_set_builtin (tree exp)
10940 {
10941 machine_mode tmode, mode1;
10942 tree arg0, arg1, arg2;
10943 int elt;
10944 rtx op0, op1, target;
10945
10946 arg0 = CALL_EXPR_ARG (exp, 0);
10947 arg1 = CALL_EXPR_ARG (exp, 1);
10948 arg2 = CALL_EXPR_ARG (exp, 2);
10949
10950 tmode = TYPE_MODE (TREE_TYPE (arg0));
10951 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10952 gcc_assert (VECTOR_MODE_P (tmode));
10953
10954 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10955 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10956 elt = get_element_number (TREE_TYPE (arg0), arg2);
10957
10958 if (GET_MODE (op1) != mode1)
10959 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10960
10961 op0 = force_reg (tmode, op0);
10962 op1 = force_reg (mode1, op1);
10963
10964 /* OP0 is the source of these builtin functions and shouldn't be
10965 modified. Create a copy, use it and return it as target. */
10966 target = gen_reg_rtx (tmode);
10967 emit_move_insn (target, op0);
10968 ix86_expand_vector_set (true, target, op1, elt);
10969
10970 return target;
10971 }
10972
10973 /* Expand an expression EXP that calls a built-in function,
10974 with result going to TARGET if that's convenient
10975 (and in mode MODE if that's convenient).
10976 SUBTARGET may be used as the target for computing one of EXP's operands.
10977 IGNORE is nonzero if the value is to be ignored. */
10978
10979 rtx
10980 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10981 machine_mode mode, int ignore)
10982 {
10983 size_t i;
10984 enum insn_code icode, icode2;
10985 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10986 tree arg0, arg1, arg2, arg3, arg4;
10987 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10988 machine_mode mode0, mode1, mode2, mode3, mode4;
10989 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
10990
10991 /* For CPU builtins that can be folded, fold first and expand the fold. */
10992 switch (fcode)
10993 {
10994 case IX86_BUILTIN_CPU_INIT:
10995 {
10996 /* Make it call __cpu_indicator_init in libgcc. */
10997 tree call_expr, fndecl, type;
10998 type = build_function_type_list (integer_type_node, NULL_TREE);
10999 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11000 call_expr = build_call_expr (fndecl, 0);
11001 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11002 }
11003 case IX86_BUILTIN_CPU_IS:
11004 case IX86_BUILTIN_CPU_SUPPORTS:
11005 {
11006 tree arg0 = CALL_EXPR_ARG (exp, 0);
11007 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11008 gcc_assert (fold_expr != NULL_TREE);
11009 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11010 }
11011 }
11012
11013 HOST_WIDE_INT isa = ix86_isa_flags;
11014 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11015 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11016 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11017 /* The general case is that we require all the ISAs specified in bisa{,2}
11018 to be enabled.
11019 The exceptions are:
11020 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11021 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11022 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11023 where for each such pair it is sufficient that either of the two ISAs is
11024 enabled; any other options OR-ed into bisa must still all be enabled.
11025 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
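/* Worked example: a builtin whose descriptor requires
OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 is accepted when only -mfma4 is
enabled, because the matching test below copies the whole pair into isa;
any unrelated bit OR-ed into bisa still has to be enabled on its own. */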
11026 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11027 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11028 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11029 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11030 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11031 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11032 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11033 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11034 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11035 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11036 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11037 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11038 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
11039 {
11040 bisa &= ~OPTION_MASK_ISA_MMX;
11041 bisa |= OPTION_MASK_ISA_SSE2;
11042 }
11043 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11044 {
11045 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11046 if (TARGET_ABI_X32)
11047 bisa |= OPTION_MASK_ABI_X32;
11048 else
11049 bisa |= OPTION_MASK_ABI_64;
11050 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11051 (enum fpmath_unit) 0,
11052 (enum prefer_vector_width) 0,
11053 false, add_abi_p);
11054 if (!opts)
11055 error ("%qE needs unknown isa option", fndecl);
11056 else
11057 {
11058 gcc_assert (opts != NULL);
11059 error ("%qE needs isa option %s", fndecl, opts);
11060 free (opts);
11061 }
11062 return expand_call (exp, target, ignore);
11063 }
11064
11065 switch (fcode)
11066 {
11067 case IX86_BUILTIN_MASKMOVQ:
11068 case IX86_BUILTIN_MASKMOVDQU:
11069 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11070 ? CODE_FOR_mmx_maskmovq
11071 : CODE_FOR_sse2_maskmovdqu);
11072 /* Note the arg order is different from the operand order. */
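/* At the source level these builtins take (data, mask, pointer), while the
insn pattern puts the destination memory first, so ARG 2 becomes operand 0
and the data and mask shift down to operands 1 and 2. */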
11073 arg1 = CALL_EXPR_ARG (exp, 0);
11074 arg2 = CALL_EXPR_ARG (exp, 1);
11075 arg0 = CALL_EXPR_ARG (exp, 2);
11076 op0 = expand_normal (arg0);
11077 op1 = expand_normal (arg1);
11078 op2 = expand_normal (arg2);
11079 mode0 = insn_data[icode].operand[0].mode;
11080 mode1 = insn_data[icode].operand[1].mode;
11081 mode2 = insn_data[icode].operand[2].mode;
11082
11083 op0 = ix86_zero_extend_to_Pmode (op0);
11084 op0 = gen_rtx_MEM (mode1, op0);
11085
11086 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11087 op0 = copy_to_mode_reg (mode0, op0);
11088 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11089 op1 = copy_to_mode_reg (mode1, op1);
11090 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11091 op2 = copy_to_mode_reg (mode2, op2);
11092 pat = GEN_FCN (icode) (op0, op1, op2);
11093 if (! pat)
11094 return 0;
11095 emit_insn (pat);
11096 return 0;
11097
11098 case IX86_BUILTIN_LDMXCSR:
11099 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11100 target = assign_386_stack_local (SImode, SLOT_TEMP);
11101 emit_move_insn (target, op0);
11102 emit_insn (gen_sse_ldmxcsr (target));
11103 return 0;
11104
11105 case IX86_BUILTIN_STMXCSR:
11106 target = assign_386_stack_local (SImode, SLOT_TEMP);
11107 emit_insn (gen_sse_stmxcsr (target));
11108 return copy_to_mode_reg (SImode, target);
11109
11110 case IX86_BUILTIN_CLFLUSH:
11111 arg0 = CALL_EXPR_ARG (exp, 0);
11112 op0 = expand_normal (arg0);
11113 icode = CODE_FOR_sse2_clflush;
11114 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11115 op0 = ix86_zero_extend_to_Pmode (op0);
11116
11117 emit_insn (gen_sse2_clflush (op0));
11118 return 0;
11119
11120 case IX86_BUILTIN_CLWB:
11121 arg0 = CALL_EXPR_ARG (exp, 0);
11122 op0 = expand_normal (arg0);
11123 icode = CODE_FOR_clwb;
11124 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11125 op0 = ix86_zero_extend_to_Pmode (op0);
11126
11127 emit_insn (gen_clwb (op0));
11128 return 0;
11129
11130 case IX86_BUILTIN_CLFLUSHOPT:
11131 arg0 = CALL_EXPR_ARG (exp, 0);
11132 op0 = expand_normal (arg0);
11133 icode = CODE_FOR_clflushopt;
11134 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11135 op0 = ix86_zero_extend_to_Pmode (op0);
11136
11137 emit_insn (gen_clflushopt (op0));
11138 return 0;
11139
11140 case IX86_BUILTIN_MONITOR:
11141 case IX86_BUILTIN_MONITORX:
11142 arg0 = CALL_EXPR_ARG (exp, 0);
11143 arg1 = CALL_EXPR_ARG (exp, 1);
11144 arg2 = CALL_EXPR_ARG (exp, 2);
11145 op0 = expand_normal (arg0);
11146 op1 = expand_normal (arg1);
11147 op2 = expand_normal (arg2);
11148 if (!REG_P (op0))
11149 op0 = ix86_zero_extend_to_Pmode (op0);
11150 if (!REG_P (op1))
11151 op1 = copy_to_mode_reg (SImode, op1);
11152 if (!REG_P (op2))
11153 op2 = copy_to_mode_reg (SImode, op2);
11154
11155 emit_insn (fcode == IX86_BUILTIN_MONITOR
11156 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11157 : gen_monitorx (Pmode, op0, op1, op2));
11158 return 0;
11159
11160 case IX86_BUILTIN_MWAIT:
11161 arg0 = CALL_EXPR_ARG (exp, 0);
11162 arg1 = CALL_EXPR_ARG (exp, 1);
11163 op0 = expand_normal (arg0);
11164 op1 = expand_normal (arg1);
11165 if (!REG_P (op0))
11166 op0 = copy_to_mode_reg (SImode, op0);
11167 if (!REG_P (op1))
11168 op1 = copy_to_mode_reg (SImode, op1);
11169 emit_insn (gen_sse3_mwait (op0, op1));
11170 return 0;
11171
11172 case IX86_BUILTIN_MWAITX:
11173 arg0 = CALL_EXPR_ARG (exp, 0);
11174 arg1 = CALL_EXPR_ARG (exp, 1);
11175 arg2 = CALL_EXPR_ARG (exp, 2);
11176 op0 = expand_normal (arg0);
11177 op1 = expand_normal (arg1);
11178 op2 = expand_normal (arg2);
11179 if (!REG_P (op0))
11180 op0 = copy_to_mode_reg (SImode, op0);
11181 if (!REG_P (op1))
11182 op1 = copy_to_mode_reg (SImode, op1);
11183 if (!REG_P (op2))
11184 op2 = copy_to_mode_reg (SImode, op2);
11185 emit_insn (gen_mwaitx (op0, op1, op2));
11186 return 0;
11187
11188 case IX86_BUILTIN_UMONITOR:
11189 arg0 = CALL_EXPR_ARG (exp, 0);
11190 op0 = expand_normal (arg0);
11191
11192 op0 = ix86_zero_extend_to_Pmode (op0);
11193 emit_insn (gen_umonitor (Pmode, op0));
11194 return 0;
11195
11196 case IX86_BUILTIN_UMWAIT:
11197 case IX86_BUILTIN_TPAUSE:
11198 arg0 = CALL_EXPR_ARG (exp, 0);
11199 arg1 = CALL_EXPR_ARG (exp, 1);
11200 op0 = expand_normal (arg0);
11201 op1 = expand_normal (arg1);
11202
11203 if (!REG_P (op0))
11204 op0 = copy_to_mode_reg (SImode, op0);
11205
11206 op1 = force_reg (DImode, op1);
11207
11208 if (TARGET_64BIT)
11209 {
11210 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11211 NULL, 1, OPTAB_DIRECT);
11212 switch (fcode)
11213 {
11214 case IX86_BUILTIN_UMWAIT:
11215 icode = CODE_FOR_umwait_rex64;
11216 break;
11217 case IX86_BUILTIN_TPAUSE:
11218 icode = CODE_FOR_tpause_rex64;
11219 break;
11220 default:
11221 gcc_unreachable ();
11222 }
11223
11224 op2 = gen_lowpart (SImode, op2);
11225 op1 = gen_lowpart (SImode, op1);
11226 pat = GEN_FCN (icode) (op0, op1, op2);
11227 }
11228 else
11229 {
11230 switch (fcode)
11231 {
11232 case IX86_BUILTIN_UMWAIT:
11233 icode = CODE_FOR_umwait;
11234 break;
11235 case IX86_BUILTIN_TPAUSE:
11236 icode = CODE_FOR_tpause;
11237 break;
11238 default:
11239 gcc_unreachable ();
11240 }
11241 pat = GEN_FCN (icode) (op0, op1);
11242 }
11243
11244 if (!pat)
11245 return 0;
11246
11247 emit_insn (pat);
11248
11249 if (target == 0
11250 || !register_operand (target, QImode))
11251 target = gen_reg_rtx (QImode);
11252
11253 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11254 const0_rtx);
11255 emit_insn (gen_rtx_SET (target, pat));
11256
11257 return target;
11258
11259 case IX86_BUILTIN_CLZERO:
11260 arg0 = CALL_EXPR_ARG (exp, 0);
11261 op0 = expand_normal (arg0);
11262 if (!REG_P (op0))
11263 op0 = ix86_zero_extend_to_Pmode (op0);
11264 emit_insn (gen_clzero (Pmode, op0));
11265 return 0;
11266
11267 case IX86_BUILTIN_CLDEMOTE:
11268 arg0 = CALL_EXPR_ARG (exp, 0);
11269 op0 = expand_normal (arg0);
11270 icode = CODE_FOR_cldemote;
11271 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11272 op0 = ix86_zero_extend_to_Pmode (op0);
11273
11274 emit_insn (gen_cldemote (op0));
11275 return 0;
11276
11277 case IX86_BUILTIN_VEC_INIT_V2SI:
11278 case IX86_BUILTIN_VEC_INIT_V4HI:
11279 case IX86_BUILTIN_VEC_INIT_V8QI:
11280 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11281
11282 case IX86_BUILTIN_VEC_EXT_V2DF:
11283 case IX86_BUILTIN_VEC_EXT_V2DI:
11284 case IX86_BUILTIN_VEC_EXT_V4SF:
11285 case IX86_BUILTIN_VEC_EXT_V4SI:
11286 case IX86_BUILTIN_VEC_EXT_V8HI:
11287 case IX86_BUILTIN_VEC_EXT_V2SI:
11288 case IX86_BUILTIN_VEC_EXT_V4HI:
11289 case IX86_BUILTIN_VEC_EXT_V16QI:
11290 return ix86_expand_vec_ext_builtin (exp, target);
11291
11292 case IX86_BUILTIN_VEC_SET_V2DI:
11293 case IX86_BUILTIN_VEC_SET_V4SF:
11294 case IX86_BUILTIN_VEC_SET_V4SI:
11295 case IX86_BUILTIN_VEC_SET_V8HI:
11296 case IX86_BUILTIN_VEC_SET_V4HI:
11297 case IX86_BUILTIN_VEC_SET_V16QI:
11298 return ix86_expand_vec_set_builtin (exp);
11299
11300 case IX86_BUILTIN_NANQ:
11301 case IX86_BUILTIN_NANSQ:
11302 return expand_call (exp, target, ignore);
11303
11304 case IX86_BUILTIN_RDPID:
11305
11306 op0 = gen_reg_rtx (word_mode);
11307
11308 if (TARGET_64BIT)
11309 {
11310 insn = gen_rdpid_rex64 (op0);
11311 op0 = convert_to_mode (SImode, op0, 1);
11312 }
11313 else
11314 insn = gen_rdpid (op0);
11315
11316 emit_insn (insn);
11317
11318 if (target == 0
11319 || !register_operand (target, SImode))
11320 target = gen_reg_rtx (SImode);
11321
11322 emit_move_insn (target, op0);
11323 return target;
11324
11325 case IX86_BUILTIN_2INTERSECTD512:
11326 case IX86_BUILTIN_2INTERSECTQ512:
11327 case IX86_BUILTIN_2INTERSECTD256:
11328 case IX86_BUILTIN_2INTERSECTQ256:
11329 case IX86_BUILTIN_2INTERSECTD128:
11330 case IX86_BUILTIN_2INTERSECTQ128:
11331 arg0 = CALL_EXPR_ARG (exp, 0);
11332 arg1 = CALL_EXPR_ARG (exp, 1);
11333 arg2 = CALL_EXPR_ARG (exp, 2);
11334 arg3 = CALL_EXPR_ARG (exp, 3);
11335 op0 = expand_normal (arg0);
11336 op1 = expand_normal (arg1);
11337 op2 = expand_normal (arg2);
11338 op3 = expand_normal (arg3);
11339
11340 if (!address_operand (op0, VOIDmode))
11341 {
11342 op0 = convert_memory_address (Pmode, op0);
11343 op0 = copy_addr_to_reg (op0);
11344 }
11345 if (!address_operand (op1, VOIDmode))
11346 {
11347 op1 = convert_memory_address (Pmode, op1);
11348 op1 = copy_addr_to_reg (op1);
11349 }
11350
11351 switch (fcode)
11352 {
11353 case IX86_BUILTIN_2INTERSECTD512:
11354 mode4 = P2HImode;
11355 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11356 break;
11357 case IX86_BUILTIN_2INTERSECTQ512:
11358 mode4 = P2QImode;
11359 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11360 break;
11361 case IX86_BUILTIN_2INTERSECTD256:
11362 mode4 = P2QImode;
11363 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11364 break;
11365 case IX86_BUILTIN_2INTERSECTQ256:
11366 mode4 = P2QImode;
11367 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11368 break;
11369 case IX86_BUILTIN_2INTERSECTD128:
11370 mode4 = P2QImode;
11371 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11372 break;
11373 case IX86_BUILTIN_2INTERSECTQ128:
11374 mode4 = P2QImode;
11375 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11376 break;
11377 default:
11378 gcc_unreachable ();
11379 }
11380
11381 mode2 = insn_data[icode].operand[1].mode;
11382 mode3 = insn_data[icode].operand[2].mode;
11383 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11384 op2 = copy_to_mode_reg (mode2, op2);
11385 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11386 op3 = copy_to_mode_reg (mode3, op3);
11387
11388 op4 = gen_reg_rtx (mode4);
11389 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11390 mode0 = mode4 == P2HImode ? HImode : QImode;
11391 emit_move_insn (gen_rtx_MEM (mode0, op0),
11392 gen_lowpart (mode0, op4));
11393 emit_move_insn (gen_rtx_MEM (mode0, op1),
11394 gen_highpart (mode0, op4));
11395
11396 return 0;
11397
11398 case IX86_BUILTIN_RDPMC:
11399 case IX86_BUILTIN_RDTSC:
11400 case IX86_BUILTIN_RDTSCP:
11401 case IX86_BUILTIN_XGETBV:
11402
11403 op0 = gen_reg_rtx (DImode);
11404 op1 = gen_reg_rtx (DImode);
11405
11406 if (fcode == IX86_BUILTIN_RDPMC)
11407 {
11408 arg0 = CALL_EXPR_ARG (exp, 0);
11409 op2 = expand_normal (arg0);
11410 if (!register_operand (op2, SImode))
11411 op2 = copy_to_mode_reg (SImode, op2);
11412
11413 insn = (TARGET_64BIT
11414 ? gen_rdpmc_rex64 (op0, op1, op2)
11415 : gen_rdpmc (op0, op2));
11416 emit_insn (insn);
11417 }
11418 else if (fcode == IX86_BUILTIN_XGETBV)
11419 {
11420 arg0 = CALL_EXPR_ARG (exp, 0);
11421 op2 = expand_normal (arg0);
11422 if (!register_operand (op2, SImode))
11423 op2 = copy_to_mode_reg (SImode, op2);
11424
11425 insn = (TARGET_64BIT
11426 ? gen_xgetbv_rex64 (op0, op1, op2)
11427 : gen_xgetbv (op0, op2));
11428 emit_insn (insn);
11429 }
11430 else if (fcode == IX86_BUILTIN_RDTSC)
11431 {
11432 insn = (TARGET_64BIT
11433 ? gen_rdtsc_rex64 (op0, op1)
11434 : gen_rdtsc (op0));
11435 emit_insn (insn);
11436 }
11437 else
11438 {
11439 op2 = gen_reg_rtx (SImode);
11440
11441 insn = (TARGET_64BIT
11442 ? gen_rdtscp_rex64 (op0, op1, op2)
11443 : gen_rdtscp (op0, op2));
11444 emit_insn (insn);
11445
11446 arg0 = CALL_EXPR_ARG (exp, 0);
11447 op4 = expand_normal (arg0);
11448 if (!address_operand (op4, VOIDmode))
11449 {
11450 op4 = convert_memory_address (Pmode, op4);
11451 op4 = copy_addr_to_reg (op4);
11452 }
11453 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11454 }
11455
11456 if (target == 0
11457 || !register_operand (target, DImode))
11458 target = gen_reg_rtx (DImode);
11459
11460 if (TARGET_64BIT)
11461 {
11462 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11463 op1, 1, OPTAB_DIRECT);
11464 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11465 op0, 1, OPTAB_DIRECT);
11466 }
11467
11468 emit_move_insn (target, op0);
11469 return target;
11470
11471 case IX86_BUILTIN_ENQCMD:
11472 case IX86_BUILTIN_ENQCMDS:
11473 case IX86_BUILTIN_MOVDIR64B:
11474
11475 arg0 = CALL_EXPR_ARG (exp, 0);
11476 arg1 = CALL_EXPR_ARG (exp, 1);
11477 op0 = expand_normal (arg0);
11478 op1 = expand_normal (arg1);
11479
11480 op0 = ix86_zero_extend_to_Pmode (op0);
11481 if (!address_operand (op1, VOIDmode))
11482 {
11483 op1 = convert_memory_address (Pmode, op1);
11484 op1 = copy_addr_to_reg (op1);
11485 }
11486 op1 = gen_rtx_MEM (XImode, op1);
11487
11488 if (fcode == IX86_BUILTIN_MOVDIR64B)
11489 {
11490 emit_insn (gen_movdir64b (Pmode, op0, op1));
11491 return 0;
11492 }
11493 else
11494 {
11495 rtx pat;
11496
11497 target = gen_reg_rtx (SImode);
11498 emit_move_insn (target, const0_rtx);
11499 target = gen_rtx_SUBREG (QImode, target, 0);
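/* The SImode register was zeroed above, so writing the comparison result
into its low byte through STRICT_LOW_PART below leaves the upper bits
zero and SUBREG_REG (target) can be returned as the zero-extended
status. */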
11500
11501 if (fcode == IX86_BUILTIN_ENQCMD)
11502 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11503 else
11504 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11505
11506 emit_insn (pat);
11507
11508 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11509 gen_rtx_fmt_ee (EQ, QImode,
11510 SET_DEST (pat),
11511 const0_rtx)));
11512
11513 return SUBREG_REG (target);
11514 }
11515
11516 case IX86_BUILTIN_FXSAVE:
11517 case IX86_BUILTIN_FXRSTOR:
11518 case IX86_BUILTIN_FXSAVE64:
11519 case IX86_BUILTIN_FXRSTOR64:
11520 case IX86_BUILTIN_FNSTENV:
11521 case IX86_BUILTIN_FLDENV:
11522 mode0 = BLKmode;
11523 switch (fcode)
11524 {
11525 case IX86_BUILTIN_FXSAVE:
11526 icode = CODE_FOR_fxsave;
11527 break;
11528 case IX86_BUILTIN_FXRSTOR:
11529 icode = CODE_FOR_fxrstor;
11530 break;
11531 case IX86_BUILTIN_FXSAVE64:
11532 icode = CODE_FOR_fxsave64;
11533 break;
11534 case IX86_BUILTIN_FXRSTOR64:
11535 icode = CODE_FOR_fxrstor64;
11536 break;
11537 case IX86_BUILTIN_FNSTENV:
11538 icode = CODE_FOR_fnstenv;
11539 break;
11540 case IX86_BUILTIN_FLDENV:
11541 icode = CODE_FOR_fldenv;
11542 break;
11543 default:
11544 gcc_unreachable ();
11545 }
11546
11547 arg0 = CALL_EXPR_ARG (exp, 0);
11548 op0 = expand_normal (arg0);
11549
11550 if (!address_operand (op0, VOIDmode))
11551 {
11552 op0 = convert_memory_address (Pmode, op0);
11553 op0 = copy_addr_to_reg (op0);
11554 }
11555 op0 = gen_rtx_MEM (mode0, op0);
11556
11557 pat = GEN_FCN (icode) (op0);
11558 if (pat)
11559 emit_insn (pat);
11560 return 0;
11561
11562 case IX86_BUILTIN_XSETBV:
11563 arg0 = CALL_EXPR_ARG (exp, 0);
11564 arg1 = CALL_EXPR_ARG (exp, 1);
11565 op0 = expand_normal (arg0);
11566 op1 = expand_normal (arg1);
11567
11568 if (!REG_P (op0))
11569 op0 = copy_to_mode_reg (SImode, op0);
11570
11571 op1 = force_reg (DImode, op1);
11572
11573 if (TARGET_64BIT)
11574 {
11575 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11576 NULL, 1, OPTAB_DIRECT);
11577
11578 icode = CODE_FOR_xsetbv_rex64;
11579
11580 op2 = gen_lowpart (SImode, op2);
11581 op1 = gen_lowpart (SImode, op1);
11582 pat = GEN_FCN (icode) (op0, op1, op2);
11583 }
11584 else
11585 {
11586 icode = CODE_FOR_xsetbv;
11587
11588 pat = GEN_FCN (icode) (op0, op1);
11589 }
11590 if (pat)
11591 emit_insn (pat);
11592 return 0;
11593
11594 case IX86_BUILTIN_XSAVE:
11595 case IX86_BUILTIN_XRSTOR:
11596 case IX86_BUILTIN_XSAVE64:
11597 case IX86_BUILTIN_XRSTOR64:
11598 case IX86_BUILTIN_XSAVEOPT:
11599 case IX86_BUILTIN_XSAVEOPT64:
11600 case IX86_BUILTIN_XSAVES:
11601 case IX86_BUILTIN_XRSTORS:
11602 case IX86_BUILTIN_XSAVES64:
11603 case IX86_BUILTIN_XRSTORS64:
11604 case IX86_BUILTIN_XSAVEC:
11605 case IX86_BUILTIN_XSAVEC64:
11606 arg0 = CALL_EXPR_ARG (exp, 0);
11607 arg1 = CALL_EXPR_ARG (exp, 1);
11608 op0 = expand_normal (arg0);
11609 op1 = expand_normal (arg1);
11610
11611 if (!address_operand (op0, VOIDmode))
11612 {
11613 op0 = convert_memory_address (Pmode, op0);
11614 op0 = copy_addr_to_reg (op0);
11615 }
11616 op0 = gen_rtx_MEM (BLKmode, op0);
11617
11618 op1 = force_reg (DImode, op1);
11619
11620 if (TARGET_64BIT)
11621 {
11622 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11623 NULL, 1, OPTAB_DIRECT);
11624 switch (fcode)
11625 {
11626 case IX86_BUILTIN_XSAVE:
11627 icode = CODE_FOR_xsave_rex64;
11628 break;
11629 case IX86_BUILTIN_XRSTOR:
11630 icode = CODE_FOR_xrstor_rex64;
11631 break;
11632 case IX86_BUILTIN_XSAVE64:
11633 icode = CODE_FOR_xsave64;
11634 break;
11635 case IX86_BUILTIN_XRSTOR64:
11636 icode = CODE_FOR_xrstor64;
11637 break;
11638 case IX86_BUILTIN_XSAVEOPT:
11639 icode = CODE_FOR_xsaveopt_rex64;
11640 break;
11641 case IX86_BUILTIN_XSAVEOPT64:
11642 icode = CODE_FOR_xsaveopt64;
11643 break;
11644 case IX86_BUILTIN_XSAVES:
11645 icode = CODE_FOR_xsaves_rex64;
11646 break;
11647 case IX86_BUILTIN_XRSTORS:
11648 icode = CODE_FOR_xrstors_rex64;
11649 break;
11650 case IX86_BUILTIN_XSAVES64:
11651 icode = CODE_FOR_xsaves64;
11652 break;
11653 case IX86_BUILTIN_XRSTORS64:
11654 icode = CODE_FOR_xrstors64;
11655 break;
11656 case IX86_BUILTIN_XSAVEC:
11657 icode = CODE_FOR_xsavec_rex64;
11658 break;
11659 case IX86_BUILTIN_XSAVEC64:
11660 icode = CODE_FOR_xsavec64;
11661 break;
11662 default:
11663 gcc_unreachable ();
11664 }
11665
11666 op2 = gen_lowpart (SImode, op2);
11667 op1 = gen_lowpart (SImode, op1);
11668 pat = GEN_FCN (icode) (op0, op1, op2);
11669 }
11670 else
11671 {
11672 switch (fcode)
11673 {
11674 case IX86_BUILTIN_XSAVE:
11675 icode = CODE_FOR_xsave;
11676 break;
11677 case IX86_BUILTIN_XRSTOR:
11678 icode = CODE_FOR_xrstor;
11679 break;
11680 case IX86_BUILTIN_XSAVEOPT:
11681 icode = CODE_FOR_xsaveopt;
11682 break;
11683 case IX86_BUILTIN_XSAVES:
11684 icode = CODE_FOR_xsaves;
11685 break;
11686 case IX86_BUILTIN_XRSTORS:
11687 icode = CODE_FOR_xrstors;
11688 break;
11689 case IX86_BUILTIN_XSAVEC:
11690 icode = CODE_FOR_xsavec;
11691 break;
11692 default:
11693 gcc_unreachable ();
11694 }
11695 pat = GEN_FCN (icode) (op0, op1);
11696 }
11697
11698 if (pat)
11699 emit_insn (pat);
11700 return 0;
11701
11702 case IX86_BUILTIN_LLWPCB:
11703 arg0 = CALL_EXPR_ARG (exp, 0);
11704 op0 = expand_normal (arg0);
11705 icode = CODE_FOR_lwp_llwpcb;
11706 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11707 op0 = ix86_zero_extend_to_Pmode (op0);
11708 emit_insn (gen_lwp_llwpcb (op0));
11709 return 0;
11710
11711 case IX86_BUILTIN_SLWPCB:
11712 icode = CODE_FOR_lwp_slwpcb;
11713 if (!target
11714 || !insn_data[icode].operand[0].predicate (target, Pmode))
11715 target = gen_reg_rtx (Pmode);
11716 emit_insn (gen_lwp_slwpcb (target));
11717 return target;
11718
11719 case IX86_BUILTIN_BEXTRI32:
11720 case IX86_BUILTIN_BEXTRI64:
11721 arg0 = CALL_EXPR_ARG (exp, 0);
11722 arg1 = CALL_EXPR_ARG (exp, 1);
11723 op0 = expand_normal (arg0);
11724 op1 = expand_normal (arg1);
11725 icode = (fcode == IX86_BUILTIN_BEXTRI32
11726 ? CODE_FOR_tbm_bextri_si
11727 : CODE_FOR_tbm_bextri_di);
11728 if (!CONST_INT_P (op1))
11729 {
11730 error ("last argument must be an immediate");
11731 return const0_rtx;
11732 }
11733 else
11734 {
11735 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11736 unsigned char lsb_index = INTVAL (op1) & 0xFF;
11737 op1 = GEN_INT (length);
11738 op2 = GEN_INT (lsb_index);
11739
11740 mode1 = insn_data[icode].operand[1].mode;
11741 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11742 op0 = copy_to_mode_reg (mode1, op0);
11743
11744 mode0 = insn_data[icode].operand[0].mode;
11745 if (target == 0
11746 || !register_operand (target, mode0))
11747 target = gen_reg_rtx (mode0);
11748
11749 pat = GEN_FCN (icode) (target, op0, op1, op2);
11750 if (pat)
11751 emit_insn (pat);
11752 return target;
11753 }
11754
11755 case IX86_BUILTIN_RDRAND16_STEP:
11756 icode = CODE_FOR_rdrandhi_1;
11757 mode0 = HImode;
11758 goto rdrand_step;
11759
11760 case IX86_BUILTIN_RDRAND32_STEP:
11761 icode = CODE_FOR_rdrandsi_1;
11762 mode0 = SImode;
11763 goto rdrand_step;
11764
11765 case IX86_BUILTIN_RDRAND64_STEP:
11766 icode = CODE_FOR_rdranddi_1;
11767 mode0 = DImode;
11768
11769 rdrand_step:
11770 arg0 = CALL_EXPR_ARG (exp, 0);
11771 op1 = expand_normal (arg0);
11772 if (!address_operand (op1, VOIDmode))
11773 {
11774 op1 = convert_memory_address (Pmode, op1);
11775 op1 = copy_addr_to_reg (op1);
11776 }
11777
11778 op0 = gen_reg_rtx (mode0);
11779 emit_insn (GEN_FCN (icode) (op0));
11780
11781 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11782
11783 op1 = gen_reg_rtx (SImode);
11784 emit_move_insn (op1, CONST1_RTX (SImode));
11785
11786 /* Emit SImode conditional move. */
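/* RDRAND is documented to clear its destination when no random value is
available, so the IF_THEN_ELSE emitted below yields zero (OP2) in the
carry-clear arm and the constant one in OP1 in the carry-set arm, i.e. the
usual 0/1 step result. */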
11787 if (mode0 == HImode)
11788 {
11789 if (TARGET_ZERO_EXTEND_WITH_AND
11790 && optimize_function_for_speed_p (cfun))
11791 {
11792 op2 = force_reg (SImode, const0_rtx);
11793
11794 emit_insn (gen_movstricthi
11795 (gen_lowpart (HImode, op2), op0));
11796 }
11797 else
11798 {
11799 op2 = gen_reg_rtx (SImode);
11800
11801 emit_insn (gen_zero_extendhisi2 (op2, op0));
11802 }
11803 }
11804 else if (mode0 == SImode)
11805 op2 = op0;
11806 else
11807 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11808
11809 if (target == 0
11810 || !register_operand (target, SImode))
11811 target = gen_reg_rtx (SImode);
11812
11813 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11814 const0_rtx);
11815 emit_insn (gen_rtx_SET (target,
11816 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11817 return target;
11818
11819 case IX86_BUILTIN_RDSEED16_STEP:
11820 icode = CODE_FOR_rdseedhi_1;
11821 mode0 = HImode;
11822 goto rdseed_step;
11823
11824 case IX86_BUILTIN_RDSEED32_STEP:
11825 icode = CODE_FOR_rdseedsi_1;
11826 mode0 = SImode;
11827 goto rdseed_step;
11828
11829 case IX86_BUILTIN_RDSEED64_STEP:
11830 icode = CODE_FOR_rdseeddi_1;
11831 mode0 = DImode;
11832
11833 rdseed_step:
11834 arg0 = CALL_EXPR_ARG (exp, 0);
11835 op1 = expand_normal (arg0);
11836 if (!address_operand (op1, VOIDmode))
11837 {
11838 op1 = convert_memory_address (Pmode, op1);
11839 op1 = copy_addr_to_reg (op1);
11840 }
11841
11842 op0 = gen_reg_rtx (mode0);
11843 emit_insn (GEN_FCN (icode) (op0));
11844
11845 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11846
11847 op2 = gen_reg_rtx (QImode);
11848
11849 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11850 const0_rtx);
11851 emit_insn (gen_rtx_SET (op2, pat));
11852
11853 if (target == 0
11854 || !register_operand (target, SImode))
11855 target = gen_reg_rtx (SImode);
11856
11857 emit_insn (gen_zero_extendqisi2 (target, op2));
11858 return target;
11859
11860 case IX86_BUILTIN_SBB32:
11861 icode = CODE_FOR_subborrowsi;
11862 icode2 = CODE_FOR_subborrowsi_0;
11863 mode0 = SImode;
11864 mode1 = DImode;
11865 mode2 = CCmode;
11866 goto handlecarry;
11867
11868 case IX86_BUILTIN_SBB64:
11869 icode = CODE_FOR_subborrowdi;
11870 icode2 = CODE_FOR_subborrowdi_0;
11871 mode0 = DImode;
11872 mode1 = TImode;
11873 mode2 = CCmode;
11874 goto handlecarry;
11875
11876 case IX86_BUILTIN_ADDCARRYX32:
11877 icode = CODE_FOR_addcarrysi;
11878 icode2 = CODE_FOR_addcarrysi_0;
11879 mode0 = SImode;
11880 mode1 = DImode;
11881 mode2 = CCCmode;
11882 goto handlecarry;
11883
11884 case IX86_BUILTIN_ADDCARRYX64:
11885 icode = CODE_FOR_addcarrydi;
11886 icode2 = CODE_FOR_addcarrydi_0;
11887 mode0 = DImode;
11888 mode1 = TImode;
11889 mode2 = CCCmode;
11890
11891 handlecarry:
11892 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11893 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11894 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11895 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
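/* Rough user-level sketch, assuming the _addcarry_u32 wrapper that
adxintrin.h is expected to provide on top of this builtin:

unsigned int sum;
unsigned char c_out = _addcarry_u32 (0, a, b, &sum);

A literal zero carry-in like this can take the integer_zerop fast path
below and expand to a plain add that sets the carry flag directly. */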
11896
11897 op1 = expand_normal (arg0);
11898 if (!integer_zerop (arg0))
11899 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11900
11901 op2 = expand_normal (arg1);
11902 if (!register_operand (op2, mode0))
11903 op2 = copy_to_mode_reg (mode0, op2);
11904
11905 op3 = expand_normal (arg2);
11906 if (!register_operand (op3, mode0))
11907 op3 = copy_to_mode_reg (mode0, op3);
11908
11909 op4 = expand_normal (arg3);
11910 if (!address_operand (op4, VOIDmode))
11911 {
11912 op4 = convert_memory_address (Pmode, op4);
11913 op4 = copy_addr_to_reg (op4);
11914 }
11915
11916 op0 = gen_reg_rtx (mode0);
11917 if (integer_zerop (arg0))
11918 {
11919 /* If arg0 is 0, optimize right away into an add or sub
11920 instruction that sets the CCCmode flags. */
11921 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11922 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11923 }
11924 else
11925 {
11926 /* Generate CF from input operand. */
11927 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
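/* Adding 0xff (constm1_rtx in QImode) to the carry-in byte produces a
carry out exactly when that byte is nonzero, which is what the LTU tests
below consume. */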
11928
11929 /* Generate instruction that consumes CF. */
11930 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11931 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11932 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11933 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11934 }
11935
11936 /* Return current CF value. */
11937 if (target == 0)
11938 target = gen_reg_rtx (QImode);
11939
11940 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11941 emit_insn (gen_rtx_SET (target, pat));
11942
11943 /* Store the result. */
11944 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11945
11946 return target;
11947
11948 case IX86_BUILTIN_READ_FLAGS:
11949 if (ignore)
11950 return const0_rtx;
11951
11952 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11953
11954 if (optimize
11955 || target == NULL_RTX
11956 || !nonimmediate_operand (target, word_mode)
11957 || GET_MODE (target) != word_mode)
11958 target = gen_reg_rtx (word_mode);
11959
11960 emit_insn (gen_pop (target));
11961 return target;
11962
11963 case IX86_BUILTIN_WRITE_FLAGS:
11964
11965 arg0 = CALL_EXPR_ARG (exp, 0);
11966 op0 = expand_normal (arg0);
11967 if (!general_no_elim_operand (op0, word_mode))
11968 op0 = copy_to_mode_reg (word_mode, op0);
11969
11970 emit_insn (gen_push (op0));
11971 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11972 return 0;
11973
11974 case IX86_BUILTIN_KTESTC8:
11975 icode = CODE_FOR_ktestqi;
11976 mode3 = CCCmode;
11977 goto kortest;
11978
11979 case IX86_BUILTIN_KTESTZ8:
11980 icode = CODE_FOR_ktestqi;
11981 mode3 = CCZmode;
11982 goto kortest;
11983
11984 case IX86_BUILTIN_KTESTC16:
11985 icode = CODE_FOR_ktesthi;
11986 mode3 = CCCmode;
11987 goto kortest;
11988
11989 case IX86_BUILTIN_KTESTZ16:
11990 icode = CODE_FOR_ktesthi;
11991 mode3 = CCZmode;
11992 goto kortest;
11993
11994 case IX86_BUILTIN_KTESTC32:
11995 icode = CODE_FOR_ktestsi;
11996 mode3 = CCCmode;
11997 goto kortest;
11998
11999 case IX86_BUILTIN_KTESTZ32:
12000 icode = CODE_FOR_ktestsi;
12001 mode3 = CCZmode;
12002 goto kortest;
12003
12004 case IX86_BUILTIN_KTESTC64:
12005 icode = CODE_FOR_ktestdi;
12006 mode3 = CCCmode;
12007 goto kortest;
12008
12009 case IX86_BUILTIN_KTESTZ64:
12010 icode = CODE_FOR_ktestdi;
12011 mode3 = CCZmode;
12012 goto kortest;
12013
12014 case IX86_BUILTIN_KORTESTC8:
12015 icode = CODE_FOR_kortestqi;
12016 mode3 = CCCmode;
12017 goto kortest;
12018
12019 case IX86_BUILTIN_KORTESTZ8:
12020 icode = CODE_FOR_kortestqi;
12021 mode3 = CCZmode;
12022 goto kortest;
12023
12024 case IX86_BUILTIN_KORTESTC16:
12025 icode = CODE_FOR_kortesthi;
12026 mode3 = CCCmode;
12027 goto kortest;
12028
12029 case IX86_BUILTIN_KORTESTZ16:
12030 icode = CODE_FOR_kortesthi;
12031 mode3 = CCZmode;
12032 goto kortest;
12033
12034 case IX86_BUILTIN_KORTESTC32:
12035 icode = CODE_FOR_kortestsi;
12036 mode3 = CCCmode;
12037 goto kortest;
12038
12039 case IX86_BUILTIN_KORTESTZ32:
12040 icode = CODE_FOR_kortestsi;
12041 mode3 = CCZmode;
12042 goto kortest;
12043
12044 case IX86_BUILTIN_KORTESTC64:
12045 icode = CODE_FOR_kortestdi;
12046 mode3 = CCCmode;
12047 goto kortest;
12048
12049 case IX86_BUILTIN_KORTESTZ64:
12050 icode = CODE_FOR_kortestdi;
12051 mode3 = CCZmode;
12052
12053 kortest:
12054 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12055 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12056 op0 = expand_normal (arg0);
12057 op1 = expand_normal (arg1);
12058
12059 mode0 = insn_data[icode].operand[0].mode;
12060 mode1 = insn_data[icode].operand[1].mode;
12061
12062 if (GET_MODE (op0) != VOIDmode)
12063 op0 = force_reg (GET_MODE (op0), op0);
12064
12065 op0 = gen_lowpart (mode0, op0);
12066
12067 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12068 op0 = copy_to_mode_reg (mode0, op0);
12069
12070 if (GET_MODE (op1) != VOIDmode)
12071 op1 = force_reg (GET_MODE (op1), op1);
12072
12073 op1 = gen_lowpart (mode1, op1);
12074
12075 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12076 op1 = copy_to_mode_reg (mode1, op1);
12077
12078 target = gen_reg_rtx (QImode);
12079
12080 /* Emit kortest. */
12081 emit_insn (GEN_FCN (icode) (op0, op1));
12082 /* And use setcc to return result from flags. */
12083 ix86_expand_setcc (target, EQ,
12084 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12085 return target;
12086
12087 case IX86_BUILTIN_GATHERSIV2DF:
12088 icode = CODE_FOR_avx2_gathersiv2df;
12089 goto gather_gen;
12090 case IX86_BUILTIN_GATHERSIV4DF:
12091 icode = CODE_FOR_avx2_gathersiv4df;
12092 goto gather_gen;
12093 case IX86_BUILTIN_GATHERDIV2DF:
12094 icode = CODE_FOR_avx2_gatherdiv2df;
12095 goto gather_gen;
12096 case IX86_BUILTIN_GATHERDIV4DF:
12097 icode = CODE_FOR_avx2_gatherdiv4df;
12098 goto gather_gen;
12099 case IX86_BUILTIN_GATHERSIV4SF:
12100 icode = CODE_FOR_avx2_gathersiv4sf;
12101 goto gather_gen;
12102 case IX86_BUILTIN_GATHERSIV8SF:
12103 icode = CODE_FOR_avx2_gathersiv8sf;
12104 goto gather_gen;
12105 case IX86_BUILTIN_GATHERDIV4SF:
12106 icode = CODE_FOR_avx2_gatherdiv4sf;
12107 goto gather_gen;
12108 case IX86_BUILTIN_GATHERDIV8SF:
12109 icode = CODE_FOR_avx2_gatherdiv8sf;
12110 goto gather_gen;
12111 case IX86_BUILTIN_GATHERSIV2DI:
12112 icode = CODE_FOR_avx2_gathersiv2di;
12113 goto gather_gen;
12114 case IX86_BUILTIN_GATHERSIV4DI:
12115 icode = CODE_FOR_avx2_gathersiv4di;
12116 goto gather_gen;
12117 case IX86_BUILTIN_GATHERDIV2DI:
12118 icode = CODE_FOR_avx2_gatherdiv2di;
12119 goto gather_gen;
12120 case IX86_BUILTIN_GATHERDIV4DI:
12121 icode = CODE_FOR_avx2_gatherdiv4di;
12122 goto gather_gen;
12123 case IX86_BUILTIN_GATHERSIV4SI:
12124 icode = CODE_FOR_avx2_gathersiv4si;
12125 goto gather_gen;
12126 case IX86_BUILTIN_GATHERSIV8SI:
12127 icode = CODE_FOR_avx2_gathersiv8si;
12128 goto gather_gen;
12129 case IX86_BUILTIN_GATHERDIV4SI:
12130 icode = CODE_FOR_avx2_gatherdiv4si;
12131 goto gather_gen;
12132 case IX86_BUILTIN_GATHERDIV8SI:
12133 icode = CODE_FOR_avx2_gatherdiv8si;
12134 goto gather_gen;
12135 case IX86_BUILTIN_GATHERALTSIV4DF:
12136 icode = CODE_FOR_avx2_gathersiv4df;
12137 goto gather_gen;
12138 case IX86_BUILTIN_GATHERALTDIV8SF:
12139 icode = CODE_FOR_avx2_gatherdiv8sf;
12140 goto gather_gen;
12141 case IX86_BUILTIN_GATHERALTSIV4DI:
12142 icode = CODE_FOR_avx2_gathersiv4di;
12143 goto gather_gen;
12144 case IX86_BUILTIN_GATHERALTDIV8SI:
12145 icode = CODE_FOR_avx2_gatherdiv8si;
12146 goto gather_gen;
12147 case IX86_BUILTIN_GATHER3SIV16SF:
12148 icode = CODE_FOR_avx512f_gathersiv16sf;
12149 goto gather_gen;
12150 case IX86_BUILTIN_GATHER3SIV8DF:
12151 icode = CODE_FOR_avx512f_gathersiv8df;
12152 goto gather_gen;
12153 case IX86_BUILTIN_GATHER3DIV16SF:
12154 icode = CODE_FOR_avx512f_gatherdiv16sf;
12155 goto gather_gen;
12156 case IX86_BUILTIN_GATHER3DIV8DF:
12157 icode = CODE_FOR_avx512f_gatherdiv8df;
12158 goto gather_gen;
12159 case IX86_BUILTIN_GATHER3SIV16SI:
12160 icode = CODE_FOR_avx512f_gathersiv16si;
12161 goto gather_gen;
12162 case IX86_BUILTIN_GATHER3SIV8DI:
12163 icode = CODE_FOR_avx512f_gathersiv8di;
12164 goto gather_gen;
12165 case IX86_BUILTIN_GATHER3DIV16SI:
12166 icode = CODE_FOR_avx512f_gatherdiv16si;
12167 goto gather_gen;
12168 case IX86_BUILTIN_GATHER3DIV8DI:
12169 icode = CODE_FOR_avx512f_gatherdiv8di;
12170 goto gather_gen;
12171 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12172 icode = CODE_FOR_avx512f_gathersiv8df;
12173 goto gather_gen;
12174 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12175 icode = CODE_FOR_avx512f_gatherdiv16sf;
12176 goto gather_gen;
12177 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12178 icode = CODE_FOR_avx512f_gathersiv8di;
12179 goto gather_gen;
12180 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12181 icode = CODE_FOR_avx512f_gatherdiv16si;
12182 goto gather_gen;
12183 case IX86_BUILTIN_GATHER3SIV2DF:
12184 icode = CODE_FOR_avx512vl_gathersiv2df;
12185 goto gather_gen;
12186 case IX86_BUILTIN_GATHER3SIV4DF:
12187 icode = CODE_FOR_avx512vl_gathersiv4df;
12188 goto gather_gen;
12189 case IX86_BUILTIN_GATHER3DIV2DF:
12190 icode = CODE_FOR_avx512vl_gatherdiv2df;
12191 goto gather_gen;
12192 case IX86_BUILTIN_GATHER3DIV4DF:
12193 icode = CODE_FOR_avx512vl_gatherdiv4df;
12194 goto gather_gen;
12195 case IX86_BUILTIN_GATHER3SIV4SF:
12196 icode = CODE_FOR_avx512vl_gathersiv4sf;
12197 goto gather_gen;
12198 case IX86_BUILTIN_GATHER3SIV8SF:
12199 icode = CODE_FOR_avx512vl_gathersiv8sf;
12200 goto gather_gen;
12201 case IX86_BUILTIN_GATHER3DIV4SF:
12202 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12203 goto gather_gen;
12204 case IX86_BUILTIN_GATHER3DIV8SF:
12205 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12206 goto gather_gen;
12207 case IX86_BUILTIN_GATHER3SIV2DI:
12208 icode = CODE_FOR_avx512vl_gathersiv2di;
12209 goto gather_gen;
12210 case IX86_BUILTIN_GATHER3SIV4DI:
12211 icode = CODE_FOR_avx512vl_gathersiv4di;
12212 goto gather_gen;
12213 case IX86_BUILTIN_GATHER3DIV2DI:
12214 icode = CODE_FOR_avx512vl_gatherdiv2di;
12215 goto gather_gen;
12216 case IX86_BUILTIN_GATHER3DIV4DI:
12217 icode = CODE_FOR_avx512vl_gatherdiv4di;
12218 goto gather_gen;
12219 case IX86_BUILTIN_GATHER3SIV4SI:
12220 icode = CODE_FOR_avx512vl_gathersiv4si;
12221 goto gather_gen;
12222 case IX86_BUILTIN_GATHER3SIV8SI:
12223 icode = CODE_FOR_avx512vl_gathersiv8si;
12224 goto gather_gen;
12225 case IX86_BUILTIN_GATHER3DIV4SI:
12226 icode = CODE_FOR_avx512vl_gatherdiv4si;
12227 goto gather_gen;
12228 case IX86_BUILTIN_GATHER3DIV8SI:
12229 icode = CODE_FOR_avx512vl_gatherdiv8si;
12230 goto gather_gen;
12231 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12232 icode = CODE_FOR_avx512vl_gathersiv4df;
12233 goto gather_gen;
12234 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12235 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12236 goto gather_gen;
12237 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12238 icode = CODE_FOR_avx512vl_gathersiv4di;
12239 goto gather_gen;
12240 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12241 icode = CODE_FOR_avx512vl_gatherdiv8si;
12242 goto gather_gen;
12243 case IX86_BUILTIN_SCATTERSIV16SF:
12244 icode = CODE_FOR_avx512f_scattersiv16sf;
12245 goto scatter_gen;
12246 case IX86_BUILTIN_SCATTERSIV8DF:
12247 icode = CODE_FOR_avx512f_scattersiv8df;
12248 goto scatter_gen;
12249 case IX86_BUILTIN_SCATTERDIV16SF:
12250 icode = CODE_FOR_avx512f_scatterdiv16sf;
12251 goto scatter_gen;
12252 case IX86_BUILTIN_SCATTERDIV8DF:
12253 icode = CODE_FOR_avx512f_scatterdiv8df;
12254 goto scatter_gen;
12255 case IX86_BUILTIN_SCATTERSIV16SI:
12256 icode = CODE_FOR_avx512f_scattersiv16si;
12257 goto scatter_gen;
12258 case IX86_BUILTIN_SCATTERSIV8DI:
12259 icode = CODE_FOR_avx512f_scattersiv8di;
12260 goto scatter_gen;
12261 case IX86_BUILTIN_SCATTERDIV16SI:
12262 icode = CODE_FOR_avx512f_scatterdiv16si;
12263 goto scatter_gen;
12264 case IX86_BUILTIN_SCATTERDIV8DI:
12265 icode = CODE_FOR_avx512f_scatterdiv8di;
12266 goto scatter_gen;
12267 case IX86_BUILTIN_SCATTERSIV8SF:
12268 icode = CODE_FOR_avx512vl_scattersiv8sf;
12269 goto scatter_gen;
12270 case IX86_BUILTIN_SCATTERSIV4SF:
12271 icode = CODE_FOR_avx512vl_scattersiv4sf;
12272 goto scatter_gen;
12273 case IX86_BUILTIN_SCATTERSIV4DF:
12274 icode = CODE_FOR_avx512vl_scattersiv4df;
12275 goto scatter_gen;
12276 case IX86_BUILTIN_SCATTERSIV2DF:
12277 icode = CODE_FOR_avx512vl_scattersiv2df;
12278 goto scatter_gen;
12279 case IX86_BUILTIN_SCATTERDIV8SF:
12280 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12281 goto scatter_gen;
12282 case IX86_BUILTIN_SCATTERDIV4SF:
12283 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12284 goto scatter_gen;
12285 case IX86_BUILTIN_SCATTERDIV4DF:
12286 icode = CODE_FOR_avx512vl_scatterdiv4df;
12287 goto scatter_gen;
12288 case IX86_BUILTIN_SCATTERDIV2DF:
12289 icode = CODE_FOR_avx512vl_scatterdiv2df;
12290 goto scatter_gen;
12291 case IX86_BUILTIN_SCATTERSIV8SI:
12292 icode = CODE_FOR_avx512vl_scattersiv8si;
12293 goto scatter_gen;
12294 case IX86_BUILTIN_SCATTERSIV4SI:
12295 icode = CODE_FOR_avx512vl_scattersiv4si;
12296 goto scatter_gen;
12297 case IX86_BUILTIN_SCATTERSIV4DI:
12298 icode = CODE_FOR_avx512vl_scattersiv4di;
12299 goto scatter_gen;
12300 case IX86_BUILTIN_SCATTERSIV2DI:
12301 icode = CODE_FOR_avx512vl_scattersiv2di;
12302 goto scatter_gen;
12303 case IX86_BUILTIN_SCATTERDIV8SI:
12304 icode = CODE_FOR_avx512vl_scatterdiv8si;
12305 goto scatter_gen;
12306 case IX86_BUILTIN_SCATTERDIV4SI:
12307 icode = CODE_FOR_avx512vl_scatterdiv4si;
12308 goto scatter_gen;
12309 case IX86_BUILTIN_SCATTERDIV4DI:
12310 icode = CODE_FOR_avx512vl_scatterdiv4di;
12311 goto scatter_gen;
12312 case IX86_BUILTIN_SCATTERDIV2DI:
12313 icode = CODE_FOR_avx512vl_scatterdiv2di;
12314 goto scatter_gen;
12315 case IX86_BUILTIN_GATHERPFDPD:
12316 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12317 goto vec_prefetch_gen;
12318 case IX86_BUILTIN_SCATTERALTSIV8DF:
12319 icode = CODE_FOR_avx512f_scattersiv8df;
12320 goto scatter_gen;
12321 case IX86_BUILTIN_SCATTERALTDIV16SF:
12322 icode = CODE_FOR_avx512f_scatterdiv16sf;
12323 goto scatter_gen;
12324 case IX86_BUILTIN_SCATTERALTSIV8DI:
12325 icode = CODE_FOR_avx512f_scattersiv8di;
12326 goto scatter_gen;
12327 case IX86_BUILTIN_SCATTERALTDIV16SI:
12328 icode = CODE_FOR_avx512f_scatterdiv16si;
12329 goto scatter_gen;
12330 case IX86_BUILTIN_SCATTERALTSIV4DF:
12331 icode = CODE_FOR_avx512vl_scattersiv4df;
12332 goto scatter_gen;
12333 case IX86_BUILTIN_SCATTERALTDIV8SF:
12334 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12335 goto scatter_gen;
12336 case IX86_BUILTIN_SCATTERALTSIV4DI:
12337 icode = CODE_FOR_avx512vl_scattersiv4di;
12338 goto scatter_gen;
12339 case IX86_BUILTIN_SCATTERALTDIV8SI:
12340 icode = CODE_FOR_avx512vl_scatterdiv8si;
12341 goto scatter_gen;
12342 case IX86_BUILTIN_SCATTERALTSIV2DF:
12343 icode = CODE_FOR_avx512vl_scattersiv2df;
12344 goto scatter_gen;
12345 case IX86_BUILTIN_SCATTERALTDIV4SF:
12346 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12347 goto scatter_gen;
12348 case IX86_BUILTIN_SCATTERALTSIV2DI:
12349 icode = CODE_FOR_avx512vl_scattersiv2di;
12350 goto scatter_gen;
12351 case IX86_BUILTIN_SCATTERALTDIV4SI:
12352 icode = CODE_FOR_avx512vl_scatterdiv4si;
12353 goto scatter_gen;
12354 case IX86_BUILTIN_GATHERPFDPS:
12355 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12356 goto vec_prefetch_gen;
12357 case IX86_BUILTIN_GATHERPFQPD:
12358 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12359 goto vec_prefetch_gen;
12360 case IX86_BUILTIN_GATHERPFQPS:
12361 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12362 goto vec_prefetch_gen;
12363 case IX86_BUILTIN_SCATTERPFDPD:
12364 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12365 goto vec_prefetch_gen;
12366 case IX86_BUILTIN_SCATTERPFDPS:
12367 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12368 goto vec_prefetch_gen;
12369 case IX86_BUILTIN_SCATTERPFQPD:
12370 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12371 goto vec_prefetch_gen;
12372 case IX86_BUILTIN_SCATTERPFQPS:
12373 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12374 goto vec_prefetch_gen;
12375
12376 gather_gen:
12377 rtx half;
12378 rtx (*gen) (rtx, rtx);
12379
12380 arg0 = CALL_EXPR_ARG (exp, 0);
12381 arg1 = CALL_EXPR_ARG (exp, 1);
12382 arg2 = CALL_EXPR_ARG (exp, 2);
12383 arg3 = CALL_EXPR_ARG (exp, 3);
12384 arg4 = CALL_EXPR_ARG (exp, 4);
12385 op0 = expand_normal (arg0);
12386 op1 = expand_normal (arg1);
12387 op2 = expand_normal (arg2);
12388 op3 = expand_normal (arg3);
12389 op4 = expand_normal (arg4);
12390 /* Note the arg order is different from the operand order. */
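/* Concretely: ARG0/OP0 is the pass-through source vector (insn operand 1),
ARG1/OP1 the base address (operand 2), ARG2/OP2 the index vector
(operand 3), ARG3/OP3 the mask (operand 4) and ARG4/OP4 the scale
(operand 5); operand 0 is the gather destination. */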
12391 mode0 = insn_data[icode].operand[1].mode;
12392 mode2 = insn_data[icode].operand[3].mode;
12393 mode3 = insn_data[icode].operand[4].mode;
12394 mode4 = insn_data[icode].operand[5].mode;
12395
12396 if (target == NULL_RTX
12397 || GET_MODE (target) != insn_data[icode].operand[0].mode
12398 || !insn_data[icode].operand[0].predicate (target,
12399 GET_MODE (target)))
12400 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12401 else
12402 subtarget = target;
12403
12404 switch (fcode)
12405 {
12406 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12407 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12408 half = gen_reg_rtx (V8SImode);
12409 if (!nonimmediate_operand (op2, V16SImode))
12410 op2 = copy_to_mode_reg (V16SImode, op2);
12411 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12412 op2 = half;
12413 break;
12414 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12415 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12416 case IX86_BUILTIN_GATHERALTSIV4DF:
12417 case IX86_BUILTIN_GATHERALTSIV4DI:
12418 half = gen_reg_rtx (V4SImode);
12419 if (!nonimmediate_operand (op2, V8SImode))
12420 op2 = copy_to_mode_reg (V8SImode, op2);
12421 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12422 op2 = half;
12423 break;
12424 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12425 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12426 half = gen_reg_rtx (mode0);
12427 if (mode0 == V8SFmode)
12428 gen = gen_vec_extract_lo_v16sf;
12429 else
12430 gen = gen_vec_extract_lo_v16si;
12431 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12432 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12433 emit_insn (gen (half, op0));
12434 op0 = half;
12435 op3 = lowpart_subreg (QImode, op3, HImode);
12436 break;
12437 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12438 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12439 case IX86_BUILTIN_GATHERALTDIV8SF:
12440 case IX86_BUILTIN_GATHERALTDIV8SI:
12441 half = gen_reg_rtx (mode0);
12442 if (mode0 == V4SFmode)
12443 gen = gen_vec_extract_lo_v8sf;
12444 else
12445 gen = gen_vec_extract_lo_v8si;
12446 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12447 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12448 emit_insn (gen (half, op0));
12449 op0 = half;
12450 if (VECTOR_MODE_P (GET_MODE (op3)))
12451 {
12452 half = gen_reg_rtx (mode0);
12453 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12454 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12455 emit_insn (gen (half, op3));
12456 op3 = half;
12457 }
12458 break;
12459 default:
12460 break;
12461 }
12462
12463 /* Force the memory operand to be addressed through a base register
12464 only. We don't want to do this for the memory operands of other
12465 builtin functions. */
12466 op1 = ix86_zero_extend_to_Pmode (op1);
12467
12468 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12469 op0 = copy_to_mode_reg (mode0, op0);
12470 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12471 op1 = copy_to_mode_reg (Pmode, op1);
12472 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12473 op2 = copy_to_mode_reg (mode2, op2);
12474
12475 op3 = fixup_modeless_constant (op3, mode3);
12476
12477 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12478 {
12479 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12480 op3 = copy_to_mode_reg (mode3, op3);
12481 }
12482 else
12483 {
12484 op3 = copy_to_reg (op3);
12485 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12486 }
12487 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12488 {
12489 error ("the last argument must be scale 1, 2, 4, 8");
12490 return const0_rtx;
12491 }
12492
12493 /* Optimize. If the mask is known to select every element (an all-ones
12494 integer mask, or a vector mask whose every element has its sign bit
12495 set), replace op0 with pc_rtx to signal that the instruction
12496 overwrites the whole destination and doesn't use its previous contents. */
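/* For instance, the unmasked AVX2 gather intrinsics are expected to pass an
all-ones mask such as _mm256_set1_epi32 (-1), which reaches this point as a
VECTOR_CST whose every element has its sign bit set, so op0 becomes
pc_rtx. */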
12497 if (optimize)
12498 {
12499 if (TREE_CODE (arg3) == INTEGER_CST)
12500 {
12501 if (integer_all_onesp (arg3))
12502 op0 = pc_rtx;
12503 }
12504 else if (TREE_CODE (arg3) == VECTOR_CST)
12505 {
12506 unsigned int negative = 0;
12507 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12508 {
12509 tree cst = VECTOR_CST_ELT (arg3, i);
12510 if (TREE_CODE (cst) == INTEGER_CST
12511 && tree_int_cst_sign_bit (cst))
12512 negative++;
12513 else if (TREE_CODE (cst) == REAL_CST
12514 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12515 negative++;
12516 }
12517 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12518 op0 = pc_rtx;
12519 }
12520 else if (TREE_CODE (arg3) == SSA_NAME
12521 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12522 {
12523 /* Recognize also when mask is like:
12524 __v2df src = _mm_setzero_pd ();
12525 __v2df mask = _mm_cmpeq_pd (src, src);
12526 or
12527 __v8sf src = _mm256_setzero_ps ();
12528 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12529 as that is a cheaper way to load all ones into
12530 a register than having to load a constant from
12531 memory. */
12532 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12533 if (is_gimple_call (def_stmt))
12534 {
12535 tree fndecl = gimple_call_fndecl (def_stmt);
12536 if (fndecl
12537 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12538 switch (DECL_MD_FUNCTION_CODE (fndecl))
12539 {
12540 case IX86_BUILTIN_CMPPD:
12541 case IX86_BUILTIN_CMPPS:
12542 case IX86_BUILTIN_CMPPD256:
12543 case IX86_BUILTIN_CMPPS256:
12544 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12545 break;
12546 /* FALLTHRU */
12547 case IX86_BUILTIN_CMPEQPD:
12548 case IX86_BUILTIN_CMPEQPS:
12549 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12550 && initializer_zerop (gimple_call_arg (def_stmt,
12551 1)))
12552 op0 = pc_rtx;
12553 break;
12554 default:
12555 break;
12556 }
12557 }
12558 }
12559 }
12560
12561 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12562 if (! pat)
12563 return const0_rtx;
12564 emit_insn (pat);
12565
12566 switch (fcode)
12567 {
12568 case IX86_BUILTIN_GATHER3DIV16SF:
12569 if (target == NULL_RTX)
12570 target = gen_reg_rtx (V8SFmode);
12571 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12572 break;
12573 case IX86_BUILTIN_GATHER3DIV16SI:
12574 if (target == NULL_RTX)
12575 target = gen_reg_rtx (V8SImode);
12576 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12577 break;
12578 case IX86_BUILTIN_GATHER3DIV8SF:
12579 case IX86_BUILTIN_GATHERDIV8SF:
12580 if (target == NULL_RTX)
12581 target = gen_reg_rtx (V4SFmode);
12582 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12583 break;
12584 case IX86_BUILTIN_GATHER3DIV8SI:
12585 case IX86_BUILTIN_GATHERDIV8SI:
12586 if (target == NULL_RTX)
12587 target = gen_reg_rtx (V4SImode);
12588 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12589 break;
12590 default:
12591 target = subtarget;
12592 break;
12593 }
12594 return target;
12595
12596 scatter_gen:
12597 arg0 = CALL_EXPR_ARG (exp, 0);
12598 arg1 = CALL_EXPR_ARG (exp, 1);
12599 arg2 = CALL_EXPR_ARG (exp, 2);
12600 arg3 = CALL_EXPR_ARG (exp, 3);
12601 arg4 = CALL_EXPR_ARG (exp, 4);
12602 op0 = expand_normal (arg0);
12603 op1 = expand_normal (arg1);
12604 op2 = expand_normal (arg2);
12605 op3 = expand_normal (arg3);
12606 op4 = expand_normal (arg4);
12607 mode1 = insn_data[icode].operand[1].mode;
12608 mode2 = insn_data[icode].operand[2].mode;
12609 mode3 = insn_data[icode].operand[3].mode;
12610 mode4 = insn_data[icode].operand[4].mode;
12611
12612 /* The scatter instruction stores operand op3 to memory, using
12613 indices from op2 and the scale from op4, under writemask op1.
12614 If index operand op2 has more elements than source operand op3,
12615 only its low half needs to be used, and vice versa. */
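/* For example, IX86_BUILTIN_SCATTERALTSIV8DF arrives with a V16SI index
vector but only eight DF source elements, so the case below extracts the
low V8SI half of op2 before emitting the scatter. */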
12616 switch (fcode)
12617 {
12618 case IX86_BUILTIN_SCATTERALTSIV8DF:
12619 case IX86_BUILTIN_SCATTERALTSIV8DI:
12620 half = gen_reg_rtx (V8SImode);
12621 if (!nonimmediate_operand (op2, V16SImode))
12622 op2 = copy_to_mode_reg (V16SImode, op2);
12623 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12624 op2 = half;
12625 break;
12626 case IX86_BUILTIN_SCATTERALTDIV16SF:
12627 case IX86_BUILTIN_SCATTERALTDIV16SI:
12628 half = gen_reg_rtx (mode3);
12629 if (mode3 == V8SFmode)
12630 gen = gen_vec_extract_lo_v16sf;
12631 else
12632 gen = gen_vec_extract_lo_v16si;
12633 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12634 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12635 emit_insn (gen (half, op3));
12636 op3 = half;
12637 break;
12638 case IX86_BUILTIN_SCATTERALTSIV4DF:
12639 case IX86_BUILTIN_SCATTERALTSIV4DI:
12640 half = gen_reg_rtx (V4SImode);
12641 if (!nonimmediate_operand (op2, V8SImode))
12642 op2 = copy_to_mode_reg (V8SImode, op2);
12643 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12644 op2 = half;
12645 break;
12646 case IX86_BUILTIN_SCATTERALTDIV8SF:
12647 case IX86_BUILTIN_SCATTERALTDIV8SI:
12648 half = gen_reg_rtx (mode3);
12649 if (mode3 == V4SFmode)
12650 gen = gen_vec_extract_lo_v8sf;
12651 else
12652 gen = gen_vec_extract_lo_v8si;
12653 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12654 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12655 emit_insn (gen (half, op3));
12656 op3 = half;
12657 break;
12658 case IX86_BUILTIN_SCATTERALTSIV2DF:
12659 case IX86_BUILTIN_SCATTERALTSIV2DI:
12660 if (!nonimmediate_operand (op2, V4SImode))
12661 op2 = copy_to_mode_reg (V4SImode, op2);
12662 break;
12663 case IX86_BUILTIN_SCATTERALTDIV4SF:
12664 case IX86_BUILTIN_SCATTERALTDIV4SI:
12665 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12666 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12667 break;
12668 default:
12669 break;
12670 }
12671
12672 /* Force the memory operand to be addressed through a base register
12673 only. We don't want to do this for the memory operands of other
12674 builtin functions. */
12675 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12676
12677 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12678 op0 = copy_to_mode_reg (Pmode, op0);
12679
12680 op1 = fixup_modeless_constant (op1, mode1);
12681
12682 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12683 {
12684 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12685 op1 = copy_to_mode_reg (mode1, op1);
12686 }
12687 else
12688 {
12689 op1 = copy_to_reg (op1);
12690 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12691 }
12692
12693 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12694 op2 = copy_to_mode_reg (mode2, op2);
12695
12696 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12697 op3 = copy_to_mode_reg (mode3, op3);
12698
12699 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12700 {
12701 error ("the last argument must be scale 1, 2, 4, 8");
12702 return const0_rtx;
12703 }
12704
12705 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12706 if (! pat)
12707 return const0_rtx;
12708
12709 emit_insn (pat);
12710 return 0;
12711
12712 vec_prefetch_gen:
12713 arg0 = CALL_EXPR_ARG (exp, 0);
12714 arg1 = CALL_EXPR_ARG (exp, 1);
12715 arg2 = CALL_EXPR_ARG (exp, 2);
12716 arg3 = CALL_EXPR_ARG (exp, 3);
12717 arg4 = CALL_EXPR_ARG (exp, 4);
12718 op0 = expand_normal (arg0);
12719 op1 = expand_normal (arg1);
12720 op2 = expand_normal (arg2);
12721 op3 = expand_normal (arg3);
12722 op4 = expand_normal (arg4);
12723 mode0 = insn_data[icode].operand[0].mode;
12724 mode1 = insn_data[icode].operand[1].mode;
12725 mode3 = insn_data[icode].operand[3].mode;
12726 mode4 = insn_data[icode].operand[4].mode;
12727
12728 op0 = fixup_modeless_constant (op0, mode0);
12729
12730 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12731 {
12732 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12733 op0 = copy_to_mode_reg (mode0, op0);
12734 }
12735 else
12736 {
12737 op0 = copy_to_reg (op0);
12738 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12739 }
12740
12741 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12742 op1 = copy_to_mode_reg (mode1, op1);
12743
12744 /* Force the memory operand to be addressed through a base register
12745 only. We don't want to do this for the memory operands of other
12746 builtin functions. */
12747 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12748
12749 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12750 op2 = copy_to_mode_reg (Pmode, op2);
12751
12752 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12753 {
12754 error ("the forth argument must be scale 1, 2, 4, 8");
12755 return const0_rtx;
12756 }
12757
12758 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12759 {
12760 error ("incorrect hint operand");
12761 return const0_rtx;
12762 }
12763
12764 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12765 if (! pat)
12766 return const0_rtx;
12767
12768 emit_insn (pat);
12769
12770 return 0;
12771
12772 case IX86_BUILTIN_XABORT:
12773 icode = CODE_FOR_xabort;
12774 arg0 = CALL_EXPR_ARG (exp, 0);
12775 op0 = expand_normal (arg0);
12776 mode0 = insn_data[icode].operand[0].mode;
12777 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12778 {
12779 error ("the argument to %<xabort%> intrinsic must "
12780 "be an 8-bit immediate");
12781 return const0_rtx;
12782 }
12783 emit_insn (gen_xabort (op0));
12784 return 0;
12785
12786 case IX86_BUILTIN_RSTORSSP:
12787 case IX86_BUILTIN_CLRSSBSY:
12788 arg0 = CALL_EXPR_ARG (exp, 0);
12789 op0 = expand_normal (arg0);
12790 icode = (fcode == IX86_BUILTIN_RSTORSSP
12791 ? CODE_FOR_rstorssp
12792 : CODE_FOR_clrssbsy);
12793 if (!address_operand (op0, VOIDmode))
12794 {
12795 op1 = convert_memory_address (Pmode, op0);
12796 op0 = copy_addr_to_reg (op1);
12797 }
12798 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12799 return 0;
12800
12801 case IX86_BUILTIN_WRSSD:
12802 case IX86_BUILTIN_WRSSQ:
12803 case IX86_BUILTIN_WRUSSD:
12804 case IX86_BUILTIN_WRUSSQ:
12805 arg0 = CALL_EXPR_ARG (exp, 0);
12806 op0 = expand_normal (arg0);
12807 arg1 = CALL_EXPR_ARG (exp, 1);
12808 op1 = expand_normal (arg1);
12809 switch (fcode)
12810 {
12811 case IX86_BUILTIN_WRSSD:
12812 icode = CODE_FOR_wrsssi;
12813 mode = SImode;
12814 break;
12815 case IX86_BUILTIN_WRSSQ:
12816 icode = CODE_FOR_wrssdi;
12817 mode = DImode;
12818 break;
12819 case IX86_BUILTIN_WRUSSD:
12820 icode = CODE_FOR_wrusssi;
12821 mode = SImode;
12822 break;
12823 case IX86_BUILTIN_WRUSSQ:
12824 icode = CODE_FOR_wrussdi;
12825 mode = DImode;
12826 break;
12827 }
12828 op0 = force_reg (mode, op0);
12829 if (!address_operand (op1, VOIDmode))
12830 {
12831 op2 = convert_memory_address (Pmode, op1);
12832 op1 = copy_addr_to_reg (op2);
12833 }
12834 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12835 return 0;
12836
12837 case IX86_BUILTIN_VZEROUPPER:
12838 cfun->machine->has_explicit_vzeroupper = true;
12839 break;
12840
12841 default:
12842 break;
12843 }
12844
12845 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12846 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12847 {
12848 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12849 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12850 target);
12851 }
12852
12853 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12854 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12855 {
12856 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12857 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12858 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12859 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12860 int masked = 1;
12861 machine_mode mode, wide_mode, nar_mode;
12862
12863 nar_mode = V4SFmode;
12864 mode = V16SFmode;
12865 wide_mode = V64SFmode;
12866 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12867 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12868
12869 switch (fcode)
12870 {
12871 case IX86_BUILTIN_4FMAPS:
12872 fcn = gen_avx5124fmaddps_4fmaddps;
12873 masked = 0;
12874 goto v4fma_expand;
12875
12876 case IX86_BUILTIN_4DPWSSD:
12877 nar_mode = V4SImode;
12878 mode = V16SImode;
12879 wide_mode = V64SImode;
12880 fcn = gen_avx5124vnniw_vp4dpwssd;
12881 masked = 0;
12882 goto v4fma_expand;
12883
12884 case IX86_BUILTIN_4DPWSSDS:
12885 nar_mode = V4SImode;
12886 mode = V16SImode;
12887 wide_mode = V64SImode;
12888 fcn = gen_avx5124vnniw_vp4dpwssds;
12889 masked = 0;
12890 goto v4fma_expand;
12891
12892 case IX86_BUILTIN_4FNMAPS:
12893 fcn = gen_avx5124fmaddps_4fnmaddps;
12894 masked = 0;
12895 goto v4fma_expand;
12896
12897 case IX86_BUILTIN_4FNMAPS_MASK:
12898 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12899 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12900 goto v4fma_expand;
12901
12902 case IX86_BUILTIN_4DPWSSD_MASK:
12903 nar_mode = V4SImode;
12904 mode = V16SImode;
12905 wide_mode = V64SImode;
12906 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12907 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12908 goto v4fma_expand;
12909
12910 case IX86_BUILTIN_4DPWSSDS_MASK:
12911 nar_mode = V4SImode;
12912 mode = V16SImode;
12913 wide_mode = V64SImode;
12914 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12915 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12916 goto v4fma_expand;
12917
12918 case IX86_BUILTIN_4FMAPS_MASK:
12919 {
12920 tree args[4];
12921 rtx ops[4];
12922 rtx wide_reg;
12923 rtx accum;
12924 rtx addr;
12925 rtx mem;
12926
12927 v4fma_expand:
12928 wide_reg = gen_reg_rtx (wide_mode);
12929 for (i = 0; i < 4; i++)
12930 {
12931 args[i] = CALL_EXPR_ARG (exp, i);
12932 ops[i] = expand_normal (args[i]);
12933
12934 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12935 ops[i]);
12936 }
12937
12938 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12939 accum = force_reg (mode, accum);
12940
12941 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12942 addr = force_reg (Pmode, addr);
12943
12944 mem = gen_rtx_MEM (nar_mode, addr);
12945
12946 target = gen_reg_rtx (mode);
12947
12948 emit_move_insn (target, accum);
12949
12950 if (! masked)
12951 emit_insn (fcn (target, accum, wide_reg, mem));
12952 else
12953 {
12954 rtx merge, mask;
12955 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12956
12957 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12958
12959 if (CONST_INT_P (mask))
12960 mask = fixup_modeless_constant (mask, HImode);
12961
12962 mask = force_reg (HImode, mask);
12963
12964 if (GET_MODE (mask) != HImode)
12965 mask = gen_rtx_SUBREG (HImode, mask, 0);
12966
12967 /* If merge is 0 then we're about to emit the z-masked variant. */
12968 if (const0_operand (merge, mode))
12969 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12970 /* If merge is the same as accum then emit the merge-masked variant. */
12971 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12972 {
12973 merge = force_reg (mode, merge);
12974 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12975 }
12976 /* Merging with something unknown can happen if we z-mask with -O0. */
12977 else
12978 {
12979 target = gen_reg_rtx (mode);
12980 emit_move_insn (target, merge);
12981 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12982 }
12983 }
12984 return target;
12985 }
12986
12987 case IX86_BUILTIN_4FNMASS:
12988 fcn = gen_avx5124fmaddps_4fnmaddss;
12989 masked = 0;
12990 goto s4fma_expand;
12991
12992 case IX86_BUILTIN_4FMASS:
12993 fcn = gen_avx5124fmaddps_4fmaddss;
12994 masked = 0;
12995 goto s4fma_expand;
12996
12997 case IX86_BUILTIN_4FNMASS_MASK:
12998 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
12999 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13000 goto s4fma_expand;
13001
13002 case IX86_BUILTIN_4FMASS_MASK:
13003 {
13004 tree args[4];
13005 rtx ops[4];
13006 rtx wide_reg;
13007 rtx accum;
13008 rtx addr;
13009 rtx mem;
13010
13011 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13012 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13013
13014 s4fma_expand:
13015 mode = V4SFmode;
13016 wide_reg = gen_reg_rtx (V64SFmode);
13017 for (i = 0; i < 4; i++)
13018 {
13019 rtx tmp;
13020 args[i] = CALL_EXPR_ARG (exp, i);
13021 ops[i] = expand_normal (args[i]);
13022
13023 tmp = gen_reg_rtx (SFmode);
13024 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13025
13026 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13027 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13028 }
13029
13030 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13031 accum = force_reg (V4SFmode, accum);
13032
13033 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13034 addr = force_reg (Pmode, addr);
13035
13036 mem = gen_rtx_MEM (V4SFmode, addr);
13037
13038 target = gen_reg_rtx (V4SFmode);
13039
13040 emit_move_insn (target, accum);
13041
13042 if (! masked)
13043 emit_insn (fcn (target, accum, wide_reg, mem));
13044 else
13045 {
13046 rtx merge, mask;
13047 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13048
13049 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13050
13051 if (CONST_INT_P (mask))
13052 mask = fixup_modeless_constant (mask, QImode);
13053
13054 mask = force_reg (QImode, mask);
13055
13056 if (GET_MODE (mask) != QImode)
13057 mask = gen_rtx_SUBREG (QImode, mask, 0);
13058
13059 /* If merge is 0 then we're about to emit the z-masked variant. */
13060 if (const0_operand (merge, mode))
13061 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13062 /* If merge is the same as accum then emit the merge-masked
13063 variant. */
13064 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13065 {
13066 merge = force_reg (mode, merge);
13067 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13068 }
13069 /* Merging with something unknown can happen if we z-mask
13070 with -O0. */
13071 else
13072 {
13073 target = gen_reg_rtx (mode);
13074 emit_move_insn (target, merge);
13075 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13076 }
13077 }
13078 return target;
13079 }
13080 case IX86_BUILTIN_RDPID:
13081 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13082 target);
13083 case IX86_BUILTIN_FABSQ:
13084 case IX86_BUILTIN_COPYSIGNQ:
13085 if (!TARGET_SSE)
13086 /* Emit a normal call if SSE isn't available. */
13087 return expand_call (exp, target, ignore);
13088 /* FALLTHRU */
13089 default:
13090 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13091 }
13092 }
13093
13094 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13095 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13096 {
13097 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13098 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13099 }
13100
13101 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13102 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13103 {
13104 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13105 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13106 }
13107
13108 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13109 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13110 {
13111 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13112 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13113 }
13114
13115 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13116 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13117 {
13118 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13119 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13120 }
13121
13122 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13123 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13124 {
13125 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13126 const struct builtin_description *d = bdesc_multi_arg + i;
13127 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13128 (enum ix86_builtin_func_type)
13129 d->flag, d->comparison);
13130 }
13131
13132 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13133 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13134 {
13135 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13136 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13137 target);
13138 }
13139
13140 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13141 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13142 {
13143 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13144 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13145 target);
13146 }
13147
13148 gcc_unreachable ();
13149 }
13150
13151 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13152 fill target with val via vec_duplicate. */
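/* For illustration, with a V4SFmode TARGET and an SFmode VAL the first
   attempt emits roughly
     (set (reg:V4SF target)
          (vec_duplicate:V4SF (reg:SF val)))
   and, if no broadcast pattern recognizes VAL as-is, VAL is forced into
   an SFmode register by a sequence emitted just before the insn and the
   SET_SRC is rewritten to duplicate that register instead.  */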
13153
13154 static bool
13155 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13156 {
13157 bool ok;
13158 rtx_insn *insn;
13159 rtx dup;
13160
13161 /* First attempt to recognize VAL as-is. */
13162 dup = gen_vec_duplicate (mode, val);
13163 insn = emit_insn (gen_rtx_SET (target, dup));
13164 if (recog_memoized (insn) < 0)
13165 {
13166 rtx_insn *seq;
13167 machine_mode innermode = GET_MODE_INNER (mode);
13168 rtx reg;
13169
13170 /* If that fails, force VAL into a register. */
13171
13172 start_sequence ();
13173 reg = force_reg (innermode, val);
13174 if (GET_MODE (reg) != innermode)
13175 reg = gen_lowpart (innermode, reg);
13176 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13177 seq = get_insns ();
13178 end_sequence ();
13179 if (seq)
13180 emit_insn_before (seq, insn);
13181
13182 ok = recog_memoized (insn) >= 0;
13183 gcc_assert (ok);
13184 }
13185 return true;
13186 }
13187
13188 /* Get a vector mode of the same size as the original but with elements
13189 twice as wide. This is only guaranteed to apply to integral vectors. */
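/* E.g. V16QImode yields V8HImode and V8HImode yields V4SImode: the same
   total size with half as many elements, each twice as wide.  */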
13190
13191 static machine_mode
13192 get_mode_wider_vector (machine_mode o)
13193 {
13194 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13195 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13196 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13197 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13198 return n;
13199 }
13200
13201 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13202 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13203
13204 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13205 with all elements equal to VAR. Return true if successful. */
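/* E.g. without SSE a V8QImode broadcast of the byte B goes through the
   "widen" fallback: it forms the HImode value (B << 8) | B and recurses
   on V4HImode, each widening step doubling the number of replicated
   copies.  */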
13206
13207 static bool
13208 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13209 rtx target, rtx val)
13210 {
13211 bool ok;
13212
13213 switch (mode)
13214 {
13215 case E_V2SImode:
13216 case E_V2SFmode:
13217 if (!mmx_ok)
13218 return false;
13219 /* FALLTHRU */
13220
13221 case E_V4DFmode:
13222 case E_V4DImode:
13223 case E_V8SFmode:
13224 case E_V8SImode:
13225 case E_V2DFmode:
13226 case E_V2DImode:
13227 case E_V4SFmode:
13228 case E_V4SImode:
13229 case E_V16SImode:
13230 case E_V8DImode:
13231 case E_V16SFmode:
13232 case E_V8DFmode:
13233 return ix86_vector_duplicate_value (mode, target, val);
13234
13235 case E_V4HImode:
13236 if (!mmx_ok)
13237 return false;
13238 if (TARGET_SSE || TARGET_3DNOW_A)
13239 {
13240 rtx x;
13241
13242 val = gen_lowpart (SImode, val);
13243 x = gen_rtx_TRUNCATE (HImode, val);
13244 x = gen_rtx_VEC_DUPLICATE (mode, x);
13245 emit_insn (gen_rtx_SET (target, x));
13246 return true;
13247 }
13248 goto widen;
13249
13250 case E_V8QImode:
13251 if (!mmx_ok)
13252 return false;
13253 goto widen;
13254
13255 case E_V8HImode:
13256 if (TARGET_AVX2)
13257 return ix86_vector_duplicate_value (mode, target, val);
13258
13259 if (TARGET_SSE2)
13260 {
13261 struct expand_vec_perm_d dperm;
13262 rtx tmp1, tmp2;
13263
13264 permute:
13265 memset (&dperm, 0, sizeof (dperm));
13266 dperm.target = target;
13267 dperm.vmode = mode;
13268 dperm.nelt = GET_MODE_NUNITS (mode);
13269 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13270 dperm.one_operand_p = true;
13271
13272 /* Extend to SImode using a paradoxical SUBREG. */
13273 tmp1 = gen_reg_rtx (SImode);
13274 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13275
13276 /* Insert the SImode value as low element of a V4SImode vector. */
13277 tmp2 = gen_reg_rtx (V4SImode);
13278 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13279 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13280
13281 ok = (expand_vec_perm_1 (&dperm)
13282 || expand_vec_perm_broadcast_1 (&dperm));
13283 gcc_assert (ok);
13284 return ok;
13285 }
13286 goto widen;
13287
13288 case E_V16QImode:
13289 if (TARGET_AVX2)
13290 return ix86_vector_duplicate_value (mode, target, val);
13291
13292 if (TARGET_SSE2)
13293 goto permute;
13294 goto widen;
13295
13296 widen:
13297 /* Replicate the value once into the next wider mode and recurse. */
13298 {
13299 machine_mode smode, wsmode, wvmode;
13300 rtx x;
13301
13302 smode = GET_MODE_INNER (mode);
13303 wvmode = get_mode_wider_vector (mode);
13304 wsmode = GET_MODE_INNER (wvmode);
13305
13306 val = convert_modes (wsmode, smode, val, true);
13307 x = expand_simple_binop (wsmode, ASHIFT, val,
13308 GEN_INT (GET_MODE_BITSIZE (smode)),
13309 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13310 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13311
13312 x = gen_reg_rtx (wvmode);
13313 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13314 gcc_assert (ok);
13315 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13316 return ok;
13317 }
13318
13319 case E_V16HImode:
13320 case E_V32QImode:
13321 if (TARGET_AVX2)
13322 return ix86_vector_duplicate_value (mode, target, val);
13323 else
13324 {
13325 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13326 rtx x = gen_reg_rtx (hvmode);
13327
13328 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13329 gcc_assert (ok);
13330
13331 x = gen_rtx_VEC_CONCAT (mode, x, x);
13332 emit_insn (gen_rtx_SET (target, x));
13333 }
13334 return true;
13335
13336 case E_V64QImode:
13337 case E_V32HImode:
13338 if (TARGET_AVX512BW)
13339 return ix86_vector_duplicate_value (mode, target, val);
13340 else
13341 {
13342 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13343 rtx x = gen_reg_rtx (hvmode);
13344
13345 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13346 gcc_assert (ok);
13347
13348 x = gen_rtx_VEC_CONCAT (mode, x, x);
13349 emit_insn (gen_rtx_SET (target, x));
13350 }
13351 return true;
13352
13353 default:
13354 return false;
13355 }
13356 }
13357
13358 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13359 whose ONE_VAR element is VAR, and other elements are zero. Return true
13360 if successful. */
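/* For instance, with only SSE2 a V4SImode vector whose sole nonzero
   element is VAR is built by merging VAR into element 0 of a zero vector,
     (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI var))
                     (const_vector:V4SI [0 0 0 0])
                     (const_int 1))
   and, when ONE_VAR is not 0, shuffling that element into place with
   pshufd while the remaining lanes stay zero.  */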
13361
13362 static bool
13363 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13364 rtx target, rtx var, int one_var)
13365 {
13366 machine_mode vsimode;
13367 rtx new_target;
13368 rtx x, tmp;
13369 bool use_vector_set = false;
13370 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13371
13372 switch (mode)
13373 {
13374 case E_V2DImode:
13375 /* For SSE4.1, we normally use vector set. But if the second
13376 element is zero and inter-unit moves are OK, we use movq
13377 instead. */
13378 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13379 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13380 && one_var == 0));
13381 break;
13382 case E_V16QImode:
13383 case E_V4SImode:
13384 case E_V4SFmode:
13385 use_vector_set = TARGET_SSE4_1;
13386 break;
13387 case E_V8HImode:
13388 use_vector_set = TARGET_SSE2;
13389 break;
13390 case E_V8QImode:
13391 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13392 break;
13393 case E_V4HImode:
13394 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13395 break;
13396 case E_V32QImode:
13397 case E_V16HImode:
13398 use_vector_set = TARGET_AVX;
13399 break;
13400 case E_V8SImode:
13401 use_vector_set = TARGET_AVX;
13402 gen_vec_set_0 = gen_vec_setv8si_0;
13403 break;
13404 case E_V8SFmode:
13405 use_vector_set = TARGET_AVX;
13406 gen_vec_set_0 = gen_vec_setv8sf_0;
13407 break;
13408 case E_V4DFmode:
13409 use_vector_set = TARGET_AVX;
13410 gen_vec_set_0 = gen_vec_setv4df_0;
13411 break;
13412 case E_V4DImode:
13413 /* Use ix86_expand_vector_set in 64bit mode only. */
13414 use_vector_set = TARGET_AVX && TARGET_64BIT;
13415 gen_vec_set_0 = gen_vec_setv4di_0;
13416 break;
13417 case E_V16SImode:
13418 use_vector_set = TARGET_AVX512F && one_var == 0;
13419 gen_vec_set_0 = gen_vec_setv16si_0;
13420 break;
13421 case E_V16SFmode:
13422 use_vector_set = TARGET_AVX512F && one_var == 0;
13423 gen_vec_set_0 = gen_vec_setv16sf_0;
13424 break;
13425 case E_V8DFmode:
13426 use_vector_set = TARGET_AVX512F && one_var == 0;
13427 gen_vec_set_0 = gen_vec_setv8df_0;
13428 break;
13429 case E_V8DImode:
13430 /* Use ix86_expand_vector_set in 64bit mode only. */
13431 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13432 gen_vec_set_0 = gen_vec_setv8di_0;
13433 break;
13434 default:
13435 break;
13436 }
13437
13438 if (use_vector_set)
13439 {
13440 if (gen_vec_set_0 && one_var == 0)
13441 {
13442 var = force_reg (GET_MODE_INNER (mode), var);
13443 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13444 return true;
13445 }
13446 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13447 var = force_reg (GET_MODE_INNER (mode), var);
13448 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13449 return true;
13450 }
13451
13452 switch (mode)
13453 {
13454 case E_V2SFmode:
13455 case E_V2SImode:
13456 if (!mmx_ok)
13457 return false;
13458 /* FALLTHRU */
13459
13460 case E_V2DFmode:
13461 case E_V2DImode:
13462 if (one_var != 0)
13463 return false;
13464 var = force_reg (GET_MODE_INNER (mode), var);
13465 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13466 emit_insn (gen_rtx_SET (target, x));
13467 return true;
13468
13469 case E_V4SFmode:
13470 case E_V4SImode:
13471 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13472 new_target = gen_reg_rtx (mode);
13473 else
13474 new_target = target;
13475 var = force_reg (GET_MODE_INNER (mode), var);
13476 x = gen_rtx_VEC_DUPLICATE (mode, var);
13477 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13478 emit_insn (gen_rtx_SET (new_target, x));
13479 if (one_var != 0)
13480 {
13481 /* We need to shuffle the value to the correct position, so
13482 create a new pseudo to store the intermediate result. */
13483
13484 /* With SSE2, we can use the integer shuffle insns. */
13485 if (mode != V4SFmode && TARGET_SSE2)
13486 {
13487 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13488 const1_rtx,
13489 GEN_INT (one_var == 1 ? 0 : 1),
13490 GEN_INT (one_var == 2 ? 0 : 1),
13491 GEN_INT (one_var == 3 ? 0 : 1)));
13492 if (target != new_target)
13493 emit_move_insn (target, new_target);
13494 return true;
13495 }
13496
13497 /* Otherwise convert the intermediate result to V4SFmode and
13498 use the SSE1 shuffle instructions. */
13499 if (mode != V4SFmode)
13500 {
13501 tmp = gen_reg_rtx (V4SFmode);
13502 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13503 }
13504 else
13505 tmp = new_target;
13506
13507 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13508 const1_rtx,
13509 GEN_INT (one_var == 1 ? 0 : 1),
13510 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13511 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13512
13513 if (mode != V4SFmode)
13514 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13515 else if (tmp != target)
13516 emit_move_insn (target, tmp);
13517 }
13518 else if (target != new_target)
13519 emit_move_insn (target, new_target);
13520 return true;
13521
13522 case E_V8HImode:
13523 case E_V16QImode:
13524 vsimode = V4SImode;
13525 goto widen;
13526 case E_V4HImode:
13527 case E_V8QImode:
13528 if (!mmx_ok)
13529 return false;
13530 vsimode = V2SImode;
13531 goto widen;
13532 widen:
13533 if (one_var != 0)
13534 return false;
13535
13536 /* Zero extend the variable element to SImode and recurse. */
13537 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13538
13539 x = gen_reg_rtx (vsimode);
13540 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13541 var, one_var))
13542 gcc_unreachable ();
13543
13544 emit_move_insn (target, gen_lowpart (mode, x));
13545 return true;
13546
13547 default:
13548 return false;
13549 }
13550 }
13551
13552 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13553 consisting of the values in VALS. It is known that all elements
13554 except ONE_VAR are constants. Return true if successful. */
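/* As a worked example of the QImode widening used below when SSE4.1 is
   not available: to set byte 6 of a V16QImode vector to the variable VAR,
   VAR is zero-extended to HImode and combined with its constant neighbour
   C7 as (C7 << 8) | VAR, and that HImode value is then inserted as
   element 3 of the V8HImode view of the constant vector.  */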
13555
13556 static bool
13557 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13558 rtx target, rtx vals, int one_var)
13559 {
13560 rtx var = XVECEXP (vals, 0, one_var);
13561 machine_mode wmode;
13562 rtx const_vec, x;
13563
13564 const_vec = copy_rtx (vals);
13565 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13566 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13567
13568 switch (mode)
13569 {
13570 case E_V2DFmode:
13571 case E_V2DImode:
13572 case E_V2SFmode:
13573 case E_V2SImode:
13574 /* For the two element vectors, it's just as easy to use
13575 the general case. */
13576 return false;
13577
13578 case E_V4DImode:
13579 /* Use ix86_expand_vector_set in 64bit mode only. */
13580 if (!TARGET_64BIT)
13581 return false;
13582 /* FALLTHRU */
13583 case E_V4DFmode:
13584 case E_V8SFmode:
13585 case E_V8SImode:
13586 case E_V16HImode:
13587 case E_V32QImode:
13588 case E_V4SFmode:
13589 case E_V4SImode:
13590 case E_V8HImode:
13591 case E_V4HImode:
13592 break;
13593
13594 case E_V16QImode:
13595 if (TARGET_SSE4_1)
13596 break;
13597 wmode = V8HImode;
13598 goto widen;
13599 case E_V8QImode:
13600 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13601 break;
13602 wmode = V4HImode;
13603 goto widen;
13604 widen:
13605 /* There's no way to set one QImode entry easily. Combine
13606 the variable value with its adjacent constant value, and
13607 promote to an HImode set. */
13608 x = XVECEXP (vals, 0, one_var ^ 1);
13609 if (one_var & 1)
13610 {
13611 var = convert_modes (HImode, QImode, var, true);
13612 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13613 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13614 x = GEN_INT (INTVAL (x) & 0xff);
13615 }
13616 else
13617 {
13618 var = convert_modes (HImode, QImode, var, true);
13619 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13620 }
13621 if (x != const0_rtx)
13622 var = expand_simple_binop (HImode, IOR, var, x, var,
13623 1, OPTAB_LIB_WIDEN);
13624
13625 x = gen_reg_rtx (wmode);
13626 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13627 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13628
13629 emit_move_insn (target, gen_lowpart (mode, x));
13630 return true;
13631
13632 default:
13633 return false;
13634 }
13635
13636 emit_move_insn (target, const_vec);
13637 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13638 return true;
13639 }
13640
13641 /* A subroutine of ix86_expand_vector_init_general. Use vector
13642 concatenate to handle the most general case: all values variable,
13643 and none identical. */
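/* For example, a V8SImode build from eight scalar operands first
   constructs two V4SImode halves recursively (upper half first, see the
   FIXME below) and then emits
     (set (reg:V8SI target)
          (vec_concat:V8SI (reg:V4SI lo) (reg:V4SI hi)))
   through the two-operand case.  */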
13644
13645 static void
13646 ix86_expand_vector_init_concat (machine_mode mode,
13647 rtx target, rtx *ops, int n)
13648 {
13649 machine_mode half_mode = VOIDmode;
13650 rtx half[2];
13651 rtvec v;
13652 int i, j;
13653
13654 switch (n)
13655 {
13656 case 2:
13657 switch (mode)
13658 {
13659 case E_V16SImode:
13660 half_mode = V8SImode;
13661 break;
13662 case E_V16SFmode:
13663 half_mode = V8SFmode;
13664 break;
13665 case E_V8DImode:
13666 half_mode = V4DImode;
13667 break;
13668 case E_V8DFmode:
13669 half_mode = V4DFmode;
13670 break;
13671 case E_V8SImode:
13672 half_mode = V4SImode;
13673 break;
13674 case E_V8SFmode:
13675 half_mode = V4SFmode;
13676 break;
13677 case E_V4DImode:
13678 half_mode = V2DImode;
13679 break;
13680 case E_V4DFmode:
13681 half_mode = V2DFmode;
13682 break;
13683 case E_V4SImode:
13684 half_mode = V2SImode;
13685 break;
13686 case E_V4SFmode:
13687 half_mode = V2SFmode;
13688 break;
13689 case E_V2DImode:
13690 half_mode = DImode;
13691 break;
13692 case E_V2SImode:
13693 half_mode = SImode;
13694 break;
13695 case E_V2DFmode:
13696 half_mode = DFmode;
13697 break;
13698 case E_V2SFmode:
13699 half_mode = SFmode;
13700 break;
13701 default:
13702 gcc_unreachable ();
13703 }
13704
13705 if (!register_operand (ops[1], half_mode))
13706 ops[1] = force_reg (half_mode, ops[1]);
13707 if (!register_operand (ops[0], half_mode))
13708 ops[0] = force_reg (half_mode, ops[0]);
13709 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13710 ops[1])));
13711 break;
13712
13713 case 4:
13714 switch (mode)
13715 {
13716 case E_V4DImode:
13717 half_mode = V2DImode;
13718 break;
13719 case E_V4DFmode:
13720 half_mode = V2DFmode;
13721 break;
13722 case E_V4SImode:
13723 half_mode = V2SImode;
13724 break;
13725 case E_V4SFmode:
13726 half_mode = V2SFmode;
13727 break;
13728 default:
13729 gcc_unreachable ();
13730 }
13731 goto half;
13732
13733 case 8:
13734 switch (mode)
13735 {
13736 case E_V8DImode:
13737 half_mode = V4DImode;
13738 break;
13739 case E_V8DFmode:
13740 half_mode = V4DFmode;
13741 break;
13742 case E_V8SImode:
13743 half_mode = V4SImode;
13744 break;
13745 case E_V8SFmode:
13746 half_mode = V4SFmode;
13747 break;
13748 default:
13749 gcc_unreachable ();
13750 }
13751 goto half;
13752
13753 case 16:
13754 switch (mode)
13755 {
13756 case E_V16SImode:
13757 half_mode = V8SImode;
13758 break;
13759 case E_V16SFmode:
13760 half_mode = V8SFmode;
13761 break;
13762 default:
13763 gcc_unreachable ();
13764 }
13765 goto half;
13766
13767 half:
13768 /* FIXME: We process inputs backward to help RA. PR 36222. */
13769 i = n - 1;
13770 for (j = 1; j != -1; j--)
13771 {
13772 half[j] = gen_reg_rtx (half_mode);
13773 switch (n >> 1)
13774 {
13775 case 2:
13776 v = gen_rtvec (2, ops[i-1], ops[i]);
13777 i -= 2;
13778 break;
13779 case 4:
13780 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
13781 i -= 4;
13782 break;
13783 case 8:
13784 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
13785 ops[i-3], ops[i-2], ops[i-1], ops[i]);
13786 i -= 8;
13787 break;
13788 default:
13789 gcc_unreachable ();
13790 }
13791 ix86_expand_vector_init (false, half[j],
13792 gen_rtx_PARALLEL (half_mode, v));
13793 }
13794
13795 ix86_expand_vector_init_concat (mode, target, half, 2);
13796 break;
13797
13798 default:
13799 gcc_unreachable ();
13800 }
13801 }
13802
13803 /* A subroutine of ix86_expand_vector_init_general. Use vector
13804 interleave to handle the most general case: all values variable,
13805 and none identical. */
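/* Sketch of the V8HImode case: each pair of HImode elements (2*i,
   2*i + 1) is packed into the two low halfwords of an SSE register, two
   such registers are combined with a low V4SImode interleave, and a final
   low V2DImode interleave of the two results yields the full
   eight-element vector.  */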
13806
13807 static void
13808 ix86_expand_vector_init_interleave (machine_mode mode,
13809 rtx target, rtx *ops, int n)
13810 {
13811 machine_mode first_imode, second_imode, third_imode, inner_mode;
13812 int i, j;
13813 rtx op0, op1;
13814 rtx (*gen_load_even) (rtx, rtx, rtx);
13815 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13816 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13817
13818 switch (mode)
13819 {
13820 case E_V8HImode:
13821 gen_load_even = gen_vec_setv8hi;
13822 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13823 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13824 inner_mode = HImode;
13825 first_imode = V4SImode;
13826 second_imode = V2DImode;
13827 third_imode = VOIDmode;
13828 break;
13829 case E_V16QImode:
13830 gen_load_even = gen_vec_setv16qi;
13831 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13832 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13833 inner_mode = QImode;
13834 first_imode = V8HImode;
13835 second_imode = V4SImode;
13836 third_imode = V2DImode;
13837 break;
13838 default:
13839 gcc_unreachable ();
13840 }
13841
13842 for (i = 0; i < n; i++)
13843 {
13844 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13845 op0 = gen_reg_rtx (SImode);
13846 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13847
13848 /* Insert the SImode value as the low element of a V4SImode vector. */
13849 op1 = gen_reg_rtx (V4SImode);
13850 op0 = gen_rtx_VEC_MERGE (V4SImode,
13851 gen_rtx_VEC_DUPLICATE (V4SImode,
13852 op0),
13853 CONST0_RTX (V4SImode),
13854 const1_rtx);
13855 emit_insn (gen_rtx_SET (op1, op0));
13856
13857 /* Cast the V4SImode vector back to a vector in the original mode. */
13858 op0 = gen_reg_rtx (mode);
13859 emit_move_insn (op0, gen_lowpart (mode, op1));
13860
13861 /* Load even elements into the second position. */
13862 emit_insn (gen_load_even (op0,
13863 force_reg (inner_mode,
13864 ops [i + i + 1]),
13865 const1_rtx));
13866
13867 /* Cast vector to FIRST_IMODE vector. */
13868 ops[i] = gen_reg_rtx (first_imode);
13869 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13870 }
13871
13872 /* Interleave low FIRST_IMODE vectors. */
13873 for (i = j = 0; i < n; i += 2, j++)
13874 {
13875 op0 = gen_reg_rtx (first_imode);
13876 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13877
13878 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13879 ops[j] = gen_reg_rtx (second_imode);
13880 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13881 }
13882
13883 /* Interleave low SECOND_IMODE vectors. */
13884 switch (second_imode)
13885 {
13886 case E_V4SImode:
13887 for (i = j = 0; i < n / 2; i += 2, j++)
13888 {
13889 op0 = gen_reg_rtx (second_imode);
13890 emit_insn (gen_interleave_second_low (op0, ops[i],
13891 ops[i + 1]));
13892
13893 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13894 vector. */
13895 ops[j] = gen_reg_rtx (third_imode);
13896 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13897 }
13898 second_imode = V2DImode;
13899 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13900 /* FALLTHRU */
13901
13902 case E_V2DImode:
13903 op0 = gen_reg_rtx (second_imode);
13904 emit_insn (gen_interleave_second_low (op0, ops[0],
13905 ops[1]));
13906
13907 /* Cast the SECOND_IMODE vector back to a vector in the original
13908 mode. */
13909 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13910 break;
13911
13912 default:
13913 gcc_unreachable ();
13914 }
13915 }
13916
13917 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13918 all values variable, and none identical. */
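/* The fallback at the end of this function packs elements into word_mode
   registers; e.g. a V4HImode build {e0, e1, e2, e3} with a 32-bit
   word_mode forms the words (e1 << 16) | e0 and (e3 << 16) | e2 and
   writes them into the low and high halves of the destination.  */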
13919
13920 static void
13921 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13922 rtx target, rtx vals)
13923 {
13924 rtx ops[64], op0, op1, op2, op3, op4, op5;
13925 machine_mode half_mode = VOIDmode;
13926 machine_mode quarter_mode = VOIDmode;
13927 int n, i;
13928
13929 switch (mode)
13930 {
13931 case E_V2SFmode:
13932 case E_V2SImode:
13933 if (!mmx_ok && !TARGET_SSE)
13934 break;
13935 /* FALLTHRU */
13936
13937 case E_V16SImode:
13938 case E_V16SFmode:
13939 case E_V8DFmode:
13940 case E_V8DImode:
13941 case E_V8SFmode:
13942 case E_V8SImode:
13943 case E_V4DFmode:
13944 case E_V4DImode:
13945 case E_V4SFmode:
13946 case E_V4SImode:
13947 case E_V2DFmode:
13948 case E_V2DImode:
13949 n = GET_MODE_NUNITS (mode);
13950 for (i = 0; i < n; i++)
13951 ops[i] = XVECEXP (vals, 0, i);
13952 ix86_expand_vector_init_concat (mode, target, ops, n);
13953 return;
13954
13955 case E_V2TImode:
13956 for (i = 0; i < 2; i++)
13957 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13958 op0 = gen_reg_rtx (V4DImode);
13959 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13960 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13961 return;
13962
13963 case E_V4TImode:
13964 for (i = 0; i < 4; i++)
13965 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13966 ops[4] = gen_reg_rtx (V4DImode);
13967 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13968 ops[5] = gen_reg_rtx (V4DImode);
13969 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13970 op0 = gen_reg_rtx (V8DImode);
13971 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13972 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13973 return;
13974
13975 case E_V32QImode:
13976 half_mode = V16QImode;
13977 goto half;
13978
13979 case E_V16HImode:
13980 half_mode = V8HImode;
13981 goto half;
13982
13983 half:
13984 n = GET_MODE_NUNITS (mode);
13985 for (i = 0; i < n; i++)
13986 ops[i] = XVECEXP (vals, 0, i);
13987 op0 = gen_reg_rtx (half_mode);
13988 op1 = gen_reg_rtx (half_mode);
13989 ix86_expand_vector_init_interleave (half_mode, op0, ops,
13990 n >> 2);
13991 ix86_expand_vector_init_interleave (half_mode, op1,
13992 &ops [n >> 1], n >> 2);
13993 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
13994 return;
13995
13996 case E_V64QImode:
13997 quarter_mode = V16QImode;
13998 half_mode = V32QImode;
13999 goto quarter;
14000
14001 case E_V32HImode:
14002 quarter_mode = V8HImode;
14003 half_mode = V16HImode;
14004 goto quarter;
14005
14006 quarter:
14007 n = GET_MODE_NUNITS (mode);
14008 for (i = 0; i < n; i++)
14009 ops[i] = XVECEXP (vals, 0, i);
14010 op0 = gen_reg_rtx (quarter_mode);
14011 op1 = gen_reg_rtx (quarter_mode);
14012 op2 = gen_reg_rtx (quarter_mode);
14013 op3 = gen_reg_rtx (quarter_mode);
14014 op4 = gen_reg_rtx (half_mode);
14015 op5 = gen_reg_rtx (half_mode);
14016 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14017 n >> 3);
14018 ix86_expand_vector_init_interleave (quarter_mode, op1,
14019 &ops [n >> 2], n >> 3);
14020 ix86_expand_vector_init_interleave (quarter_mode, op2,
14021 &ops [n >> 1], n >> 3);
14022 ix86_expand_vector_init_interleave (quarter_mode, op3,
14023 &ops [(n >> 1) | (n >> 2)], n >> 3);
14024 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14025 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14026 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14027 return;
14028
14029 case E_V16QImode:
14030 if (!TARGET_SSE4_1)
14031 break;
14032 /* FALLTHRU */
14033
14034 case E_V8HImode:
14035 if (!TARGET_SSE2)
14036 break;
14037
14038 /* Don't use ix86_expand_vector_init_interleave if we can't
14039 move from GPR to SSE register directly. */
14040 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14041 break;
14042
14043 n = GET_MODE_NUNITS (mode);
14044 for (i = 0; i < n; i++)
14045 ops[i] = XVECEXP (vals, 0, i);
14046 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14047 return;
14048
14049 case E_V4HImode:
14050 case E_V8QImode:
14051 break;
14052
14053 default:
14054 gcc_unreachable ();
14055 }
14056
14057 {
14058 int i, j, n_elts, n_words, n_elt_per_word;
14059 machine_mode inner_mode;
14060 rtx words[4], shift;
14061
14062 inner_mode = GET_MODE_INNER (mode);
14063 n_elts = GET_MODE_NUNITS (mode);
14064 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14065 n_elt_per_word = n_elts / n_words;
14066 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14067
14068 for (i = 0; i < n_words; ++i)
14069 {
14070 rtx word = NULL_RTX;
14071
14072 for (j = 0; j < n_elt_per_word; ++j)
14073 {
14074 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14075 elt = convert_modes (word_mode, inner_mode, elt, true);
14076
14077 if (j == 0)
14078 word = elt;
14079 else
14080 {
14081 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14082 NULL_RTX, 1, OPTAB_LIB_WIDEN);
14083 word = expand_simple_binop (word_mode, IOR, word, elt,
14084 NULL_RTX, 1, OPTAB_LIB_WIDEN);
14085 }
14086 }
14087
14088 words[i] = word;
14089 }
14090
14091 if (n_words == 1)
14092 emit_move_insn (target, gen_lowpart (mode, words[0]));
14093 else if (n_words == 2)
14094 {
14095 rtx tmp = gen_reg_rtx (mode);
14096 emit_clobber (tmp);
14097 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14098 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14099 emit_move_insn (target, tmp);
14100 }
14101 else if (n_words == 4)
14102 {
14103 rtx tmp = gen_reg_rtx (V4SImode);
14104 gcc_assert (word_mode == SImode);
14105 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14106 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14107 emit_move_insn (target, gen_lowpart (mode, tmp));
14108 }
14109 else
14110 gcc_unreachable ();
14111 }
14112 }
14113
14114 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14115 instructions unless MMX_OK is true. */
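/* A minimal usage sketch (A, B, C and D stand for already-expanded SImode
   rtxes):

     rtx target = gen_reg_rtx (V4SImode);
     rtvec v = gen_rtvec (4, a, b, c, d);
     ix86_expand_vector_init (false, target,
                              gen_rtx_PARALLEL (V4SImode, v));

   The code below then picks a constant-pool load, a broadcast, a
   single-element insert or the fully general construction, whichever
   applies.  */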
14116
14117 void
14118 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14119 {
14120 machine_mode mode = GET_MODE (target);
14121 machine_mode inner_mode = GET_MODE_INNER (mode);
14122 int n_elts = GET_MODE_NUNITS (mode);
14123 int n_var = 0, one_var = -1;
14124 bool all_same = true, all_const_zero = true;
14125 int i;
14126 rtx x;
14127
14128 /* Handle initialization from vector elements first. */
14129 if (n_elts != XVECLEN (vals, 0))
14130 {
14131 rtx subtarget = target;
14132 x = XVECEXP (vals, 0, 0);
14133 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14134 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14135 {
14136 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14137 if (inner_mode == QImode
14138 || inner_mode == HImode
14139 || inner_mode == TImode)
14140 {
14141 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14142 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
14143 n_bits /= GET_MODE_SIZE (elt_mode);
14144 mode = mode_for_vector (elt_mode, n_bits).require ();
14145 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
14146 ops[0] = gen_lowpart (inner_mode, ops[0]);
14147 ops[1] = gen_lowpart (inner_mode, ops[1]);
14148 subtarget = gen_reg_rtx (mode);
14149 }
14150 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14151 if (subtarget != target)
14152 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14153 return;
14154 }
14155 gcc_unreachable ();
14156 }
14157
14158 for (i = 0; i < n_elts; ++i)
14159 {
14160 x = XVECEXP (vals, 0, i);
14161 if (!(CONST_SCALAR_INT_P (x)
14162 || CONST_DOUBLE_P (x)
14163 || CONST_FIXED_P (x)))
14164 n_var++, one_var = i;
14165 else if (x != CONST0_RTX (inner_mode))
14166 all_const_zero = false;
14167 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14168 all_same = false;
14169 }
14170
14171 /* Constants are best loaded from the constant pool. */
14172 if (n_var == 0)
14173 {
14174 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14175 return;
14176 }
14177
14178 /* If all values are identical, broadcast the value. */
14179 if (all_same
14180 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14181 XVECEXP (vals, 0, 0)))
14182 return;
14183
14184 /* Values where only one field is non-constant are best loaded from
14185 the pool and overwritten via move later. */
14186 if (n_var == 1)
14187 {
14188 if (all_const_zero
14189 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14190 XVECEXP (vals, 0, one_var),
14191 one_var))
14192 return;
14193
14194 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14195 return;
14196 }
14197
14198 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14199 }
14200
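/* Set element ELT of vector TARGET to VAL.  Use MMX instructions only if
   MMX_OK.  Where a blend or insert pattern is available this emits the
   canonical form, e.g. for element 2 of a V8HImode vector roughly

     (set (reg:V8HI target)
          (vec_merge:V8HI (vec_duplicate:V8HI (reg:HI val))
                          (reg:V8HI target)
                          (const_int 4)))

   with the immediate mask being 1 << ELT; otherwise it falls back to
   shuffles or a store/reload through a stack temporary.  */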
14201 void
14202 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14203 {
14204 machine_mode mode = GET_MODE (target);
14205 machine_mode inner_mode = GET_MODE_INNER (mode);
14206 machine_mode half_mode;
14207 bool use_vec_merge = false;
14208 rtx tmp;
14209 static rtx (*gen_extract[6][2]) (rtx, rtx)
14210 = {
14211 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14212 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14213 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14214 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14215 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14216 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14217 };
14218 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14219 = {
14220 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14221 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14222 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14223 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14224 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14225 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14226 };
14227 int i, j, n;
14228 machine_mode mmode = VOIDmode;
14229 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14230
14231 switch (mode)
14232 {
14233 case E_V2SImode:
14234 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14235 if (use_vec_merge)
14236 break;
14237 /* FALLTHRU */
14238
14239 case E_V2SFmode:
14240 if (mmx_ok)
14241 {
14242 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14243 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14244 if (elt == 0)
14245 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14246 else
14247 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14248 emit_insn (gen_rtx_SET (target, tmp));
14249 return;
14250 }
14251 break;
14252
14253 case E_V2DImode:
14254 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14255 if (use_vec_merge)
14256 break;
14257
14258 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14259 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14260 if (elt == 0)
14261 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14262 else
14263 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14264 emit_insn (gen_rtx_SET (target, tmp));
14265 return;
14266
14267 case E_V2DFmode:
14268 /* NB: For ELT == 0, use standard scalar operation patterns which
14269 preserve the rest of the vector for combiner:
14270
14271 (vec_merge:V2DF
14272 (vec_duplicate:V2DF (reg:DF))
14273 (reg:V2DF)
14274 (const_int 1))
14275 */
14276 if (elt == 0)
14277 goto do_vec_merge;
14278
14279 {
14280 rtx op0, op1;
14281
14282 /* For the two element vectors, we implement a VEC_CONCAT with
14283 the extraction of the other element. */
14284
14285 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14286 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14287
14288 if (elt == 0)
14289 op0 = val, op1 = tmp;
14290 else
14291 op0 = tmp, op1 = val;
14292
14293 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14294 emit_insn (gen_rtx_SET (target, tmp));
14295 }
14296 return;
14297
14298 case E_V4SFmode:
14299 use_vec_merge = TARGET_SSE4_1;
14300 if (use_vec_merge)
14301 break;
14302
14303 switch (elt)
14304 {
14305 case 0:
14306 use_vec_merge = true;
14307 break;
14308
14309 case 1:
14310 /* tmp = target = A B C D */
14311 tmp = copy_to_reg (target);
14312 /* target = A A B B */
14313 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14314 /* target = X A B B */
14315 ix86_expand_vector_set (false, target, val, 0);
14316 /* target = A X C D */
14317 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14318 const1_rtx, const0_rtx,
14319 GEN_INT (2+4), GEN_INT (3+4)));
14320 return;
14321
14322 case 2:
14323 /* tmp = target = A B C D */
14324 tmp = copy_to_reg (target);
14325 /* tmp = X B C D */
14326 ix86_expand_vector_set (false, tmp, val, 0);
14327 /* target = A B X D */
14328 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14329 const0_rtx, const1_rtx,
14330 GEN_INT (0+4), GEN_INT (3+4)));
14331 return;
14332
14333 case 3:
14334 /* tmp = target = A B C D */
14335 tmp = copy_to_reg (target);
14336 /* tmp = X B C D */
14337 ix86_expand_vector_set (false, tmp, val, 0);
14338 /* target = A B X D */
14339 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14340 const0_rtx, const1_rtx,
14341 GEN_INT (2+4), GEN_INT (0+4)));
14342 return;
14343
14344 default:
14345 gcc_unreachable ();
14346 }
14347 break;
14348
14349 case E_V4SImode:
14350 use_vec_merge = TARGET_SSE4_1;
14351 if (use_vec_merge)
14352 break;
14353
14354 /* Element 0 handled by vec_merge below. */
14355 if (elt == 0)
14356 {
14357 use_vec_merge = true;
14358 break;
14359 }
14360
14361 if (TARGET_SSE2)
14362 {
14363 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14364 store into element 0, then shuffle them back. */
14365
14366 rtx order[4];
14367
14368 order[0] = GEN_INT (elt);
14369 order[1] = const1_rtx;
14370 order[2] = const2_rtx;
14371 order[3] = GEN_INT (3);
14372 order[elt] = const0_rtx;
14373
14374 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14375 order[1], order[2], order[3]));
14376
14377 ix86_expand_vector_set (false, target, val, 0);
14378
14379 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14380 order[1], order[2], order[3]));
14381 }
14382 else
14383 {
14384 /* For SSE1, we have to reuse the V4SF code. */
14385 rtx t = gen_reg_rtx (V4SFmode);
14386 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14387 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14388 emit_move_insn (target, gen_lowpart (mode, t));
14389 }
14390 return;
14391
14392 case E_V8HImode:
14393 use_vec_merge = TARGET_SSE2;
14394 break;
14395 case E_V4HImode:
14396 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14397 break;
14398
14399 case E_V16QImode:
14400 use_vec_merge = TARGET_SSE4_1;
14401 break;
14402
14403 case E_V8QImode:
14404 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14405 break;
14406
14407 case E_V32QImode:
14408 half_mode = V16QImode;
14409 j = 0;
14410 n = 16;
14411 goto half;
14412
14413 case E_V16HImode:
14414 half_mode = V8HImode;
14415 j = 1;
14416 n = 8;
14417 goto half;
14418
14419 case E_V8SImode:
14420 half_mode = V4SImode;
14421 j = 2;
14422 n = 4;
14423 goto half;
14424
14425 case E_V4DImode:
14426 half_mode = V2DImode;
14427 j = 3;
14428 n = 2;
14429 goto half;
14430
14431 case E_V8SFmode:
14432 half_mode = V4SFmode;
14433 j = 4;
14434 n = 4;
14435 goto half;
14436
14437 case E_V4DFmode:
14438 half_mode = V2DFmode;
14439 j = 5;
14440 n = 2;
14441 goto half;
14442
14443 half:
14444 /* Compute offset. */
14445 i = elt / n;
14446 elt %= n;
14447
14448 gcc_assert (i <= 1);
14449
14450 /* Extract the half. */
14451 tmp = gen_reg_rtx (half_mode);
14452 emit_insn (gen_extract[j][i] (tmp, target));
14453
14454 /* Put val in tmp at elt. */
14455 ix86_expand_vector_set (false, tmp, val, elt);
14456
14457 /* Put it back. */
14458 emit_insn (gen_insert[j][i] (target, target, tmp));
14459 return;
14460
14461 case E_V8DFmode:
14462 if (TARGET_AVX512F)
14463 {
14464 mmode = QImode;
14465 gen_blendm = gen_avx512f_blendmv8df;
14466 }
14467 break;
14468
14469 case E_V8DImode:
14470 if (TARGET_AVX512F)
14471 {
14472 mmode = QImode;
14473 gen_blendm = gen_avx512f_blendmv8di;
14474 }
14475 break;
14476
14477 case E_V16SFmode:
14478 if (TARGET_AVX512F)
14479 {
14480 mmode = HImode;
14481 gen_blendm = gen_avx512f_blendmv16sf;
14482 }
14483 break;
14484
14485 case E_V16SImode:
14486 if (TARGET_AVX512F)
14487 {
14488 mmode = HImode;
14489 gen_blendm = gen_avx512f_blendmv16si;
14490 }
14491 break;
14492
14493 case E_V32HImode:
14494 if (TARGET_AVX512BW)
14495 {
14496 mmode = SImode;
14497 gen_blendm = gen_avx512bw_blendmv32hi;
14498 }
14499 else if (TARGET_AVX512F)
14500 {
14501 half_mode = E_V8HImode;
14502 n = 8;
14503 goto quarter;
14504 }
14505 break;
14506
14507 case E_V64QImode:
14508 if (TARGET_AVX512BW)
14509 {
14510 mmode = DImode;
14511 gen_blendm = gen_avx512bw_blendmv64qi;
14512 }
14513 else if (TARGET_AVX512F)
14514 {
14515 half_mode = E_V16QImode;
14516 n = 16;
14517 goto quarter;
14518 }
14519 break;
14520
14521 quarter:
14522 /* Compute offset. */
14523 i = elt / n;
14524 elt %= n;
14525
14526 gcc_assert (i <= 3);
14527
14528 {
14529 /* Extract the quarter. */
14530 tmp = gen_reg_rtx (V4SImode);
14531 rtx tmp2 = gen_lowpart (V16SImode, target);
14532 rtx mask = gen_reg_rtx (QImode);
14533
14534 emit_move_insn (mask, constm1_rtx);
14535 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14536 tmp, mask));
14537
14538 tmp2 = gen_reg_rtx (half_mode);
14539 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14540 tmp = tmp2;
14541
14542 /* Put val in tmp at elt. */
14543 ix86_expand_vector_set (false, tmp, val, elt);
14544
14545 /* Put it back. */
14546 tmp2 = gen_reg_rtx (V16SImode);
14547 rtx tmp3 = gen_lowpart (V16SImode, target);
14548 mask = gen_reg_rtx (HImode);
14549 emit_move_insn (mask, constm1_rtx);
14550 tmp = gen_lowpart (V4SImode, tmp);
14551 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14552 tmp3, mask));
14553 emit_move_insn (target, gen_lowpart (mode, tmp2));
14554 }
14555 return;
14556
14557 default:
14558 break;
14559 }
14560
14561 if (mmode != VOIDmode)
14562 {
14563 tmp = gen_reg_rtx (mode);
14564 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14565 /* The avx512*_blendm<mode> expanders have different operand order
14566 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14567 elements where the mask is set and second input operand otherwise,
14568 in {sse,avx}*_*blend* the first input operand is used for elements
14569 where the mask is clear and second input operand otherwise. */
14570 emit_insn (gen_blendm (target, target, tmp,
14571 force_reg (mmode,
14572 gen_int_mode (HOST_WIDE_INT_1U << elt,
14573 mmode))));
14574 }
14575 else if (use_vec_merge)
14576 {
14577 do_vec_merge:
14578 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14579 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14580 GEN_INT (HOST_WIDE_INT_1U << elt));
14581 emit_insn (gen_rtx_SET (target, tmp));
14582 }
14583 else
14584 {
14585 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14586
14587 emit_move_insn (mem, target);
14588
14589 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14590 emit_move_insn (tmp, val);
14591
14592 emit_move_insn (target, mem);
14593 }
14594 }
14595
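/* Extract element ELT of vector VEC into TARGET.  Use MMX instructions
   only if MMX_OK.  Where a direct extract is available this emits roughly

     (set (reg:DI target)
          (vec_select:DI (reg:V2DI vec)
                         (parallel [(const_int 1)])))

   (shown here for element 1 of a V2DImode vector), with QImode and HImode
   results zero-extended to SImode so the rtl optimizers can see the
   extension; otherwise it goes through shuffles or a stack temporary.  */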
14596 void
14597 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14598 {
14599 machine_mode mode = GET_MODE (vec);
14600 machine_mode inner_mode = GET_MODE_INNER (mode);
14601 bool use_vec_extr = false;
14602 rtx tmp;
14603
14604 switch (mode)
14605 {
14606 case E_V2SImode:
14607 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14608 if (use_vec_extr)
14609 break;
14610 /* FALLTHRU */
14611
14612 case E_V2SFmode:
14613 if (!mmx_ok)
14614 break;
14615 /* FALLTHRU */
14616
14617 case E_V2DFmode:
14618 case E_V2DImode:
14619 case E_V2TImode:
14620 case E_V4TImode:
14621 use_vec_extr = true;
14622 break;
14623
14624 case E_V4SFmode:
14625 use_vec_extr = TARGET_SSE4_1;
14626 if (use_vec_extr)
14627 break;
14628
14629 switch (elt)
14630 {
14631 case 0:
14632 tmp = vec;
14633 break;
14634
14635 case 1:
14636 case 3:
14637 tmp = gen_reg_rtx (mode);
14638 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14639 GEN_INT (elt), GEN_INT (elt),
14640 GEN_INT (elt+4), GEN_INT (elt+4)));
14641 break;
14642
14643 case 2:
14644 tmp = gen_reg_rtx (mode);
14645 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14646 break;
14647
14648 default:
14649 gcc_unreachable ();
14650 }
14651 vec = tmp;
14652 use_vec_extr = true;
14653 elt = 0;
14654 break;
14655
14656 case E_V4SImode:
14657 use_vec_extr = TARGET_SSE4_1;
14658 if (use_vec_extr)
14659 break;
14660
14661 if (TARGET_SSE2)
14662 {
14663 switch (elt)
14664 {
14665 case 0:
14666 tmp = vec;
14667 break;
14668
14669 case 1:
14670 case 3:
14671 tmp = gen_reg_rtx (mode);
14672 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14673 GEN_INT (elt), GEN_INT (elt),
14674 GEN_INT (elt), GEN_INT (elt)));
14675 break;
14676
14677 case 2:
14678 tmp = gen_reg_rtx (mode);
14679 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14680 break;
14681
14682 default:
14683 gcc_unreachable ();
14684 }
14685 vec = tmp;
14686 use_vec_extr = true;
14687 elt = 0;
14688 }
14689 else
14690 {
14691 /* For SSE1, we have to reuse the V4SF code. */
14692 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14693 gen_lowpart (V4SFmode, vec), elt);
14694 return;
14695 }
14696 break;
14697
14698 case E_V8HImode:
14699 use_vec_extr = TARGET_SSE2;
14700 break;
14701 case E_V4HImode:
14702 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14703 break;
14704
14705 case E_V16QImode:
14706 use_vec_extr = TARGET_SSE4_1;
14707 if (!use_vec_extr
14708 && TARGET_SSE2
14709 && elt == 0
14710 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14711 {
14712 tmp = gen_reg_rtx (SImode);
14713 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14714 0);
14715 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14716 return;
14717 }
14718 break;
14719
14720 case E_V8SFmode:
14721 if (TARGET_AVX)
14722 {
14723 tmp = gen_reg_rtx (V4SFmode);
14724 if (elt < 4)
14725 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14726 else
14727 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14728 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14729 return;
14730 }
14731 break;
14732
14733 case E_V4DFmode:
14734 if (TARGET_AVX)
14735 {
14736 tmp = gen_reg_rtx (V2DFmode);
14737 if (elt < 2)
14738 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14739 else
14740 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14741 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14742 return;
14743 }
14744 break;
14745
14746 case E_V32QImode:
14747 if (TARGET_AVX)
14748 {
14749 tmp = gen_reg_rtx (V16QImode);
14750 if (elt < 16)
14751 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14752 else
14753 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14754 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14755 return;
14756 }
14757 break;
14758
14759 case E_V16HImode:
14760 if (TARGET_AVX)
14761 {
14762 tmp = gen_reg_rtx (V8HImode);
14763 if (elt < 8)
14764 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14765 else
14766 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14767 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14768 return;
14769 }
14770 break;
14771
14772 case E_V8SImode:
14773 if (TARGET_AVX)
14774 {
14775 tmp = gen_reg_rtx (V4SImode);
14776 if (elt < 4)
14777 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14778 else
14779 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14780 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14781 return;
14782 }
14783 break;
14784
14785 case E_V4DImode:
14786 if (TARGET_AVX)
14787 {
14788 tmp = gen_reg_rtx (V2DImode);
14789 if (elt < 2)
14790 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14791 else
14792 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14793 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14794 return;
14795 }
14796 break;
14797
14798 case E_V32HImode:
14799 if (TARGET_AVX512BW)
14800 {
14801 tmp = gen_reg_rtx (V16HImode);
14802 if (elt < 16)
14803 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14804 else
14805 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14806 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14807 return;
14808 }
14809 break;
14810
14811 case E_V64QImode:
14812 if (TARGET_AVX512BW)
14813 {
14814 tmp = gen_reg_rtx (V32QImode);
14815 if (elt < 32)
14816 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14817 else
14818 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14819 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14820 return;
14821 }
14822 break;
14823
14824 case E_V16SFmode:
14825 tmp = gen_reg_rtx (V8SFmode);
14826 if (elt < 8)
14827 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14828 else
14829 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14830 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14831 return;
14832
14833 case E_V8DFmode:
14834 tmp = gen_reg_rtx (V4DFmode);
14835 if (elt < 4)
14836 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14837 else
14838 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14839 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14840 return;
14841
14842 case E_V16SImode:
14843 tmp = gen_reg_rtx (V8SImode);
14844 if (elt < 8)
14845 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14846 else
14847 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14848 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14849 return;
14850
14851 case E_V8DImode:
14852 tmp = gen_reg_rtx (V4DImode);
14853 if (elt < 4)
14854 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14855 else
14856 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14857 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14858 return;
14859
14860 case E_V8QImode:
14861 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14862 /* ??? Could extract the appropriate HImode element and shift. */
14863 break;
14864
14865 default:
14866 break;
14867 }
14868
14869 if (use_vec_extr)
14870 {
14871 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14872 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14873
14874 /* Let the rtl optimizers know about the zero extension performed. */
14875 if (inner_mode == QImode || inner_mode == HImode)
14876 {
14877 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14878 target = gen_lowpart (SImode, target);
14879 }
14880
14881 emit_insn (gen_rtx_SET (target, tmp));
14882 }
14883 else
14884 {
14885 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14886
14887 emit_move_insn (mem, vec);
14888
14889 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14890 emit_move_insn (target, tmp);
14891 }
14892 }
14893
14894 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14895 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14896 The upper bits of DEST are undefined, though they shouldn't cause
14897 exceptions (some bits from src or all zeros are ok). */
14898
14899 static void
14900 emit_reduc_half (rtx dest, rtx src, int i)
14901 {
14902 rtx tem, d = dest;
14903 switch (GET_MODE (src))
14904 {
14905 case E_V4SFmode:
14906 if (i == 128)
14907 tem = gen_sse_movhlps (dest, src, src);
14908 else
14909 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14910 GEN_INT (1 + 4), GEN_INT (1 + 4));
14911 break;
14912 case E_V2DFmode:
14913 tem = gen_vec_interleave_highv2df (dest, src, src);
14914 break;
14915 case E_V16QImode:
14916 case E_V8HImode:
14917 case E_V4SImode:
14918 case E_V2DImode:
14919 d = gen_reg_rtx (V1TImode);
14920 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14921 GEN_INT (i / 2));
14922 break;
14923 case E_V8SFmode:
14924 if (i == 256)
14925 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14926 else
14927 tem = gen_avx_shufps256 (dest, src, src,
14928 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14929 break;
14930 case E_V4DFmode:
14931 if (i == 256)
14932 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14933 else
14934 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14935 break;
14936 case E_V32QImode:
14937 case E_V16HImode:
14938 case E_V8SImode:
14939 case E_V4DImode:
14940 if (i == 256)
14941 {
14942 if (GET_MODE (dest) != V4DImode)
14943 d = gen_reg_rtx (V4DImode);
14944 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14945 gen_lowpart (V4DImode, src),
14946 const1_rtx);
14947 }
14948 else
14949 {
14950 d = gen_reg_rtx (V2TImode);
14951 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14952 GEN_INT (i / 2));
14953 }
14954 break;
14955 case E_V64QImode:
14956 case E_V32HImode:
14957 if (i < 64)
14958 {
14959 d = gen_reg_rtx (V4TImode);
14960 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
14961 GEN_INT (i / 2));
14962 break;
14963 }
14964 /* FALLTHRU */
14965 case E_V16SImode:
14966 case E_V16SFmode:
14967 case E_V8DImode:
14968 case E_V8DFmode:
14969 if (i > 128)
14970 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14971 gen_lowpart (V16SImode, src),
14972 gen_lowpart (V16SImode, src),
14973 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14974 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14975 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14976 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14977 GEN_INT (0xC), GEN_INT (0xD),
14978 GEN_INT (0xE), GEN_INT (0xF),
14979 GEN_INT (0x10), GEN_INT (0x11),
14980 GEN_INT (0x12), GEN_INT (0x13),
14981 GEN_INT (0x14), GEN_INT (0x15),
14982 GEN_INT (0x16), GEN_INT (0x17));
14983 else
14984 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14985 gen_lowpart (V16SImode, src),
14986 GEN_INT (i == 128 ? 0x2 : 0x1),
14987 GEN_INT (0x3),
14988 GEN_INT (0x3),
14989 GEN_INT (0x3),
14990 GEN_INT (i == 128 ? 0x6 : 0x5),
14991 GEN_INT (0x7),
14992 GEN_INT (0x7),
14993 GEN_INT (0x7),
14994 GEN_INT (i == 128 ? 0xA : 0x9),
14995 GEN_INT (0xB),
14996 GEN_INT (0xB),
14997 GEN_INT (0xB),
14998 GEN_INT (i == 128 ? 0xE : 0xD),
14999 GEN_INT (0xF),
15000 GEN_INT (0xF),
15001 GEN_INT (0xF));
15002 break;
15003 default:
15004 gcc_unreachable ();
15005 }
15006 emit_insn (tem);
15007 if (d != dest)
15008 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15009 }
15010
15011 /* Expand a vector reduction. FN is the binary pattern to reduce;
15012 DEST is the destination; IN is the input vector. */
15013
15014 void
15015 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15016 {
15017 rtx half, dst, vec = in;
15018 machine_mode mode = GET_MODE (in);
15019 int i;
15020
15021 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15022 if (TARGET_SSE4_1
15023 && mode == V8HImode
15024 && fn == gen_uminv8hi3)
15025 {
15026 emit_insn (gen_sse4_1_phminposuw (dest, in));
15027 return;
15028 }
15029
15030 for (i = GET_MODE_BITSIZE (mode);
15031 i > GET_MODE_UNIT_BITSIZE (mode);
15032 i >>= 1)
15033 {
15034 half = gen_reg_rtx (mode);
15035 emit_reduc_half (half, vec, i);
15036 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15037 dst = dest;
15038 else
15039 dst = gen_reg_rtx (mode);
15040 emit_insn (fn (dst, half, vec));
15041 vec = dst;
15042 }
15043 }
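
/* A plain C sketch of the halving reduction performed above, assuming
   <math.h> and <string.h>, a power-of-two element count and fmax as the
   binary operation (reduce_max and its arguments are illustrative names):

     double reduce_max (const double *v, unsigned n)
     {
       double tmp[64];
       memcpy (tmp, v, n * sizeof *v);    // n <= 64, n a power of two
       for (unsigned half = n / 2; half >= 1; half /= 2)
         for (unsigned j = 0; j < half; j++)
           tmp[j] = fmax (tmp[j], tmp[j + half]);
       return tmp[0];                     // reduced value ends up in element 0
     }

   Each outer iteration corresponds to one emit_reduc_half plus one
   application of FN in the loop above.  */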
15044
15045 /* Output code to perform a conditional jump to LABEL if the C2 flag in
15046 the FP status register is set. */
15047
15048 void
15049 ix86_emit_fp_unordered_jump (rtx label)
15050 {
15051 rtx reg = gen_reg_rtx (HImode);
15052 rtx_insn *insn;
15053 rtx temp;
15054
15055 emit_insn (gen_x86_fnstsw_1 (reg));
15056
15057 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15058 {
15059 emit_insn (gen_x86_sahf_1 (reg));
15060
15061 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15062 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15063 }
15064 else
15065 {
15066 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15067
15068 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15069 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15070 }
15071
15072 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15073 gen_rtx_LABEL_REF (VOIDmode, label),
15074 pc_rtx);
15075 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15076 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15077 JUMP_LABEL (insn) = label;
15078 }
15079
15080 /* Output code to perform a sinh XFmode calculation. */
15081
15082 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15083 {
15084 rtx e1 = gen_reg_rtx (XFmode);
15085 rtx e2 = gen_reg_rtx (XFmode);
15086 rtx scratch = gen_reg_rtx (HImode);
15087 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15088 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15089 rtx cst1, tmp;
15090 rtx_code_label *jump_label = gen_label_rtx ();
15091 rtx_insn *insn;
15092
15093 /* scratch = fxam (op1) */
15094 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15095
15096 /* e1 = expm1 (|op1|) */
15097 emit_insn (gen_absxf2 (e2, op1));
15098 emit_insn (gen_expm1xf2 (e1, e2));
15099
15100 /* e2 = e1 / (e1 + 1.0) + e1 */
15101 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15102 emit_insn (gen_addxf3 (e2, e1, cst1));
15103 emit_insn (gen_divxf3 (e2, e1, e2));
15104 emit_insn (gen_addxf3 (e2, e2, e1));
15105
15106 /* flags = signbit (op1) */
15107 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15108
15109 /* if (flags) then e2 = -e2 */
15110 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15111 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15112 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15113 pc_rtx);
15114 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15115 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15116 JUMP_LABEL (insn) = jump_label;
15117
15118 emit_insn (gen_negxf2 (e2, e2));
15119
15120 emit_label (jump_label);
15121 LABEL_NUSES (jump_label) = 1;
15122
15123 /* op0 = 0.5 * e2 */
15124 half = force_reg (XFmode, half);
15125 emit_insn (gen_mulxf3 (op0, e2, half));
15126 }
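
/* A plain C sketch of the sinh identity expanded above, assuming <math.h>
   (sinh_model is an illustrative name only):

     double sinh_model (double x)
     {
       double e = expm1 (fabs (x));            // exp(|x|) - 1, accurate near 0
       double r = 0.5 * (e / (e + 1.0) + e);   // == sinh (|x|)
       return copysign (r, x);                 // sinh is odd
     }

   since e / (e + 1) + e == exp(|x|) - exp(-|x|) == 2 * sinh (|x|).  */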
15127
15128 /* Output code to perform a cosh XFmode calculation. */
15129
15130 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15131 {
15132 rtx e1 = gen_reg_rtx (XFmode);
15133 rtx e2 = gen_reg_rtx (XFmode);
15134 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15135 rtx cst1;
15136
15137 /* e1 = exp (op1) */
15138 emit_insn (gen_expxf2 (e1, op1));
15139
15140 /* e2 = e1 + 1.0 / e1 */
15141 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15142 emit_insn (gen_divxf3 (e2, cst1, e1));
15143 emit_insn (gen_addxf3 (e2, e1, e2));
15144
15145 /* op0 = 0.5 * e2 */
15146 half = force_reg (XFmode, half);
15147 emit_insn (gen_mulxf3 (op0, e2, half));
15148 }
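
/* A plain C sketch of the cosh identity expanded above, assuming <math.h>:

     double cosh_model (double x)
     {
       double e = exp (x);
       return 0.5 * (e + 1.0 / e);   // == cosh (x)
     }  */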
15149
15150 /* Output code to perform a tanh XFmode calculation. */
15151
15152 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15153 {
15154 rtx e1 = gen_reg_rtx (XFmode);
15155 rtx e2 = gen_reg_rtx (XFmode);
15156 rtx scratch = gen_reg_rtx (HImode);
15157 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15158 rtx cst2, tmp;
15159 rtx_code_label *jump_label = gen_label_rtx ();
15160 rtx_insn *insn;
15161
15162 /* scratch = fxam (op1) */
15163 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15164
15165 /* e1 = expm1 (-|2 * op1|) */
15166 emit_insn (gen_addxf3 (e2, op1, op1));
15167 emit_insn (gen_absxf2 (e2, e2));
15168 emit_insn (gen_negxf2 (e2, e2));
15169 emit_insn (gen_expm1xf2 (e1, e2));
15170
15171 /* e2 = e1 / (e1 + 2.0) */
15172 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15173 emit_insn (gen_addxf3 (e2, e1, cst2));
15174 emit_insn (gen_divxf3 (e2, e1, e2));
15175
15176 /* flags = signbit (op1) */
15177 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15178
15179 /* if (!flags) then e2 = -e2 */
15180 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15181 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15182 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15183 pc_rtx);
15184 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15185 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15186 JUMP_LABEL (insn) = jump_label;
15187
15188 emit_insn (gen_negxf2 (e2, e2));
15189
15190 emit_label (jump_label);
15191 LABEL_NUSES (jump_label) = 1;
15192
15193 emit_move_insn (op0, e2);
15194 }
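
/* A plain C sketch of the tanh identity expanded above, assuming <math.h>
   (tanh_model is an illustrative name only):

     double tanh_model (double x)
     {
       double e = expm1 (-2.0 * fabs (x));   // exp(-2|x|) - 1, in [-1, 0]
       double r = -e / (e + 2.0);            // == tanh (|x|)
       return copysign (r, x);               // tanh is odd
     }  */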
15195
15196 /* Output code to perform an asinh XFmode calculation. */
15197
15198 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15199 {
15200 rtx e1 = gen_reg_rtx (XFmode);
15201 rtx e2 = gen_reg_rtx (XFmode);
15202 rtx scratch = gen_reg_rtx (HImode);
15203 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15204 rtx cst1, tmp;
15205 rtx_code_label *jump_label = gen_label_rtx ();
15206 rtx_insn *insn;
15207
15208 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15209 emit_insn (gen_mulxf3 (e1, op1, op1));
15210 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15211 emit_insn (gen_addxf3 (e2, e1, cst1));
15212 emit_insn (gen_sqrtxf2 (e2, e2));
15213 emit_insn (gen_addxf3 (e2, e2, cst1));
15214
15215 /* e1 = e1 / e2 */
15216 emit_insn (gen_divxf3 (e1, e1, e2));
15217
15218 /* scratch = fxam (op1) */
15219 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15220
15221 /* e1 = e1 + |op1| */
15222 emit_insn (gen_absxf2 (e2, op1));
15223 emit_insn (gen_addxf3 (e1, e1, e2));
15224
15225 /* e2 = log1p (e1) */
15226 ix86_emit_i387_log1p (e2, e1);
15227
15228 /* flags = signbit (op1) */
15229 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15230
15231 /* if (flags) then e2 = -e2 */
15232 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15233 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15234 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15235 pc_rtx);
15236 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15237 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15238 JUMP_LABEL (insn) = jump_label;
15239
15240 emit_insn (gen_negxf2 (e2, e2));
15241
15242 emit_label (jump_label);
15243 LABEL_NUSES (jump_label) = 1;
15244
15245 emit_move_insn (op0, e2);
15246 }
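
/* A plain C sketch of the asinh identity expanded above, assuming <math.h>:

     double asinh_model (double x)
     {
       double t = fabs (x);
       double r = log1p (t + t * t / (sqrt (t * t + 1.0) + 1.0));
       return copysign (r, x);               // asinh is odd
     }

   using t*t / (sqrt (t*t + 1) + 1) == sqrt (t*t + 1) - 1, so the log1p
   argument is |x| + sqrt (x*x + 1) - 1.  */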
15247
15248 /* Output code to perform an acosh XFmode calculation. */
15249
15250 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15251 {
15252 rtx e1 = gen_reg_rtx (XFmode);
15253 rtx e2 = gen_reg_rtx (XFmode);
15254 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15255
15256 /* e2 = sqrt (op1 + 1.0) */
15257 emit_insn (gen_addxf3 (e2, op1, cst1));
15258 emit_insn (gen_sqrtxf2 (e2, e2));
15259
15260 /* e1 = sqrt (op1 - 1.0) */
15261 emit_insn (gen_subxf3 (e1, op1, cst1));
15262 emit_insn (gen_sqrtxf2 (e1, e1));
15263
15264 /* e1 = e1 * e2 */
15265 emit_insn (gen_mulxf3 (e1, e1, e2));
15266
15267 /* e1 = e1 + op1 */
15268 emit_insn (gen_addxf3 (e1, e1, op1));
15269
15270 /* op0 = log (e1) */
15271 emit_insn (gen_logxf2 (op0, e1));
15272 }
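
/* A plain C sketch of the acosh identity expanded above, assuming <math.h>
   and x >= 1.0:

     double acosh_model (double x)
     {
       return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
     }  */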
15273
15274 /* Output code to perform an atanh XFmode calculation. */
15275
15276 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15277 {
15278 rtx e1 = gen_reg_rtx (XFmode);
15279 rtx e2 = gen_reg_rtx (XFmode);
15280 rtx scratch = gen_reg_rtx (HImode);
15281 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15282 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15283 rtx cst1, tmp;
15284 rtx_code_label *jump_label = gen_label_rtx ();
15285 rtx_insn *insn;
15286
15287 /* scratch = fxam (op1) */
15288 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15289
15290 /* e2 = |op1| */
15291 emit_insn (gen_absxf2 (e2, op1));
15292
15293 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15294 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15295 emit_insn (gen_addxf3 (e1, e2, cst1));
15296 emit_insn (gen_addxf3 (e2, e2, e2));
15297 emit_insn (gen_negxf2 (e2, e2));
15298 emit_insn (gen_divxf3 (e1, e2, e1));
15299
15300 /* e2 = log1p (e1) */
15301 ix86_emit_i387_log1p (e2, e1);
15302
15303 /* flags = signbit (op1) */
15304 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15305
15306 /* if (!flags) then e2 = -e2 */
15307 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15308 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15309 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15310 pc_rtx);
15311 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15312 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15313 JUMP_LABEL (insn) = jump_label;
15314
15315 emit_insn (gen_negxf2 (e2, e2));
15316
15317 emit_label (jump_label);
15318 LABEL_NUSES (jump_label) = 1;
15319
15320 /* op0 = 0.5 * e2 */
15321 half = force_reg (XFmode, half);
15322 emit_insn (gen_mulxf3 (op0, e2, half));
15323 }
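
/* A plain C sketch of the atanh identity expanded above, assuming <math.h>
   and |x| < 1.0:

     double atanh_model (double x)
     {
       double t = fabs (x);
       double r = -0.5 * log1p (-2.0 * t / (t + 1.0));   // == atanh (t)
       return copysign (r, x);                           // atanh is odd
     }

   since log1p (-2t / (t + 1)) == log ((1 - t) / (1 + t)) == -2 * atanh (t).  */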
15324
15325 /* Output code to perform a log1p XFmode calculation. */
15326
15327 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15328 {
15329 rtx_code_label *label1 = gen_label_rtx ();
15330 rtx_code_label *label2 = gen_label_rtx ();
15331
15332 rtx tmp = gen_reg_rtx (XFmode);
15333 rtx res = gen_reg_rtx (XFmode);
15334 rtx cst, cstln2, cst1;
15335 rtx_insn *insn;
15336
15337 /* The emit_jump call emits any pending stack adjustment; make sure it is
15338 emitted before the conditional jump, otherwise the stack adjustment
15339 would only happen conditionally. */
15340 do_pending_stack_adjust ();
15341
15342 cst = const_double_from_real_value
15343 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15344 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15345
15346 emit_insn (gen_absxf2 (tmp, op1));
15347
15348 cst = force_reg (XFmode, cst);
15349 ix86_expand_branch (GE, tmp, cst, label1);
15350 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15351 insn = get_last_insn ();
15352 JUMP_LABEL (insn) = label1;
15353
15354 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15355 emit_jump (label2);
15356
15357 emit_label (label1);
15358 LABEL_NUSES (label1) = 1;
15359
15360 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15361 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15362 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15363
15364 emit_label (label2);
15365 LABEL_NUSES (label2) = 1;
15366
15367 emit_move_insn (op0, res);
15368 }
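
/* A plain C sketch of the branch above, assuming <math.h>; the threshold
   0.2928... is 1 - sqrt(2)/2, below which the fyl2xp1 path is used for
   accuracy near zero and above which 1 + x may be formed explicitly:

     double log1p_model (double x)
     {
       const double thresh = 0.29289321881345247561810596348408353;
       if (fabs (x) < thresh)
         return log1p (x);        // fyl2xp1 path
       else
         return log (1.0 + x);    // fyl2x path on 1 + x
     }  */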
15369
15370 /* Emit i387 code for a round calculation. */
15371 void ix86_emit_i387_round (rtx op0, rtx op1)
15372 {
15373 machine_mode inmode = GET_MODE (op1);
15374 machine_mode outmode = GET_MODE (op0);
15375 rtx e1 = gen_reg_rtx (XFmode);
15376 rtx e2 = gen_reg_rtx (XFmode);
15377 rtx scratch = gen_reg_rtx (HImode);
15378 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15379 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15380 rtx res = gen_reg_rtx (outmode);
15381 rtx_code_label *jump_label = gen_label_rtx ();
15382 rtx (*floor_insn) (rtx, rtx);
15383 rtx (*neg_insn) (rtx, rtx);
15384 rtx_insn *insn;
15385 rtx tmp;
15386
15387 switch (inmode)
15388 {
15389 case E_SFmode:
15390 case E_DFmode:
15391 tmp = gen_reg_rtx (XFmode);
15392
15393 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15394 op1 = tmp;
15395 break;
15396 case E_XFmode:
15397 break;
15398 default:
15399 gcc_unreachable ();
15400 }
15401
15402 switch (outmode)
15403 {
15404 case E_SFmode:
15405 floor_insn = gen_frndintxf2_floor;
15406 neg_insn = gen_negsf2;
15407 break;
15408 case E_DFmode:
15409 floor_insn = gen_frndintxf2_floor;
15410 neg_insn = gen_negdf2;
15411 break;
15412 case E_XFmode:
15413 floor_insn = gen_frndintxf2_floor;
15414 neg_insn = gen_negxf2;
15415 break;
15416 case E_HImode:
15417 floor_insn = gen_lfloorxfhi2;
15418 neg_insn = gen_neghi2;
15419 break;
15420 case E_SImode:
15421 floor_insn = gen_lfloorxfsi2;
15422 neg_insn = gen_negsi2;
15423 break;
15424 case E_DImode:
15425 floor_insn = gen_lfloorxfdi2;
15426 neg_insn = gen_negdi2;
15427 break;
15428 default:
15429 gcc_unreachable ();
15430 }
15431
15432 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15433
15434 /* scratch = fxam(op1) */
15435 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15436
15437 /* e1 = fabs(op1) */
15438 emit_insn (gen_absxf2 (e1, op1));
15439
15440 /* e2 = e1 + 0.5 */
15441 half = force_reg (XFmode, half);
15442 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15443
15444 /* res = floor(e2) */
15445 switch (outmode)
15446 {
15447 case E_SFmode:
15448 case E_DFmode:
15449 {
15450 tmp = gen_reg_rtx (XFmode);
15451
15452 emit_insn (floor_insn (tmp, e2));
15453 emit_insn (gen_rtx_SET (res,
15454 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15455 UNSPEC_TRUNC_NOOP)));
15456 }
15457 break;
15458 default:
15459 emit_insn (floor_insn (res, e2));
15460 }
15461
15462 /* flags = signbit(a) */
15463 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15464
15465 /* if (flags) then res = -res */
15466 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15467 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15468 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15469 pc_rtx);
15470 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15471 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15472 JUMP_LABEL (insn) = jump_label;
15473
15474 emit_insn (neg_insn (res, res));
15475
15476 emit_label (jump_label);
15477 LABEL_NUSES (jump_label) = 1;
15478
15479 emit_move_insn (op0, res);
15480 }
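
/* A plain C sketch of the identity round(a) = sgn(a) * floor(|a| + 0.5)
   used above, assuming <math.h>; it can differ from a correctly rounded
   round () in the last ulp because |a| + 0.5 is itself a rounded sum:

     double round_model (double x)
     {
       return copysign (floor (fabs (x) + 0.5), x);
     }  */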
15481
15482 /* Output code to perform a Newton-Raphson approximation of a single precision
15483 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15484
15485 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15486 {
15487 rtx x0, x1, e0, e1;
15488
15489 x0 = gen_reg_rtx (mode);
15490 e0 = gen_reg_rtx (mode);
15491 e1 = gen_reg_rtx (mode);
15492 x1 = gen_reg_rtx (mode);
15493
15494 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15495
15496 b = force_reg (mode, b);
15497
15498 /* x0 = rcp(b) estimate */
15499 if (mode == V16SFmode || mode == V8DFmode)
15500 {
15501 if (TARGET_AVX512ER)
15502 {
15503 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15504 UNSPEC_RCP28)));
15505 /* res = a * x0 */
15506 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15507 return;
15508 }
15509 else
15510 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15511 UNSPEC_RCP14)));
15512 }
15513 else
15514 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15515 UNSPEC_RCP)));
15516
15517 /* e0 = x0 * b */
15518 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15519
15520 /* e0 = x0 * e0 */
15521 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15522
15523 /* e1 = x0 + x0 */
15524 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15525
15526 /* x1 = e1 - e0 */
15527 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15528
15529 /* res = a * x1 */
15530 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15531 }
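
/* A plain C sketch of the Newton-Raphson step above, where rcp_b stands
   for the hardware reciprocal estimate (rcpps/rcp14/rcp28); the names are
   illustrative only:

     float swdiv_model (float a, float b, float rcp_b)
     {
       float x0 = rcp_b;
       float x1 = (x0 + x0) - (b * x0) * x0;   // x0 * (2 - b*x0)
       return a * x1;                          // ~= a / b
     }

   One step roughly doubles the number of correct bits of the estimate.  */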
15532
15533 /* Output code to perform a Newton-Raphson approximation of a
15534 single precision floating point [reciprocal] square root. */
15535
15536 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15537 {
15538 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15539 REAL_VALUE_TYPE r;
15540 int unspec;
15541
15542 x0 = gen_reg_rtx (mode);
15543 e0 = gen_reg_rtx (mode);
15544 e1 = gen_reg_rtx (mode);
15545 e2 = gen_reg_rtx (mode);
15546 e3 = gen_reg_rtx (mode);
15547
15548 if (TARGET_AVX512ER && mode == V16SFmode)
15549 {
15550 if (recip)
15551 /* res = rsqrt28(a) estimate */
15552 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15553 UNSPEC_RSQRT28)));
15554 else
15555 {
15556 /* x0 = rsqrt28(a) estimate */
15557 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15558 UNSPEC_RSQRT28)));
15559 /* res = rcp28(x0) estimate */
15560 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15561 UNSPEC_RCP28)));
15562 }
15563 return;
15564 }
15565
15566 real_from_integer (&r, VOIDmode, -3, SIGNED);
15567 mthree = const_double_from_real_value (r, SFmode);
15568
15569 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15570 mhalf = const_double_from_real_value (r, SFmode);
15571 unspec = UNSPEC_RSQRT;
15572
15573 if (VECTOR_MODE_P (mode))
15574 {
15575 mthree = ix86_build_const_vector (mode, true, mthree);
15576 mhalf = ix86_build_const_vector (mode, true, mhalf);
15577 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15578 if (GET_MODE_SIZE (mode) == 64)
15579 unspec = UNSPEC_RSQRT14;
15580 }
15581
15582 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15583 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15584
15585 a = force_reg (mode, a);
15586
15587 /* x0 = rsqrt(a) estimate */
15588 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15589 unspec)));
15590
15591 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN for sqrt (0.0). */
15592 if (!recip)
15593 {
15594 rtx zero = force_reg (mode, CONST0_RTX(mode));
15595 rtx mask;
15596
15597 /* Handle masked compare. */
15598 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15599 {
15600 mask = gen_reg_rtx (HImode);
15601 /* Imm value 0x4 corresponds to not-equal comparison. */
15602 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15603 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15604 }
15605 else
15606 {
15607 mask = gen_reg_rtx (mode);
15608 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15609 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15610 }
15611 }
15612
15613 /* e0 = x0 * a */
15614 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15615 /* e1 = e0 * x0 */
15616 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15617
15618 /* e2 = e1 - 3. */
15619 mthree = force_reg (mode, mthree);
15620 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15621
15622 mhalf = force_reg (mode, mhalf);
15623 if (recip)
15624 /* e3 = -.5 * x0 */
15625 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15626 else
15627 /* e3 = -.5 * e0 */
15628 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15629 /* ret = e2 * e3 */
15630 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15631 }
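
/* A plain C sketch of the Newton-Raphson step above, where x0 stands for
   the hardware rsqrt estimate of 1/sqrt(a); the names are illustrative only:

     float swsqrt_model (float a, float x0, int recip)
     {
       float e = a * x0 * x0 - 3.0f;
       return recip ? -0.5f * x0 * e          // ~= 1 / sqrt (a)
                    : -0.5f * (a * x0) * e;   // ~= sqrt (a)
     }  */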
15632
15633 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15634 mask for masking out the sign-bit is stored in *SMASK, if that is
15635 non-null. */
15636
15637 static rtx
15638 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15639 {
15640 machine_mode vmode, mode = GET_MODE (op0);
15641 rtx xa, mask;
15642
15643 xa = gen_reg_rtx (mode);
15644 if (mode == SFmode)
15645 vmode = V4SFmode;
15646 else if (mode == DFmode)
15647 vmode = V2DFmode;
15648 else
15649 vmode = mode;
15650 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15651 if (!VECTOR_MODE_P (mode))
15652 {
15653 /* We need to generate a scalar mode mask in this case. */
15654 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15655 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15656 mask = gen_reg_rtx (mode);
15657 emit_insn (gen_rtx_SET (mask, tmp));
15658 }
15659 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15660
15661 if (smask)
15662 *smask = mask;
15663
15664 return xa;
15665 }
15666
15667 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15668 swapping the operands if SWAP_OPERANDS is true. The expanded
15669 code is a forward jump to a newly created label in case the
15670 comparison is true. The generated label rtx is returned. */
15671 static rtx_code_label *
15672 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15673 bool swap_operands)
15674 {
15675 bool unordered_compare = ix86_unordered_fp_compare (code);
15676 rtx_code_label *label;
15677 rtx tmp, reg;
15678
15679 if (swap_operands)
15680 std::swap (op0, op1);
15681
15682 label = gen_label_rtx ();
15683 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15684 if (unordered_compare)
15685 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15686 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15687 emit_insn (gen_rtx_SET (reg, tmp));
15688 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15689 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15690 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15691 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15692 JUMP_LABEL (tmp) = label;
15693
15694 return label;
15695 }
15696
15697 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15698 using comparison code CODE. Operands are swapped for the comparison if
15699 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15700 static rtx
15701 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15702 bool swap_operands)
15703 {
15704 rtx (*insn)(rtx, rtx, rtx, rtx);
15705 machine_mode mode = GET_MODE (op0);
15706 rtx mask = gen_reg_rtx (mode);
15707
15708 if (swap_operands)
15709 std::swap (op0, op1);
15710
15711 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15712
15713 emit_insn (insn (mask, op0, op1,
15714 gen_rtx_fmt_ee (code, mode, op0, op1)));
15715 return mask;
15716 }
15717
15718 /* Expand copysign from SIGN to the positive value ABS_VALUE
15719 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15720 the sign-bit. */
15721
15722 static void
15723 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15724 {
15725 machine_mode mode = GET_MODE (sign);
15726 rtx sgn = gen_reg_rtx (mode);
15727 if (mask == NULL_RTX)
15728 {
15729 machine_mode vmode;
15730
15731 if (mode == SFmode)
15732 vmode = V4SFmode;
15733 else if (mode == DFmode)
15734 vmode = V2DFmode;
15735 else
15736 vmode = mode;
15737
15738 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15739 if (!VECTOR_MODE_P (mode))
15740 {
15741 /* We need to generate a scalar mode mask in this case. */
15742 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15743 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15744 mask = gen_reg_rtx (mode);
15745 emit_insn (gen_rtx_SET (mask, tmp));
15746 }
15747 }
15748 else
15749 mask = gen_rtx_NOT (mode, mask);
15750 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15751 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15752 }
15753
15754 /* Expand SSE sequence for computing lround from OP1 storing
15755 into OP0. */
15756
15757 void
15758 ix86_expand_lround (rtx op0, rtx op1)
15759 {
15760 /* C code for the stuff we're doing below:
15761 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15762 return (long)tmp;
15763 */
15764 machine_mode mode = GET_MODE (op1);
15765 const struct real_format *fmt;
15766 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15767 rtx adj;
15768
15769 /* load nextafter (0.5, 0.0) */
15770 fmt = REAL_MODE_FORMAT (mode);
15771 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15772 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15773
15774 /* adj = copysign (0.5, op1) */
15775 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15776 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15777
15778 /* adj = op1 + adj */
15779 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15780
15781 /* op0 = (imode)adj */
15782 expand_fix (op0, adj, 0);
15783 }
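
/* A plain C sketch of the sequence above, assuming <math.h>; the cast to
   long truncates, so adding copysign (nextafter (0.5, 0.0), x) rounds
   halfway cases away from zero without pushing values just below .5 over
   an integer boundary:

     long lround_model (double x)
     {
       double pred_half = nextafter (0.5, 0.0);
       return (long) (x + copysign (pred_half, x));
     }  */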
15784
15785 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
15786 into OPERAND0. */
15787
15788 void
15789 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15790 {
15791 /* C code for the stuff we're doing below (for do_floor):
15792 xi = (long)op1;
15793 xi -= (double)xi > op1 ? 1 : 0;
15794 return xi;
15795 */
15796 machine_mode fmode = GET_MODE (op1);
15797 machine_mode imode = GET_MODE (op0);
15798 rtx ireg, freg, tmp;
15799 rtx_code_label *label;
15800
15801 /* reg = (long)op1 */
15802 ireg = gen_reg_rtx (imode);
15803 expand_fix (ireg, op1, 0);
15804
15805 /* freg = (double)reg */
15806 freg = gen_reg_rtx (fmode);
15807 expand_float (freg, ireg, 0);
15808
15809 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15810 label = ix86_expand_sse_compare_and_jump (UNLE,
15811 freg, op1, !do_floor);
15812 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15813 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15814 emit_move_insn (ireg, tmp);
15815
15816 emit_label (label);
15817 LABEL_NUSES (label) = 1;
15818
15819 emit_move_insn (op0, ireg);
15820 }
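
/* A plain C sketch of the do_floor case above; lceil is the mirror image
   (compare with <, add 1):

     long lfloor_model (double x)
     {
       long i = (long) x;        // truncates toward zero
       if ((double) i > x)       // only for negative non-integers
         i -= 1;
       return i;
     }  */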
15821
15822 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15823 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
15824
15825 static rtx
15826 ix86_gen_TWO52 (machine_mode mode)
15827 {
15828 REAL_VALUE_TYPE TWO52r;
15829 rtx TWO52;
15830
15831 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15832 TWO52 = const_double_from_real_value (TWO52r, mode);
15833 TWO52 = force_reg (mode, TWO52);
15834
15835 return TWO52;
15836 }
15837
15838 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
15839
15840 void
15841 ix86_expand_rint (rtx operand0, rtx operand1)
15842 {
15843 /* C code for the stuff we're doing below:
15844 xa = fabs (operand1);
15845 if (!isless (xa, 2**52))
15846 return operand1;
15847 two52 = 2**52;
15848 if (flag_rounding_math)
15849 {
15850 two52 = copysign (two52, operand1);
15851 xa = operand1;
15852 }
15853 xa = xa + two52 - two52;
15854 return copysign (xa, operand1);
15855 */
15856 machine_mode mode = GET_MODE (operand0);
15857 rtx res, xa, TWO52, mask;
15858 rtx_code_label *label;
15859
15860 res = gen_reg_rtx (mode);
15861 emit_move_insn (res, operand1);
15862
15863 /* xa = abs (operand1) */
15864 xa = ix86_expand_sse_fabs (res, &mask);
15865
15866 /* if (!isless (xa, TWO52)) goto label; */
15867 TWO52 = ix86_gen_TWO52 (mode);
15868 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15869
15870 if (flag_rounding_math)
15871 {
15872 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
15873 xa = res;
15874 }
15875
15876 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15877 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15878
15879 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
15880 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
15881 xa = ix86_expand_sse_fabs (xa, NULL);
15882
15883 ix86_sse_copysign_to_positive (res, xa, res, mask);
15884
15885 emit_label (label);
15886 LABEL_NUSES (label) = 1;
15887
15888 emit_move_insn (operand0, res);
15889 }
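
/* A plain C sketch of the 2**52 trick used above (DFmode case), assuming
   <math.h>, round-to-nearest and no compiler folding of the additions:

     double rint_model (double x)
     {
       const double two52 = 0x1p52;   // above this every double is integral
       double xa = fabs (x);
       if (!(xa < two52))
         return x;
       xa = (xa + two52) - two52;     // rounds xa to an integer
       return copysign (xa, x);       // restores the sign, keeps -0.0
     }  */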
15890
15891 /* Expand SSE2 sequence for computing floor or ceil
15892 from OPERAND1 storing into OPERAND0. */
15893 void
15894 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15895 {
15896 /* C code for the stuff we expand below.
15897 double xa = fabs (x), x2;
15898 if (!isless (xa, TWO52))
15899 return x;
15900 x2 = (double)(long)x;
15901
15902 Compensate. Floor:
15903 if (x2 > x)
15904 x2 -= 1;
15905 Compensate. Ceil:
15906 if (x2 < x)
15907 x2 += 1;
15908
15909 if (HONOR_SIGNED_ZEROS (mode))
15910 return copysign (x2, x);
15911 return x2;
15912 */
15913 machine_mode mode = GET_MODE (operand0);
15914 rtx xa, xi, TWO52, tmp, one, res, mask;
15915 rtx_code_label *label;
15916
15917 TWO52 = ix86_gen_TWO52 (mode);
15918
15919 /* Temporary for holding the result, initialized to the input
15920 operand to ease control flow. */
15921 res = gen_reg_rtx (mode);
15922 emit_move_insn (res, operand1);
15923
15924 /* xa = abs (operand1) */
15925 xa = ix86_expand_sse_fabs (res, &mask);
15926
15927 /* if (!isless (xa, TWO52)) goto label; */
15928 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15929
15930 /* xa = (double)(long)x */
15931 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15932 expand_fix (xi, res, 0);
15933 expand_float (xa, xi, 0);
15934
15935 /* generate 1.0 */
15936 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15937
15938 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15939 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15940 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15941 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15942 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15943 if (HONOR_SIGNED_ZEROS (mode))
15944 {
15945 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
15946 if (do_floor && flag_rounding_math)
15947 tmp = ix86_expand_sse_fabs (tmp, NULL);
15948
15949 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15950 }
15951 emit_move_insn (res, tmp);
15952
15953 emit_label (label);
15954 LABEL_NUSES (label) = 1;
15955
15956 emit_move_insn (operand0, res);
15957 }
15958
15959 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15960 into OPERAND0 without relying on DImode truncation via cvttsd2siq
15961 that is only available on 64bit targets. */
15962 void
15963 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15964 {
15965 /* C code for the stuff we expand below.
15966 double xa = fabs (x), x2;
15967 if (!isless (xa, TWO52))
15968 return x;
15969 xa = xa + TWO52 - TWO52;
15970 x2 = copysign (xa, x);
15971
15972 Compensate. Floor:
15973 if (x2 > x)
15974 x2 -= 1;
15975 Compensate. Ceil:
15976 if (x2 < x)
15977 x2 += 1;
15978
15979 if (HONOR_SIGNED_ZEROS (mode))
15980 x2 = copysign (x2, x);
15981 return x2;
15982 */
15983 machine_mode mode = GET_MODE (operand0);
15984 rtx xa, TWO52, tmp, one, res, mask;
15985 rtx_code_label *label;
15986
15987 TWO52 = ix86_gen_TWO52 (mode);
15988
15989 /* Temporary for holding the result, initialized to the input
15990 operand to ease control flow. */
15991 res = gen_reg_rtx (mode);
15992 emit_move_insn (res, operand1);
15993
15994 /* xa = abs (operand1) */
15995 xa = ix86_expand_sse_fabs (res, &mask);
15996
15997 /* if (!isless (xa, TWO52)) goto label; */
15998 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15999
16000 /* xa = xa + TWO52 - TWO52; */
16001 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16002 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16003
16004 /* xa = copysign (xa, operand1) */
16005 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16006
16007 /* generate 1.0 */
16008 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16009
16010 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16011 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16012 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16013 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16014 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16015 if (HONOR_SIGNED_ZEROS (mode))
16016 {
16017 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16018 if (do_floor && flag_rounding_math)
16019 tmp = ix86_expand_sse_fabs (tmp, NULL);
16020
16021 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16022 }
16023 emit_move_insn (res, tmp);
16024
16025 emit_label (label);
16026 LABEL_NUSES (label) = 1;
16027
16028 emit_move_insn (operand0, res);
16029 }
16030
16031 /* Expand SSE sequence for computing trunc
16032 from OPERAND1 storing into OPERAND0. */
16033 void
16034 ix86_expand_trunc (rtx operand0, rtx operand1)
16035 {
16036 /* C code for SSE variant we expand below.
16037 double xa = fabs (x), x2;
16038 if (!isless (xa, TWO52))
16039 return x;
16040 x2 = (double)(long)x;
16041 if (HONOR_SIGNED_ZEROS (mode))
16042 return copysign (x2, x);
16043 return x2;
16044 */
16045 machine_mode mode = GET_MODE (operand0);
16046 rtx xa, xi, TWO52, res, mask;
16047 rtx_code_label *label;
16048
16049 TWO52 = ix86_gen_TWO52 (mode);
16050
16051 /* Temporary for holding the result, initialized to the input
16052 operand to ease control flow. */
16053 res = gen_reg_rtx (mode);
16054 emit_move_insn (res, operand1);
16055
16056 /* xa = abs (operand1) */
16057 xa = ix86_expand_sse_fabs (res, &mask);
16058
16059 /* if (!isless (xa, TWO52)) goto label; */
16060 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16061
16062 /* x = (double)(long)x */
16063 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16064 expand_fix (xi, res, 0);
16065 expand_float (res, xi, 0);
16066
16067 if (HONOR_SIGNED_ZEROS (mode))
16068 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16069
16070 emit_label (label);
16071 LABEL_NUSES (label) = 1;
16072
16073 emit_move_insn (operand0, res);
16074 }
16075
16076 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16077 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16078 that is only available on 64bit targets. */
16079 void
16080 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16081 {
16082 machine_mode mode = GET_MODE (operand0);
16083 rtx xa, xa2, TWO52, tmp, one, res, mask;
16084 rtx_code_label *label;
16085
16086 /* C code for SSE variant we expand below.
16087 double xa = fabs (x), x2;
16088 if (!isless (xa, TWO52))
16089 return x;
16090 xa2 = xa + TWO52 - TWO52;
16091 Compensate:
16092 if (xa2 > xa)
16093 xa2 -= 1.0;
16094 x2 = copysign (xa2, x);
16095 return x2;
16096 */
16097
16098 TWO52 = ix86_gen_TWO52 (mode);
16099
16100 /* Temporary for holding the result, initialized to the input
16101 operand to ease control flow. */
16102 res = gen_reg_rtx (mode);
16103 emit_move_insn (res, operand1);
16104
16105 /* xa = abs (operand1) */
16106 xa = ix86_expand_sse_fabs (res, &mask);
16107
16108 /* if (!isless (xa, TWO52)) goto label; */
16109 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16110
16111 /* xa2 = xa + TWO52 - TWO52; */
16112 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16113 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16114
16115 /* generate 1.0 */
16116 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16117
16118 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
16119 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
16120 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16121 tmp = expand_simple_binop (mode, MINUS,
16122 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16123 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16124 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16125 tmp = ix86_expand_sse_fabs (tmp, NULL);
16126
16127 /* res = copysign (xa2, operand1) */
16128 ix86_sse_copysign_to_positive (res, tmp, res, mask);
16129
16130 emit_label (label);
16131 LABEL_NUSES (label) = 1;
16132
16133 emit_move_insn (operand0, res);
16134 }
16135
16136 /* Expand SSE sequence for computing round
16137 from OPERAND1 storing into OPERAND0. */
16138 void
16139 ix86_expand_round (rtx operand0, rtx operand1)
16140 {
16141 /* C code for the stuff we're doing below:
16142 double xa = fabs (x);
16143 if (!isless (xa, TWO52))
16144 return x;
16145 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16146 return copysign (xa, x);
16147 */
16148 machine_mode mode = GET_MODE (operand0);
16149 rtx res, TWO52, xa, xi, half, mask;
16150 rtx_code_label *label;
16151 const struct real_format *fmt;
16152 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16153
16154 /* Temporary for holding the result, initialized to the input
16155 operand to ease control flow. */
16156 res = gen_reg_rtx (mode);
16157 emit_move_insn (res, operand1);
16158
16159 TWO52 = ix86_gen_TWO52 (mode);
16160 xa = ix86_expand_sse_fabs (res, &mask);
16161 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16162
16163 /* load nextafter (0.5, 0.0) */
16164 fmt = REAL_MODE_FORMAT (mode);
16165 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16166 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16167
16168 /* xa = xa + 0.5 */
16169 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16170 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16171
16172 /* xa = (double)(int64_t)xa */
16173 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16174 expand_fix (xi, xa, 0);
16175 expand_float (xa, xi, 0);
16176
16177 /* res = copysign (xa, operand1) */
16178 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16179
16180 emit_label (label);
16181 LABEL_NUSES (label) = 1;
16182
16183 emit_move_insn (operand0, res);
16184 }
16185
16186 /* Expand SSE sequence for computing round from OPERAND1 storing
16187 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16188 that is only available on 64bit targets. */
16189 void
16190 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16191 {
16192 /* C code for the stuff we expand below.
16193 double xa = fabs (x), xa2, x2;
16194 if (!isless (xa, TWO52))
16195 return x;
16196 Using the absolute value and copying back sign makes
16197 -0.0 -> -0.0 correct.
16198 xa2 = xa + TWO52 - TWO52;
16199 Compensate.
16200 dxa = xa2 - xa;
16201 if (dxa <= -0.5)
16202 xa2 += 1;
16203 else if (dxa > 0.5)
16204 xa2 -= 1;
16205 x2 = copysign (xa2, x);
16206 return x2;
16207 */
16208 machine_mode mode = GET_MODE (operand0);
16209 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16210 rtx_code_label *label;
16211
16212 TWO52 = ix86_gen_TWO52 (mode);
16213
16214 /* Temporary for holding the result, initialized to the input
16215 operand to ease control flow. */
16216 res = gen_reg_rtx (mode);
16217 emit_move_insn (res, operand1);
16218
16219 /* xa = abs (operand1) */
16220 xa = ix86_expand_sse_fabs (res, &mask);
16221
16222 /* if (!isless (xa, TWO52)) goto label; */
16223 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16224
16225 /* xa2 = xa + TWO52 - TWO52; */
16226 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16227 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16228
16229 /* dxa = xa2 - xa; */
16230 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16231
16232 /* generate 0.5, 1.0 and -0.5 */
16233 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16234 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16235 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16236 0, OPTAB_DIRECT);
16237
16238 /* Compensate. */
16239 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16240 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16241 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16242 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16243 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16244 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16245 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16246 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16247
16248 /* res = copysign (xa2, operand1) */
16249 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16250
16251 emit_label (label);
16252 LABEL_NUSES (label) = 1;
16253
16254 emit_move_insn (operand0, res);
16255 }
16256
16257 /* Expand SSE sequence for computing round
16258 from OP1 storing into OP0 using sse4 round insn. */
16259 void
16260 ix86_expand_round_sse4 (rtx op0, rtx op1)
16261 {
16262 machine_mode mode = GET_MODE (op0);
16263 rtx e1, e2, res, half;
16264 const struct real_format *fmt;
16265 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16266 rtx (*gen_copysign) (rtx, rtx, rtx);
16267 rtx (*gen_round) (rtx, rtx, rtx);
16268
16269 switch (mode)
16270 {
16271 case E_SFmode:
16272 gen_copysign = gen_copysignsf3;
16273 gen_round = gen_sse4_1_roundsf2;
16274 break;
16275 case E_DFmode:
16276 gen_copysign = gen_copysigndf3;
16277 gen_round = gen_sse4_1_rounddf2;
16278 break;
16279 default:
16280 gcc_unreachable ();
16281 }
16282
16283 /* round (a) = trunc (a + copysign (0.5, a)) */
16284
16285 /* load nextafter (0.5, 0.0) */
16286 fmt = REAL_MODE_FORMAT (mode);
16287 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16288 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16289 half = const_double_from_real_value (pred_half, mode);
16290
16291 /* e1 = copysign (0.5, op1) */
16292 e1 = gen_reg_rtx (mode);
16293 emit_insn (gen_copysign (e1, half, op1));
16294
16295 /* e2 = op1 + e1 */
16296 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16297
16298 /* res = trunc (e2) */
16299 res = gen_reg_rtx (mode);
16300 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16301
16302 emit_move_insn (op0, res);
16303 }
16304
16305 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16306 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16307 insn every time. */
16308
16309 static GTY(()) rtx_insn *vselect_insn;
16310
16311 /* Initialize vselect_insn. */
16312
16313 static void
16314 init_vselect_insn (void)
16315 {
16316 unsigned i;
16317 rtx x;
16318
16319 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16320 for (i = 0; i < MAX_VECT_LEN; ++i)
16321 XVECEXP (x, 0, i) = const0_rtx;
16322 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16323 const0_rtx), x);
16324 x = gen_rtx_SET (const0_rtx, x);
16325 start_sequence ();
16326 vselect_insn = emit_insn (x);
16327 end_sequence ();
16328 }
16329
16330 /* Construct (set target (vec_select op0 (parallel perm))) and
16331 return true if that's a valid instruction in the active ISA. */
16332
16333 static bool
16334 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16335 unsigned nelt, bool testing_p)
16336 {
16337 unsigned int i;
16338 rtx x, save_vconcat;
16339 int icode;
16340
16341 if (vselect_insn == NULL_RTX)
16342 init_vselect_insn ();
16343
16344 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16345 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16346 for (i = 0; i < nelt; ++i)
16347 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16348 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16349 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16350 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16351 SET_DEST (PATTERN (vselect_insn)) = target;
16352 icode = recog_memoized (vselect_insn);
16353
16354 if (icode >= 0 && !testing_p)
16355 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16356
16357 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16358 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16359 INSN_CODE (vselect_insn) = -1;
16360
16361 return icode >= 0;
16362 }
16363
16364 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16365
16366 static bool
16367 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16368 const unsigned char *perm, unsigned nelt,
16369 bool testing_p)
16370 {
16371 machine_mode v2mode;
16372 rtx x;
16373 bool ok;
16374
16375 if (vselect_insn == NULL_RTX)
16376 init_vselect_insn ();
16377
16378 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16379 return false;
16380 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16381 PUT_MODE (x, v2mode);
16382 XEXP (x, 0) = op0;
16383 XEXP (x, 1) = op1;
16384 ok = expand_vselect (target, x, perm, nelt, testing_p);
16385 XEXP (x, 0) = const0_rtx;
16386 XEXP (x, 1) = const0_rtx;
16387 return ok;
16388 }
16389
16390 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16391 using movss or movsd. */
16392 static bool
16393 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16394 {
16395 machine_mode vmode = d->vmode;
16396 unsigned i, nelt = d->nelt;
16397 rtx x;
16398
16399 if (d->one_operand_p)
16400 return false;
16401
16402 if (!(TARGET_SSE && vmode == V4SFmode)
16403 && !(TARGET_SSE2 && vmode == V2DFmode))
16404 return false;
16405
16406 /* Only the first element is changed. */
16407 if (d->perm[0] != nelt && d->perm[0] != 0)
16408 return false;
16409 for (i = 1; i < nelt; ++i)
16410 if (d->perm[i] != i + nelt - d->perm[0])
16411 return false;
16412
16413 if (d->testing_p)
16414 return true;
16415
16416 if (d->perm[0] == nelt)
16417 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16418 else
16419 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16420
16421 emit_insn (gen_rtx_SET (d->target, x));
16422
16423 return true;
16424 }
16425
16426 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16427 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16428
16429 static bool
16430 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16431 {
16432 machine_mode mmode, vmode = d->vmode;
16433 unsigned i, nelt = d->nelt;
16434 unsigned HOST_WIDE_INT mask;
16435 rtx target, op0, op1, maskop, x;
16436 rtx rperm[32], vperm;
16437
16438 if (d->one_operand_p)
16439 return false;
16440 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16441 && (TARGET_AVX512BW
16442 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16443 ;
16444 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16445 ;
16446 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16447 ;
16448 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16449 ;
16450 else
16451 return false;
16452
16453 /* This is a blend, not a permute. Elements must stay in their
16454 respective lanes. */
16455 for (i = 0; i < nelt; ++i)
16456 {
16457 unsigned e = d->perm[i];
16458 if (!(e == i || e == i + nelt))
16459 return false;
16460 }
16461
16462 if (d->testing_p)
16463 return true;
16464
16465 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16466 decision should be extracted elsewhere, so that we only try that
16467 sequence once all budget==3 options have been tried. */
16468 target = d->target;
16469 op0 = d->op0;
16470 op1 = d->op1;
16471 mask = 0;
16472
16473 switch (vmode)
16474 {
16475 case E_V8DFmode:
16476 case E_V16SFmode:
16477 case E_V4DFmode:
16478 case E_V8SFmode:
16479 case E_V2DFmode:
16480 case E_V4SFmode:
16481 case E_V8HImode:
16482 case E_V8SImode:
16483 case E_V32HImode:
16484 case E_V64QImode:
16485 case E_V16SImode:
16486 case E_V8DImode:
16487 for (i = 0; i < nelt; ++i)
16488 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16489 break;
16490
16491 case E_V2DImode:
16492 for (i = 0; i < 2; ++i)
16493 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16494 vmode = V8HImode;
16495 goto do_subreg;
16496
16497 case E_V4SImode:
16498 for (i = 0; i < 4; ++i)
16499 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16500 vmode = V8HImode;
16501 goto do_subreg;
16502
16503 case E_V16QImode:
16504 /* See if bytes move in pairs so we can use pblendw with
16505 an immediate argument, rather than pblendvb with a vector
16506 argument. */
16507 for (i = 0; i < 16; i += 2)
16508 if (d->perm[i] + 1 != d->perm[i + 1])
16509 {
16510 use_pblendvb:
16511 for (i = 0; i < nelt; ++i)
16512 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16513
16514 finish_pblendvb:
16515 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16516 vperm = force_reg (vmode, vperm);
16517
16518 if (GET_MODE_SIZE (vmode) == 16)
16519 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16520 else
16521 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16522 if (target != d->target)
16523 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16524 return true;
16525 }
16526
16527 for (i = 0; i < 8; ++i)
16528 mask |= (d->perm[i * 2] >= 16) << i;
16529 vmode = V8HImode;
16530 /* FALLTHRU */
16531
16532 do_subreg:
16533 target = gen_reg_rtx (vmode);
16534 op0 = gen_lowpart (vmode, op0);
16535 op1 = gen_lowpart (vmode, op1);
16536 break;
16537
16538 case E_V32QImode:
16539 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16540 for (i = 0; i < 32; i += 2)
16541 if (d->perm[i] + 1 != d->perm[i + 1])
16542 goto use_pblendvb;
16543 /* See if bytes move in quadruplets. If yes, vpblendd
16544 with immediate can be used. */
16545 for (i = 0; i < 32; i += 4)
16546 if (d->perm[i] + 2 != d->perm[i + 2])
16547 break;
16548 if (i < 32)
16549 {
16550 /* See if bytes move the same in both lanes. If yes,
16551 vpblendw with immediate can be used. */
16552 for (i = 0; i < 16; i += 2)
16553 if (d->perm[i] + 16 != d->perm[i + 16])
16554 goto use_pblendvb;
16555
16556 /* Use vpblendw. */
16557 for (i = 0; i < 16; ++i)
16558 mask |= (d->perm[i * 2] >= 32) << i;
16559 vmode = V16HImode;
16560 goto do_subreg;
16561 }
16562
16563 /* Use vpblendd. */
16564 for (i = 0; i < 8; ++i)
16565 mask |= (d->perm[i * 4] >= 32) << i;
16566 vmode = V8SImode;
16567 goto do_subreg;
16568
16569 case E_V16HImode:
16570 /* See if words move in pairs. If yes, vpblendd can be used. */
16571 for (i = 0; i < 16; i += 2)
16572 if (d->perm[i] + 1 != d->perm[i + 1])
16573 break;
16574 if (i < 16)
16575 {
16576 /* See if words move the same in both lanes. If not,
16577 vpblendvb must be used. */
16578 for (i = 0; i < 8; i++)
16579 if (d->perm[i] + 8 != d->perm[i + 8])
16580 {
16581 /* Use vpblendvb. */
16582 for (i = 0; i < 32; ++i)
16583 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16584
16585 vmode = V32QImode;
16586 nelt = 32;
16587 target = gen_reg_rtx (vmode);
16588 op0 = gen_lowpart (vmode, op0);
16589 op1 = gen_lowpart (vmode, op1);
16590 goto finish_pblendvb;
16591 }
16592
16593 /* Use vpblendw. */
16594 for (i = 0; i < 16; ++i)
16595 mask |= (d->perm[i] >= 16) << i;
16596 break;
16597 }
16598
16599 /* Use vpblendd. */
16600 for (i = 0; i < 8; ++i)
16601 mask |= (d->perm[i * 2] >= 16) << i;
16602 vmode = V8SImode;
16603 goto do_subreg;
16604
16605 case E_V4DImode:
16606 /* Use vpblendd. */
16607 for (i = 0; i < 4; ++i)
16608 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16609 vmode = V8SImode;
16610 goto do_subreg;
16611
16612 default:
16613 gcc_unreachable ();
16614 }
16615
16616 switch (vmode)
16617 {
16618 case E_V8DFmode:
16619 case E_V8DImode:
16620 mmode = QImode;
16621 break;
16622 case E_V16SFmode:
16623 case E_V16SImode:
16624 mmode = HImode;
16625 break;
16626 case E_V32HImode:
16627 mmode = SImode;
16628 break;
16629 case E_V64QImode:
16630 mmode = DImode;
16631 break;
16632 default:
16633 mmode = VOIDmode;
16634 }
16635
16636 if (mmode != VOIDmode)
16637 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16638 else
16639 maskop = GEN_INT (mask);
16640
16641 /* This matches five different patterns for the different modes. */
16642 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16643 x = gen_rtx_SET (target, x);
16644 emit_insn (x);
16645 if (target != d->target)
16646 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16647
16648 return true;
16649 }
16650
16651 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16652 in terms of the variable form of vpermilps.
16653
16654 Note that we will have already failed the immediate input vpermilps,
16655 which requires that the high and low part shuffle be identical; the
16656 variable form doesn't require that. */
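/* E.g. { 3, 2, 1, 0, 4, 5, 6, 7 } reverses only the low lane; the
   immediate form cannot express per-lane-different selectors, but the
   variable form with index vector { 3, 2, 1, 0, 0, 1, 2, 3 } can.  */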
16657
16658 static bool
16659 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16660 {
16661 rtx rperm[8], vperm;
16662 unsigned i;
16663
16664 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16665 return false;
16666
16667 /* We can only permute within the 128-bit lane. */
16668 for (i = 0; i < 8; ++i)
16669 {
16670 unsigned e = d->perm[i];
16671 if (i < 4 ? e >= 4 : e < 4)
16672 return false;
16673 }
16674
16675 if (d->testing_p)
16676 return true;
16677
16678 for (i = 0; i < 8; ++i)
16679 {
16680 unsigned e = d->perm[i];
16681
16682 /* Within each 128-bit lane, the elements of op0 are numbered
16683 from 0 and the elements of op1 are numbered from 4. */
16684 if (e >= 8 + 4)
16685 e -= 8;
16686 else if (e >= 4)
16687 e -= 4;
16688
16689 rperm[i] = GEN_INT (e);
16690 }
16691
16692 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16693 vperm = force_reg (V8SImode, vperm);
16694 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16695
16696 return true;
16697 }
16698
16699 /* Return true if permutation D can be performed as VMODE permutation
16700 instead. */
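/* For instance, a V16QImode permutation that moves whole 4-byte groups,
   such as { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 },
   is also a valid V4SImode permutation, while one that splits such a
   group is not.  */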
16701
16702 static bool
16703 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16704 {
16705 unsigned int i, j, chunk;
16706
16707 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16708 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16709 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16710 return false;
16711
16712 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16713 return true;
16714
16715 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16716 for (i = 0; i < d->nelt; i += chunk)
16717 if (d->perm[i] & (chunk - 1))
16718 return false;
16719 else
16720 for (j = 1; j < chunk; ++j)
16721 if (d->perm[i] + j != d->perm[i + j])
16722 return false;
16723
16724 return true;
16725 }
16726
16727 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16728 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16729
16730 static bool
16731 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16732 {
16733 unsigned i, nelt, eltsz, mask;
16734 unsigned char perm[64];
16735 machine_mode vmode = V16QImode;
16736 rtx rperm[64], vperm, target, op0, op1;
16737
16738 nelt = d->nelt;
16739
16740 if (!d->one_operand_p)
16741 {
16742 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16743 {
16744 if (TARGET_AVX2
16745 && valid_perm_using_mode_p (V2TImode, d))
16746 {
16747 if (d->testing_p)
16748 return true;
16749
16750 /* Use vperm2i128 insn. The pattern uses
16751 V4DImode instead of V2TImode. */
16752 target = d->target;
16753 if (d->vmode != V4DImode)
16754 target = gen_reg_rtx (V4DImode);
16755 op0 = gen_lowpart (V4DImode, d->op0);
16756 op1 = gen_lowpart (V4DImode, d->op1);
16757 rperm[0]
16758 = GEN_INT ((d->perm[0] / (nelt / 2))
16759 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16760 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16761 if (target != d->target)
16762 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16763 return true;
16764 }
16765 return false;
16766 }
16767 }
16768 else
16769 {
16770 if (GET_MODE_SIZE (d->vmode) == 16)
16771 {
16772 if (!TARGET_SSSE3)
16773 return false;
16774 }
16775 else if (GET_MODE_SIZE (d->vmode) == 32)
16776 {
16777 if (!TARGET_AVX2)
16778 return false;
16779
16780 /* V4DImode should be already handled through
16781 expand_vselect by vpermq instruction. */
16782 gcc_assert (d->vmode != V4DImode);
16783
16784 vmode = V32QImode;
16785 if (d->vmode == V8SImode
16786 || d->vmode == V16HImode
16787 || d->vmode == V32QImode)
16788 {
16789 /* First see if vpermq can be used for
16790 V8SImode/V16HImode/V32QImode. */
16791 if (valid_perm_using_mode_p (V4DImode, d))
16792 {
16793 for (i = 0; i < 4; i++)
16794 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16795 if (d->testing_p)
16796 return true;
16797 target = gen_reg_rtx (V4DImode);
16798 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16799 perm, 4, false))
16800 {
16801 emit_move_insn (d->target,
16802 gen_lowpart (d->vmode, target));
16803 return true;
16804 }
16805 return false;
16806 }
16807
16808 /* Next see if vpermd can be used. */
16809 if (valid_perm_using_mode_p (V8SImode, d))
16810 vmode = V8SImode;
16811 }
16812 /* Or if vpermps can be used. */
16813 else if (d->vmode == V8SFmode)
16814 vmode = V8SImode;
16815
16816 if (vmode == V32QImode)
16817 {
16818 /* vpshufb only works within 128-bit lanes; it is not
16819 possible to shuffle bytes between the lanes. */
16820 for (i = 0; i < nelt; ++i)
16821 if ((d->perm[i] ^ i) & (nelt / 2))
16822 return false;
16823 }
16824 }
16825 else if (GET_MODE_SIZE (d->vmode) == 64)
16826 {
16827 if (!TARGET_AVX512BW)
16828 return false;
16829
16830 /* If vpermq didn't work, vpshufb won't work either. */
16831 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16832 return false;
16833
16834 vmode = V64QImode;
16835 if (d->vmode == V16SImode
16836 || d->vmode == V32HImode
16837 || d->vmode == V64QImode)
16838 {
16839 /* First see if vpermq can be used for
16840 V16SImode/V32HImode/V64QImode. */
16841 if (valid_perm_using_mode_p (V8DImode, d))
16842 {
16843 for (i = 0; i < 8; i++)
16844 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16845 if (d->testing_p)
16846 return true;
16847 target = gen_reg_rtx (V8DImode);
16848 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16849 perm, 8, false))
16850 {
16851 emit_move_insn (d->target,
16852 gen_lowpart (d->vmode, target));
16853 return true;
16854 }
16855 return false;
16856 }
16857
16858 /* Next see if vpermd can be used. */
16859 if (valid_perm_using_mode_p (V16SImode, d))
16860 vmode = V16SImode;
16861 }
16862 /* Or if vpermps can be used. */
16863 else if (d->vmode == V16SFmode)
16864 vmode = V16SImode;
16865 if (vmode == V64QImode)
16866 {
16867 /* vpshufb only works within 128-bit lanes; it is not
16868 possible to shuffle bytes between the lanes. */
16869 for (i = 0; i < nelt; ++i)
16870 if ((d->perm[i] ^ i) & (3 * nelt / 4))
16871 return false;
16872 }
16873 }
16874 else
16875 return false;
16876 }
16877
16878 if (d->testing_p)
16879 return true;
16880
16881 if (vmode == V8SImode)
16882 for (i = 0; i < 8; ++i)
16883 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16884 else if (vmode == V16SImode)
16885 for (i = 0; i < 16; ++i)
16886 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16887 else
16888 {
16889 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16890 if (!d->one_operand_p)
16891 mask = 2 * nelt - 1;
16892 else if (vmode == V16QImode)
16893 mask = nelt - 1;
16894 else if (vmode == V64QImode)
16895 mask = nelt / 4 - 1;
16896 else
16897 mask = nelt / 2 - 1;
16898
16899 for (i = 0; i < nelt; ++i)
16900 {
16901 unsigned j, e = d->perm[i] & mask;
16902 for (j = 0; j < eltsz; ++j)
16903 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16904 }
16905 }
16906
16907 vperm = gen_rtx_CONST_VECTOR (vmode,
16908 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16909 vperm = force_reg (vmode, vperm);
16910
16911 target = d->target;
16912 if (d->vmode != vmode)
16913 target = gen_reg_rtx (vmode);
16914 op0 = gen_lowpart (vmode, d->op0);
16915 if (d->one_operand_p)
16916 {
16917 if (vmode == V16QImode)
16918 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16919 else if (vmode == V32QImode)
16920 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16921 else if (vmode == V64QImode)
16922 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16923 else if (vmode == V8SFmode)
16924 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16925 else if (vmode == V8SImode)
16926 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16927 else if (vmode == V16SFmode)
16928 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16929 else if (vmode == V16SImode)
16930 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16931 else
16932 gcc_unreachable ();
16933 }
16934 else
16935 {
16936 op1 = gen_lowpart (vmode, d->op1);
16937 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16938 }
16939 if (target != d->target)
16940 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16941
16942 return true;
16943 }
16944
16945 /* For V*[QHS]Imode permutations, check whether the same permutation
16946 can be performed in a 2x, 4x or 8x wider inner mode. */
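/* E.g. the V16QImode permutation { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11,
   8, 9, 14, 15, 12, 13 } moves byte pairs, so it is narrowed to the
   V8HImode permutation { 1, 0, 3, 2, 5, 4, 7, 6 } (further narrowing
   to V4SImode is not possible because the word pairs are swapped).  */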
16947
16948 static bool
16949 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16950 struct expand_vec_perm_d *nd)
16951 {
16952 int i;
16953 machine_mode mode = VOIDmode;
16954
16955 switch (d->vmode)
16956 {
16957 case E_V16QImode: mode = V8HImode; break;
16958 case E_V32QImode: mode = V16HImode; break;
16959 case E_V64QImode: mode = V32HImode; break;
16960 case E_V8HImode: mode = V4SImode; break;
16961 case E_V16HImode: mode = V8SImode; break;
16962 case E_V32HImode: mode = V16SImode; break;
16963 case E_V4SImode: mode = V2DImode; break;
16964 case E_V8SImode: mode = V4DImode; break;
16965 case E_V16SImode: mode = V8DImode; break;
16966 default: return false;
16967 }
16968 for (i = 0; i < d->nelt; i += 2)
16969 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16970 return false;
16971 nd->vmode = mode;
16972 nd->nelt = d->nelt / 2;
16973 for (i = 0; i < nd->nelt; i++)
16974 nd->perm[i] = d->perm[2 * i] / 2;
16975 if (GET_MODE_INNER (mode) != DImode)
16976 canonicalize_vector_int_perm (nd, nd);
16977 if (nd != d)
16978 {
16979 nd->one_operand_p = d->one_operand_p;
16980 nd->testing_p = d->testing_p;
16981 if (d->op0 == d->op1)
16982 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16983 else
16984 {
16985 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16986 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16987 }
16988 if (d->testing_p)
16989 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16990 else
16991 nd->target = gen_reg_rtx (nd->vmode);
16992 }
16993 return true;
16994 }
16995
16996 /* Try to expand one-operand permutation with constant mask. */
16997
16998 static bool
16999 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17000 {
17001 machine_mode mode = GET_MODE (d->op0);
17002 machine_mode maskmode = mode;
17003 rtx (*gen) (rtx, rtx, rtx) = NULL;
17004 rtx target, op0, mask;
17005 rtx vec[64];
17006
17007 if (!rtx_equal_p (d->op0, d->op1))
17008 return false;
17009
17010 if (!TARGET_AVX512F)
17011 return false;
17012
17013 switch (mode)
17014 {
17015 case E_V16SImode:
17016 gen = gen_avx512f_permvarv16si;
17017 break;
17018 case E_V16SFmode:
17019 gen = gen_avx512f_permvarv16sf;
17020 maskmode = V16SImode;
17021 break;
17022 case E_V8DImode:
17023 gen = gen_avx512f_permvarv8di;
17024 break;
17025 case E_V8DFmode:
17026 gen = gen_avx512f_permvarv8df;
17027 maskmode = V8DImode;
17028 break;
17029 default:
17030 return false;
17031 }
17032
17033 target = d->target;
17034 op0 = d->op0;
17035 for (int i = 0; i < d->nelt; ++i)
17036 vec[i] = GEN_INT (d->perm[i]);
17037 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17038 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17039 return true;
17040 }
17041
17042 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17043
17044 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17045 in a single instruction. */
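/* Roughly, this tries in order: identity/broadcast and plain
   vec_select, movs, the general two-operand vec_select+concat (also
   with reversed operands), blend, variable vpermilps, the
   pshufb/vpperm/vperm* family, palignr, the AVX-512 variable
   permutations, and finally the same permutation retried in a wider
   integer element mode.  */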
17046
17047 static bool
17048 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17049 {
17050 unsigned i, nelt = d->nelt;
17051 struct expand_vec_perm_d nd;
17052
17053 /* Check plain VEC_SELECT first, because AVX has instructions that could
17054 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17055 input where SEL+CONCAT may not. */
17056 if (d->one_operand_p)
17057 {
17058 int mask = nelt - 1;
17059 bool identity_perm = true;
17060 bool broadcast_perm = true;
17061
17062 for (i = 0; i < nelt; i++)
17063 {
17064 nd.perm[i] = d->perm[i] & mask;
17065 if (nd.perm[i] != i)
17066 identity_perm = false;
17067 if (nd.perm[i])
17068 broadcast_perm = false;
17069 }
17070
17071 if (identity_perm)
17072 {
17073 if (!d->testing_p)
17074 emit_move_insn (d->target, d->op0);
17075 return true;
17076 }
17077 else if (broadcast_perm && TARGET_AVX2)
17078 {
17079 /* Use vpbroadcast{b,w,d}. */
17080 rtx (*gen) (rtx, rtx) = NULL;
17081 switch (d->vmode)
17082 {
17083 case E_V64QImode:
17084 if (TARGET_AVX512BW)
17085 gen = gen_avx512bw_vec_dupv64qi_1;
17086 break;
17087 case E_V32QImode:
17088 gen = gen_avx2_pbroadcastv32qi_1;
17089 break;
17090 case E_V32HImode:
17091 if (TARGET_AVX512BW)
17092 gen = gen_avx512bw_vec_dupv32hi_1;
17093 break;
17094 case E_V16HImode:
17095 gen = gen_avx2_pbroadcastv16hi_1;
17096 break;
17097 case E_V16SImode:
17098 if (TARGET_AVX512F)
17099 gen = gen_avx512f_vec_dupv16si_1;
17100 break;
17101 case E_V8SImode:
17102 gen = gen_avx2_pbroadcastv8si_1;
17103 break;
17104 case E_V16QImode:
17105 gen = gen_avx2_pbroadcastv16qi;
17106 break;
17107 case E_V8HImode:
17108 gen = gen_avx2_pbroadcastv8hi;
17109 break;
17110 case E_V16SFmode:
17111 if (TARGET_AVX512F)
17112 gen = gen_avx512f_vec_dupv16sf_1;
17113 break;
17114 case E_V8SFmode:
17115 gen = gen_avx2_vec_dupv8sf_1;
17116 break;
17117 case E_V8DFmode:
17118 if (TARGET_AVX512F)
17119 gen = gen_avx512f_vec_dupv8df_1;
17120 break;
17121 case E_V8DImode:
17122 if (TARGET_AVX512F)
17123 gen = gen_avx512f_vec_dupv8di_1;
17124 break;
17125 /* For other modes, prefer the other shuffles this function creates. */
17126 default: break;
17127 }
17128 if (gen != NULL)
17129 {
17130 if (!d->testing_p)
17131 emit_insn (gen (d->target, d->op0));
17132 return true;
17133 }
17134 }
17135
17136 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17137 return true;
17138
17139 /* There are plenty of patterns in sse.md that are written for
17140 SEL+CONCAT and are not replicated for a single op. Perhaps
17141 that should be changed, to avoid the nastiness here. */
17142
17143 /* Recognize interleave style patterns, which means incrementing
17144 every other permutation operand. */
17145 for (i = 0; i < nelt; i += 2)
17146 {
17147 nd.perm[i] = d->perm[i] & mask;
17148 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17149 }
17150 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17151 d->testing_p))
17152 return true;
17153
17154 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17155 if (nelt >= 4)
17156 {
17157 for (i = 0; i < nelt; i += 4)
17158 {
17159 nd.perm[i + 0] = d->perm[i + 0] & mask;
17160 nd.perm[i + 1] = d->perm[i + 1] & mask;
17161 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17162 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17163 }
17164
17165 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17166 d->testing_p))
17167 return true;
17168 }
17169 }
17170
17171 /* Try movss/movsd instructions. */
17172 if (expand_vec_perm_movs (d))
17173 return true;
17174
17175 /* Finally, try the fully general two operand permute. */
17176 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17177 d->testing_p))
17178 return true;
17179
17180 /* Recognize interleave style patterns with reversed operands. */
17181 if (!d->one_operand_p)
17182 {
17183 for (i = 0; i < nelt; ++i)
17184 {
17185 unsigned e = d->perm[i];
17186 if (e >= nelt)
17187 e -= nelt;
17188 else
17189 e += nelt;
17190 nd.perm[i] = e;
17191 }
17192
17193 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17194 d->testing_p))
17195 return true;
17196 }
17197
17198 /* Try the SSE4.1 blend variable merge instructions. */
17199 if (expand_vec_perm_blend (d))
17200 return true;
17201
17202 /* Try one of the AVX vpermil variable permutations. */
17203 if (expand_vec_perm_vpermil (d))
17204 return true;
17205
17206 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17207 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17208 if (expand_vec_perm_pshufb (d))
17209 return true;
17210
17211 /* Try the AVX2 vpalignr instruction. */
17212 if (expand_vec_perm_palignr (d, true))
17213 return true;
17214
17215 /* Try the AVX512F vperm{s,d} instructions. */
17216 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17217 return true;
17218
17219 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17220 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17221 return true;
17222
17223 /* See if we can get the same permutation in different vector integer
17224 mode. */
17225 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17226 {
17227 if (!d->testing_p)
17228 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17229 return true;
17230 }
17231 return false;
17232 }
17233
17234 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17235 in terms of a pair of pshuflw + pshufhw instructions. */
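/* E.g. { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the words within each
   64-bit half and is handled as a pshuflw followed by a pshufhw.  */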
17236
17237 static bool
17238 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17239 {
17240 unsigned char perm2[MAX_VECT_LEN];
17241 unsigned i;
17242 bool ok;
17243
17244 if (d->vmode != V8HImode || !d->one_operand_p)
17245 return false;
17246
17247 /* The two permutations only operate in 64-bit lanes. */
17248 for (i = 0; i < 4; ++i)
17249 if (d->perm[i] >= 4)
17250 return false;
17251 for (i = 4; i < 8; ++i)
17252 if (d->perm[i] < 4)
17253 return false;
17254
17255 if (d->testing_p)
17256 return true;
17257
17258 /* Emit the pshuflw. */
17259 memcpy (perm2, d->perm, 4);
17260 for (i = 4; i < 8; ++i)
17261 perm2[i] = i;
17262 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17263 gcc_assert (ok);
17264
17265 /* Emit the pshufhw. */
17266 memcpy (perm2 + 4, d->perm + 4, 4);
17267 for (i = 0; i < 4; ++i)
17268 perm2[i] = i;
17269 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17270 gcc_assert (ok);
17271
17272 return true;
17273 }
17274
17275 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17276 the permutation using the SSSE3 palignr instruction. This succeeds
17277 when all of the elements in PERM fit within one vector and we merely
17278 need to shift them down so that a single vector permutation has a
17279 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
17280 the vpalignr instruction itself can perform the requested permutation. */
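/* E.g. for V16QImode the permutation { 3, 4, ..., 18 } selects the
   window of 16 consecutive bytes starting at byte 3 of the
   concatenation, so a single palignr by 3 bytes already produces the
   result.  */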
17281
17282 static bool
17283 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17284 {
17285 unsigned i, nelt = d->nelt;
17286 unsigned min, max, minswap, maxswap;
17287 bool in_order, ok, swap = false;
17288 rtx shift, target;
17289 struct expand_vec_perm_d dcopy;
17290
17291 /* Even with AVX, palignr only operates on 128-bit vectors;
17292 with AVX2, vpalignr operates on both 128-bit lanes independently. */
17293 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17294 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17295 return false;
17296
17297 min = 2 * nelt;
17298 max = 0;
17299 minswap = 2 * nelt;
17300 maxswap = 0;
17301 for (i = 0; i < nelt; ++i)
17302 {
17303 unsigned e = d->perm[i];
17304 unsigned eswap = d->perm[i] ^ nelt;
17305 if (GET_MODE_SIZE (d->vmode) == 32)
17306 {
17307 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17308 eswap = e ^ (nelt / 2);
17309 }
17310 if (e < min)
17311 min = e;
17312 if (e > max)
17313 max = e;
17314 if (eswap < minswap)
17315 minswap = eswap;
17316 if (eswap > maxswap)
17317 maxswap = eswap;
17318 }
17319 if (min == 0
17320 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17321 {
17322 if (d->one_operand_p
17323 || minswap == 0
17324 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17325 ? nelt / 2 : nelt))
17326 return false;
17327 swap = true;
17328 min = minswap;
17329 max = maxswap;
17330 }
17331
17332 /* Given that we have SSSE3, we know we'll be able to implement the
17333 single operand permutation after the palignr with pshufb for
17334 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17335 first. */
17336 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17337 return true;
17338
17339 dcopy = *d;
17340 if (swap)
17341 {
17342 dcopy.op0 = d->op1;
17343 dcopy.op1 = d->op0;
17344 for (i = 0; i < nelt; ++i)
17345 dcopy.perm[i] ^= nelt;
17346 }
17347
17348 in_order = true;
17349 for (i = 0; i < nelt; ++i)
17350 {
17351 unsigned e = dcopy.perm[i];
17352 if (GET_MODE_SIZE (d->vmode) == 32
17353 && e >= nelt
17354 && (e & (nelt / 2 - 1)) < min)
17355 e = e - min - (nelt / 2);
17356 else
17357 e = e - min;
17358 if (e != i)
17359 in_order = false;
17360 dcopy.perm[i] = e;
17361 }
17362 dcopy.one_operand_p = true;
17363
17364 if (single_insn_only_p && !in_order)
17365 return false;
17366
17367 /* For AVX2, test whether we can permute the result in one instruction. */
17368 if (d->testing_p)
17369 {
17370 if (in_order)
17371 return true;
17372 dcopy.op1 = dcopy.op0;
17373 return expand_vec_perm_1 (&dcopy);
17374 }
17375
17376 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17377 if (GET_MODE_SIZE (d->vmode) == 16)
17378 {
17379 target = gen_reg_rtx (TImode);
17380 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17381 gen_lowpart (TImode, dcopy.op0), shift));
17382 }
17383 else
17384 {
17385 target = gen_reg_rtx (V2TImode);
17386 emit_insn (gen_avx2_palignrv2ti (target,
17387 gen_lowpart (V2TImode, dcopy.op1),
17388 gen_lowpart (V2TImode, dcopy.op0),
17389 shift));
17390 }
17391
17392 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17393
17394 /* Test for the degenerate case where the alignment by itself
17395 produces the desired permutation. */
17396 if (in_order)
17397 {
17398 emit_move_insn (d->target, dcopy.op0);
17399 return true;
17400 }
17401
17402 ok = expand_vec_perm_1 (&dcopy);
17403 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17404
17405 return ok;
17406 }
17407
17408 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17409 the permutation using the SSE4_1 pblendv instruction. Potentially
17410 reduces a permutation from 2 pshufb plus an ior to 1 pshufb plus a pblendv. */
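/* E.g. for { 0, 1, 8, 3, 4, 5, 9, 7 } the out-of-place elements all
   come from op1, so op1 is first permuted as { 0, 1, 0, 3, 4, 5, 1, 7 }
   and the result is then blended into op0 at positions 2 and 6.  */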
17411
17412 static bool
17413 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17414 {
17415 unsigned i, which, nelt = d->nelt;
17416 struct expand_vec_perm_d dcopy, dcopy1;
17417 machine_mode vmode = d->vmode;
17418 bool ok;
17419
17420 /* Use the same checks as in expand_vec_perm_blend. */
17421 if (d->one_operand_p)
17422 return false;
17423 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17424 ;
17425 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17426 ;
17427 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17428 ;
17429 else
17430 return false;
17431
17432 /* Figure out which permutation elements do not stay in their
17433 respective lanes. */
17434 for (i = 0, which = 0; i < nelt; ++i)
17435 {
17436 unsigned e = d->perm[i];
17437 if (e != i)
17438 which |= (e < nelt ? 1 : 2);
17439 }
17440 /* We can pblend the part where elements do not stay in their
17441 respective lanes only when these elements all come from the
17442 same half of the permutation.
17443 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
17444 lanes, but both are >= 8 (i.e. from op1);
17445 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
17446 respective lanes, and 8 is >= 8 but 2 is not. */
17447 if (which != 1 && which != 2)
17448 return false;
17449 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17450 return true;
17451
17452 /* First we apply one operand permutation to the part where
17453 elements stay not in their respective lanes. */
17454 dcopy = *d;
17455 if (which == 2)
17456 dcopy.op0 = dcopy.op1 = d->op1;
17457 else
17458 dcopy.op0 = dcopy.op1 = d->op0;
17459 if (!d->testing_p)
17460 dcopy.target = gen_reg_rtx (vmode);
17461 dcopy.one_operand_p = true;
17462
17463 for (i = 0; i < nelt; ++i)
17464 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17465
17466 ok = expand_vec_perm_1 (&dcopy);
17467 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17468 return false;
17469 else
17470 gcc_assert (ok);
17471 if (d->testing_p)
17472 return true;
17473
17474 /* Next we put permuted elements into their positions. */
17475 dcopy1 = *d;
17476 if (which == 2)
17477 dcopy1.op1 = dcopy.target;
17478 else
17479 dcopy1.op0 = dcopy.target;
17480
17481 for (i = 0; i < nelt; ++i)
17482 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17483
17484 ok = expand_vec_perm_blend (&dcopy1);
17485 gcc_assert (ok);
17486
17487 return true;
17488 }
17489
17490 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17491
17492 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17493 a two vector permutation into a single vector permutation by using
17494 an interleave operation to merge the vectors. */
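/* E.g. the V4SImode permutation { 1, 4, 0, 5 } uses only the low
   halves of both inputs: punpckldq interleaves them into elements
   { 0, 4, 1, 5 } and a single pshufd { 2, 1, 0, 3 } of that result
   then yields the requested order.  */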
17495
17496 static bool
17497 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17498 {
17499 struct expand_vec_perm_d dremap, dfinal;
17500 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17501 unsigned HOST_WIDE_INT contents;
17502 unsigned char remap[2 * MAX_VECT_LEN];
17503 rtx_insn *seq;
17504 bool ok, same_halves = false;
17505
17506 if (GET_MODE_SIZE (d->vmode) == 16)
17507 {
17508 if (d->one_operand_p)
17509 return false;
17510 }
17511 else if (GET_MODE_SIZE (d->vmode) == 32)
17512 {
17513 if (!TARGET_AVX)
17514 return false;
17515 /* For 32-byte modes allow this even for d->one_operand_p.
17516 The lack of cross-lane shuffling in some instructions
17517 might prevent a single insn shuffle. */
17518 dfinal = *d;
17519 dfinal.testing_p = true;
17520 /* If expand_vec_perm_interleave3 can expand this into
17521 a 3 insn sequence, give up and let it be expanded as
17522 a 3 insn sequence. While that is one insn longer,
17523 it doesn't need a memory operand, and in the common
17524 case where both the interleave low and high permutations
17525 with the same operands are adjacent, only 4 insns are
17526 needed for both after CSE. */
17527 if (expand_vec_perm_interleave3 (&dfinal))
17528 return false;
17529 }
17530 else
17531 return false;
17532
17533 /* Examine from whence the elements come. */
17534 contents = 0;
17535 for (i = 0; i < nelt; ++i)
17536 contents |= HOST_WIDE_INT_1U << d->perm[i];
17537
17538 memset (remap, 0xff, sizeof (remap));
17539 dremap = *d;
17540
17541 if (GET_MODE_SIZE (d->vmode) == 16)
17542 {
17543 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17544
17545 /* Split the two input vectors into 4 halves. */
17546 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17547 h2 = h1 << nelt2;
17548 h3 = h2 << nelt2;
17549 h4 = h3 << nelt2;
17550
17551 /* If the elements come from the low halves, use interleave low; similarly
17552 for interleave high. If the elements are from mis-matched halves, we
17553 can use shufps for V4SF/V4SI or do a DImode shuffle. */
17554 if ((contents & (h1 | h3)) == contents)
17555 {
17556 /* punpckl* */
17557 for (i = 0; i < nelt2; ++i)
17558 {
17559 remap[i] = i * 2;
17560 remap[i + nelt] = i * 2 + 1;
17561 dremap.perm[i * 2] = i;
17562 dremap.perm[i * 2 + 1] = i + nelt;
17563 }
17564 if (!TARGET_SSE2 && d->vmode == V4SImode)
17565 dremap.vmode = V4SFmode;
17566 }
17567 else if ((contents & (h2 | h4)) == contents)
17568 {
17569 /* punpckh* */
17570 for (i = 0; i < nelt2; ++i)
17571 {
17572 remap[i + nelt2] = i * 2;
17573 remap[i + nelt + nelt2] = i * 2 + 1;
17574 dremap.perm[i * 2] = i + nelt2;
17575 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17576 }
17577 if (!TARGET_SSE2 && d->vmode == V4SImode)
17578 dremap.vmode = V4SFmode;
17579 }
17580 else if ((contents & (h1 | h4)) == contents)
17581 {
17582 /* shufps */
17583 for (i = 0; i < nelt2; ++i)
17584 {
17585 remap[i] = i;
17586 remap[i + nelt + nelt2] = i + nelt2;
17587 dremap.perm[i] = i;
17588 dremap.perm[i + nelt2] = i + nelt + nelt2;
17589 }
17590 if (nelt != 4)
17591 {
17592 /* shufpd */
17593 dremap.vmode = V2DImode;
17594 dremap.nelt = 2;
17595 dremap.perm[0] = 0;
17596 dremap.perm[1] = 3;
17597 }
17598 }
17599 else if ((contents & (h2 | h3)) == contents)
17600 {
17601 /* shufps */
17602 for (i = 0; i < nelt2; ++i)
17603 {
17604 remap[i + nelt2] = i;
17605 remap[i + nelt] = i + nelt2;
17606 dremap.perm[i] = i + nelt2;
17607 dremap.perm[i + nelt2] = i + nelt;
17608 }
17609 if (nelt != 4)
17610 {
17611 /* shufpd */
17612 dremap.vmode = V2DImode;
17613 dremap.nelt = 2;
17614 dremap.perm[0] = 1;
17615 dremap.perm[1] = 2;
17616 }
17617 }
17618 else
17619 return false;
17620 }
17621 else
17622 {
17623 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17624 unsigned HOST_WIDE_INT q[8];
17625 unsigned int nonzero_halves[4];
17626
17627 /* Split the two input vectors into 8 quarters. */
17628 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17629 for (i = 1; i < 8; ++i)
17630 q[i] = q[0] << (nelt4 * i);
17631 for (i = 0; i < 4; ++i)
17632 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17633 {
17634 nonzero_halves[nzcnt] = i;
17635 ++nzcnt;
17636 }
17637
17638 if (nzcnt == 1)
17639 {
17640 gcc_assert (d->one_operand_p);
17641 nonzero_halves[1] = nonzero_halves[0];
17642 same_halves = true;
17643 }
17644 else if (d->one_operand_p)
17645 {
17646 gcc_assert (nonzero_halves[0] == 0);
17647 gcc_assert (nonzero_halves[1] == 1);
17648 }
17649
17650 if (nzcnt <= 2)
17651 {
17652 if (d->perm[0] / nelt2 == nonzero_halves[1])
17653 {
17654 /* Attempt to increase the likelihood that dfinal
17655 shuffle will be intra-lane. */
17656 std::swap (nonzero_halves[0], nonzero_halves[1]);
17657 }
17658
17659 /* vperm2f128 or vperm2i128. */
17660 for (i = 0; i < nelt2; ++i)
17661 {
17662 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17663 remap[i + nonzero_halves[0] * nelt2] = i;
17664 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17665 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17666 }
17667
17668 if (d->vmode != V8SFmode
17669 && d->vmode != V4DFmode
17670 && d->vmode != V8SImode)
17671 {
17672 dremap.vmode = V8SImode;
17673 dremap.nelt = 8;
17674 for (i = 0; i < 4; ++i)
17675 {
17676 dremap.perm[i] = i + nonzero_halves[0] * 4;
17677 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17678 }
17679 }
17680 }
17681 else if (d->one_operand_p)
17682 return false;
17683 else if (TARGET_AVX2
17684 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17685 {
17686 /* vpunpckl* */
17687 for (i = 0; i < nelt4; ++i)
17688 {
17689 remap[i] = i * 2;
17690 remap[i + nelt] = i * 2 + 1;
17691 remap[i + nelt2] = i * 2 + nelt2;
17692 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17693 dremap.perm[i * 2] = i;
17694 dremap.perm[i * 2 + 1] = i + nelt;
17695 dremap.perm[i * 2 + nelt2] = i + nelt2;
17696 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17697 }
17698 }
17699 else if (TARGET_AVX2
17700 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17701 {
17702 /* vpunpckh* */
17703 for (i = 0; i < nelt4; ++i)
17704 {
17705 remap[i + nelt4] = i * 2;
17706 remap[i + nelt + nelt4] = i * 2 + 1;
17707 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17708 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17709 dremap.perm[i * 2] = i + nelt4;
17710 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17711 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17712 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17713 }
17714 }
17715 else
17716 return false;
17717 }
17718
17719 /* Use the remapping array set up above to move the elements from their
17720 swizzled locations into their final destinations. */
17721 dfinal = *d;
17722 for (i = 0; i < nelt; ++i)
17723 {
17724 unsigned e = remap[d->perm[i]];
17725 gcc_assert (e < nelt);
17726 /* If same_halves is true, both halves of the remapped vector are the
17727 same. Avoid cross-lane accesses if possible. */
17728 if (same_halves && i >= nelt2)
17729 {
17730 gcc_assert (e < nelt2);
17731 dfinal.perm[i] = e + nelt2;
17732 }
17733 else
17734 dfinal.perm[i] = e;
17735 }
17736 if (!d->testing_p)
17737 {
17738 dremap.target = gen_reg_rtx (dremap.vmode);
17739 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17740 }
17741 dfinal.op1 = dfinal.op0;
17742 dfinal.one_operand_p = true;
17743
17744 /* Test if the final remap can be done with a single insn. For V4SFmode or
17745 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17746 start_sequence ();
17747 ok = expand_vec_perm_1 (&dfinal);
17748 seq = get_insns ();
17749 end_sequence ();
17750
17751 if (!ok)
17752 return false;
17753
17754 if (d->testing_p)
17755 return true;
17756
17757 if (dremap.vmode != dfinal.vmode)
17758 {
17759 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17760 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17761 }
17762
17763 ok = expand_vec_perm_1 (&dremap);
17764 gcc_assert (ok);
17765
17766 emit_insn (seq);
17767 return true;
17768 }
17769
17770 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17771 a single vector cross-lane permutation into vpermq followed
17772 by any of the single insn permutations. */
17773
17774 static bool
17775 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17776 {
17777 struct expand_vec_perm_d dremap, dfinal;
17778 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17779 unsigned contents[2];
17780 bool ok;
17781
17782 if (!(TARGET_AVX2
17783 && (d->vmode == V32QImode || d->vmode == V16HImode)
17784 && d->one_operand_p))
17785 return false;
17786
17787 contents[0] = 0;
17788 contents[1] = 0;
17789 for (i = 0; i < nelt2; ++i)
17790 {
17791 contents[0] |= 1u << (d->perm[i] / nelt4);
17792 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17793 }
17794
17795 for (i = 0; i < 2; ++i)
17796 {
17797 unsigned int cnt = 0;
17798 for (j = 0; j < 4; ++j)
17799 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17800 return false;
17801 }
17802
17803 if (d->testing_p)
17804 return true;
17805
17806 dremap = *d;
17807 dremap.vmode = V4DImode;
17808 dremap.nelt = 4;
17809 dremap.target = gen_reg_rtx (V4DImode);
17810 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17811 dremap.op1 = dremap.op0;
17812 dremap.one_operand_p = true;
17813 for (i = 0; i < 2; ++i)
17814 {
17815 unsigned int cnt = 0;
17816 for (j = 0; j < 4; ++j)
17817 if ((contents[i] & (1u << j)) != 0)
17818 dremap.perm[2 * i + cnt++] = j;
17819 for (; cnt < 2; ++cnt)
17820 dremap.perm[2 * i + cnt] = 0;
17821 }
17822
17823 dfinal = *d;
17824 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17825 dfinal.op1 = dfinal.op0;
17826 dfinal.one_operand_p = true;
17827 for (i = 0, j = 0; i < nelt; ++i)
17828 {
17829 if (i == nelt2)
17830 j = 2;
17831 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17832 if ((d->perm[i] / nelt4) == dremap.perm[j])
17833 ;
17834 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17835 dfinal.perm[i] |= nelt4;
17836 else
17837 gcc_unreachable ();
17838 }
17839
17840 ok = expand_vec_perm_1 (&dremap);
17841 gcc_assert (ok);
17842
17843 ok = expand_vec_perm_1 (&dfinal);
17844 gcc_assert (ok);
17845
17846 return true;
17847 }
17848
17849 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17850
17851 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
17852 a vector permutation using two instructions, vperm2f128 resp.
17853 vperm2i128 followed by any single in-lane permutation. */
17854
17855 static bool
17856 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17857 {
17858 struct expand_vec_perm_d dfirst, dsecond;
17859 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17860 bool ok;
17861
17862 if (!TARGET_AVX
17863 || GET_MODE_SIZE (d->vmode) != 32
17864 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17865 return false;
17866
17867 dsecond = *d;
17868 dsecond.one_operand_p = false;
17869 dsecond.testing_p = true;
17870
17871 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17872 immediate. For perm < 16 the second permutation uses
17873 d->op0 as first operand, for perm >= 16 it uses d->op1
17874 as first operand. The second operand is the result of
17875 vperm2[fi]128. */
17876 for (perm = 0; perm < 32; perm++)
17877 {
17878 /* Ignore permutations which do not move anything cross-lane. */
17879 if (perm < 16)
17880 {
17881 /* The second shuffle for e.g. V4DFmode has
17882 0123 and ABCD operands.
17883 Ignore AB23, as 23 is already in the second lane
17884 of the first operand. */
17885 if ((perm & 0xc) == (1 << 2)) continue;
17886 /* And 01CD, as 01 is in the first lane of the first
17887 operand. */
17888 if ((perm & 3) == 0) continue;
17889 /* And 4567, as then the vperm2[fi]128 doesn't change
17890 anything on the original 4567 second operand. */
17891 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17892 }
17893 else
17894 {
17895 /* The second shuffle for e.g. V4DFmode has
17896 4567 and ABCD operands.
17897 Ignore AB67, as 67 is already in the second lane
17898 of the first operand. */
17899 if ((perm & 0xc) == (3 << 2)) continue;
17900 /* And 45CD, as 45 is in the first lane of the first
17901 operand. */
17902 if ((perm & 3) == 2) continue;
17903 /* And 0123, as then the vperm2[fi]128 doesn't change
17904 anything on the original 0123 first operand. */
17905 if ((perm & 0xf) == (1 << 2)) continue;
17906 }
17907
17908 for (i = 0; i < nelt; i++)
17909 {
17910 j = d->perm[i] / nelt2;
17911 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17912 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17913 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17914 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17915 else
17916 break;
17917 }
17918
17919 if (i == nelt)
17920 {
17921 start_sequence ();
17922 ok = expand_vec_perm_1 (&dsecond);
17923 end_sequence ();
17924 }
17925 else
17926 ok = false;
17927
17928 if (ok)
17929 {
17930 if (d->testing_p)
17931 return true;
17932
17933 /* Found a usable second shuffle. dfirst will be
17934 vperm2f128 on d->op0 and d->op1. */
17935 dsecond.testing_p = false;
17936 dfirst = *d;
17937 dfirst.target = gen_reg_rtx (d->vmode);
17938 for (i = 0; i < nelt; i++)
17939 dfirst.perm[i] = (i & (nelt2 - 1))
17940 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17941
17942 canonicalize_perm (&dfirst);
17943 ok = expand_vec_perm_1 (&dfirst);
17944 gcc_assert (ok);
17945
17946 /* And dsecond is some single insn shuffle, taking
17947 d->op0 and result of vperm2f128 (if perm < 16) or
17948 d->op1 and result of vperm2f128 (otherwise). */
17949 if (perm >= 16)
17950 dsecond.op0 = dsecond.op1;
17951 dsecond.op1 = dfirst.target;
17952
17953 ok = expand_vec_perm_1 (&dsecond);
17954 gcc_assert (ok);
17955
17956 return true;
17957 }
17958
17959 /* For one operand, the only useful vperm2f128 permutation is 0x01
17960 aka lanes swap. */
17961 if (d->one_operand_p)
17962 return false;
17963 }
17964
17965 return false;
17966 }
17967
17968 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17969 a two vector permutation using 2 intra-lane interleave insns
17970 and cross-lane shuffle for 32-byte vectors. */
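/* E.g. the V8SImode permutation { 0, 8, 1, 9, 2, 10, 3, 11 } is a
   full-width interleave-low of the two inputs.  */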
17971
17972 static bool
17973 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17974 {
17975 unsigned i, nelt;
17976 rtx (*gen) (rtx, rtx, rtx);
17977
17978 if (d->one_operand_p)
17979 return false;
17980 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17981 ;
17982 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17983 ;
17984 else
17985 return false;
17986
17987 nelt = d->nelt;
17988 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17989 return false;
17990 for (i = 0; i < nelt; i += 2)
17991 if (d->perm[i] != d->perm[0] + i / 2
17992 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17993 return false;
17994
17995 if (d->testing_p)
17996 return true;
17997
17998 switch (d->vmode)
17999 {
18000 case E_V32QImode:
18001 if (d->perm[0])
18002 gen = gen_vec_interleave_highv32qi;
18003 else
18004 gen = gen_vec_interleave_lowv32qi;
18005 break;
18006 case E_V16HImode:
18007 if (d->perm[0])
18008 gen = gen_vec_interleave_highv16hi;
18009 else
18010 gen = gen_vec_interleave_lowv16hi;
18011 break;
18012 case E_V8SImode:
18013 if (d->perm[0])
18014 gen = gen_vec_interleave_highv8si;
18015 else
18016 gen = gen_vec_interleave_lowv8si;
18017 break;
18018 case E_V4DImode:
18019 if (d->perm[0])
18020 gen = gen_vec_interleave_highv4di;
18021 else
18022 gen = gen_vec_interleave_lowv4di;
18023 break;
18024 case E_V8SFmode:
18025 if (d->perm[0])
18026 gen = gen_vec_interleave_highv8sf;
18027 else
18028 gen = gen_vec_interleave_lowv8sf;
18029 break;
18030 case E_V4DFmode:
18031 if (d->perm[0])
18032 gen = gen_vec_interleave_highv4df;
18033 else
18034 gen = gen_vec_interleave_lowv4df;
18035 break;
18036 default:
18037 gcc_unreachable ();
18038 }
18039
18040 emit_insn (gen (d->target, d->op0, d->op1));
18041 return true;
18042 }
18043
18044 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18045 a single vector permutation using a single intra-lane vector
18046 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18047 the non-swapped and swapped vectors together. */
18048
18049 static bool
18050 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18051 {
18052 struct expand_vec_perm_d dfirst, dsecond;
18053 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18054 rtx_insn *seq;
18055 bool ok;
18056 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18057
18058 if (!TARGET_AVX
18059 || TARGET_AVX2
18060 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18061 || !d->one_operand_p)
18062 return false;
18063
18064 dfirst = *d;
18065 for (i = 0; i < nelt; i++)
18066 dfirst.perm[i] = 0xff;
18067 for (i = 0, msk = 0; i < nelt; i++)
18068 {
18069 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18070 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18071 return false;
18072 dfirst.perm[j] = d->perm[i];
18073 if (j != i)
18074 msk |= (1 << i);
18075 }
18076 for (i = 0; i < nelt; i++)
18077 if (dfirst.perm[i] == 0xff)
18078 dfirst.perm[i] = i;
18079
18080 if (!d->testing_p)
18081 dfirst.target = gen_reg_rtx (dfirst.vmode);
18082
18083 start_sequence ();
18084 ok = expand_vec_perm_1 (&dfirst);
18085 seq = get_insns ();
18086 end_sequence ();
18087
18088 if (!ok)
18089 return false;
18090
18091 if (d->testing_p)
18092 return true;
18093
18094 emit_insn (seq);
18095
18096 dsecond = *d;
18097 dsecond.op0 = dfirst.target;
18098 dsecond.op1 = dfirst.target;
18099 dsecond.one_operand_p = true;
18100 dsecond.target = gen_reg_rtx (dsecond.vmode);
18101 for (i = 0; i < nelt; i++)
18102 dsecond.perm[i] = i ^ nelt2;
18103
18104 ok = expand_vec_perm_1 (&dsecond);
18105 gcc_assert (ok);
18106
18107 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18108 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18109 return true;
18110 }
18111
18112 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18113 permutation using two vperm2f128, followed by a vshufpd insn blending
18114 the two vectors together. */
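/* E.g. for { 2, 5, 0, 7 } the even-indexed elements come from a lane
   swap of op0 ({ 2, 3, 0, 1 }) and the odd-indexed ones from op1
   unchanged; a final vshufpd { 0, 5, 2, 7 } of those two intermediates
   produces the requested order.  */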
18115
18116 static bool
18117 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18118 {
18119 struct expand_vec_perm_d dfirst, dsecond, dthird;
18120 bool ok;
18121
18122 if (!TARGET_AVX || (d->vmode != V4DFmode))
18123 return false;
18124
18125 if (d->testing_p)
18126 return true;
18127
18128 dfirst = *d;
18129 dsecond = *d;
18130 dthird = *d;
18131
18132 dfirst.perm[0] = (d->perm[0] & ~1);
18133 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18134 dfirst.perm[2] = (d->perm[2] & ~1);
18135 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18136 dsecond.perm[0] = (d->perm[1] & ~1);
18137 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18138 dsecond.perm[2] = (d->perm[3] & ~1);
18139 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18140 dthird.perm[0] = (d->perm[0] % 2);
18141 dthird.perm[1] = (d->perm[1] % 2) + 4;
18142 dthird.perm[2] = (d->perm[2] % 2) + 2;
18143 dthird.perm[3] = (d->perm[3] % 2) + 6;
18144
18145 dfirst.target = gen_reg_rtx (dfirst.vmode);
18146 dsecond.target = gen_reg_rtx (dsecond.vmode);
18147 dthird.op0 = dfirst.target;
18148 dthird.op1 = dsecond.target;
18149 dthird.one_operand_p = false;
18150
18151 canonicalize_perm (&dfirst);
18152 canonicalize_perm (&dsecond);
18153
18154 ok = expand_vec_perm_1 (&dfirst)
18155 && expand_vec_perm_1 (&dsecond)
18156 && expand_vec_perm_1 (&dthird);
18157
18158 gcc_assert (ok);
18159
18160 return true;
18161 }
18162
18163 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18164
18165 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18166 a two vector permutation using two intra-lane vector
18167 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18168 the non-swapped and swapped vectors together. */
18169
18170 static bool
18171 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18172 {
18173 struct expand_vec_perm_d dfirst, dsecond, dthird;
18174 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18175 rtx_insn *seq1, *seq2;
18176 bool ok;
18177 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18178
18179 if (!TARGET_AVX
18180 || TARGET_AVX2
18181 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18182 || d->one_operand_p)
18183 return false;
18184
18185 dfirst = *d;
18186 dsecond = *d;
18187 for (i = 0; i < nelt; i++)
18188 {
18189 dfirst.perm[i] = 0xff;
18190 dsecond.perm[i] = 0xff;
18191 }
18192 for (i = 0, msk = 0; i < nelt; i++)
18193 {
18194 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18195 if (j == i)
18196 {
18197 dfirst.perm[j] = d->perm[i];
18198 which1 |= (d->perm[i] < nelt ? 1 : 2);
18199 }
18200 else
18201 {
18202 dsecond.perm[j] = d->perm[i];
18203 which2 |= (d->perm[i] < nelt ? 1 : 2);
18204 msk |= (1U << i);
18205 }
18206 }
18207 if (msk == 0 || msk == (1U << nelt) - 1)
18208 return false;
18209
18210 if (!d->testing_p)
18211 {
18212 dfirst.target = gen_reg_rtx (dfirst.vmode);
18213 dsecond.target = gen_reg_rtx (dsecond.vmode);
18214 }
18215
18216 for (i = 0; i < nelt; i++)
18217 {
18218 if (dfirst.perm[i] == 0xff)
18219 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18220 if (dsecond.perm[i] == 0xff)
18221 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18222 }
18223 canonicalize_perm (&dfirst);
18224 start_sequence ();
18225 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18226 seq1 = get_insns ();
18227 end_sequence ();
18228
18229 if (!ok)
18230 return false;
18231
18232 canonicalize_perm (&dsecond);
18233 start_sequence ();
18234 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18235 seq2 = get_insns ();
18236 end_sequence ();
18237
18238 if (!ok)
18239 return false;
18240
18241 if (d->testing_p)
18242 return true;
18243
18244 emit_insn (seq1);
18245 emit_insn (seq2);
18246
18247 dthird = *d;
18248 dthird.op0 = dsecond.target;
18249 dthird.op1 = dsecond.target;
18250 dthird.one_operand_p = true;
18251 dthird.target = gen_reg_rtx (dthird.vmode);
18252 for (i = 0; i < nelt; i++)
18253 dthird.perm[i] = i ^ nelt2;
18254
18255 ok = expand_vec_perm_1 (&dthird);
18256 gcc_assert (ok);
18257
18258 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18259 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18260 return true;
18261 }
18262
18263 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18264 permutation with two pshufb insns and an ior. We should have already
18265 failed all two instruction sequences. */
18266
18267 static bool
18268 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18269 {
18270 rtx rperm[2][16], vperm, l, h, op, m128;
18271 unsigned int i, nelt, eltsz;
18272
18273 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18274 return false;
18275 gcc_assert (!d->one_operand_p);
18276
18277 if (d->testing_p)
18278 return true;
18279
18280 nelt = d->nelt;
18281 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18282
18283 /* Generate two permutation masks. If the required element is within
18284 the given vector it is shuffled into the proper lane. If the required
18285 element is in the other vector, force a zero into the lane by setting
18286 bit 7 in the permutation mask. */
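/* E.g. for an even-byte extraction from two V16QImode operands the
   first mask is { 0, 2, ..., 14, -128 x 8 } applied to op0 and the
   second { -128 x 8, 0, 2, ..., 14 } applied to op1; or-ing the two
   pshufb results merges them.  */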
18287 m128 = GEN_INT (-128);
18288 for (i = 0; i < nelt; ++i)
18289 {
18290 unsigned j, e = d->perm[i];
18291 unsigned which = (e >= nelt);
18292 if (e >= nelt)
18293 e -= nelt;
18294
18295 for (j = 0; j < eltsz; ++j)
18296 {
18297 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18298 rperm[1-which][i*eltsz + j] = m128;
18299 }
18300 }
18301
18302 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18303 vperm = force_reg (V16QImode, vperm);
18304
18305 l = gen_reg_rtx (V16QImode);
18306 op = gen_lowpart (V16QImode, d->op0);
18307 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18308
18309 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18310 vperm = force_reg (V16QImode, vperm);
18311
18312 h = gen_reg_rtx (V16QImode);
18313 op = gen_lowpart (V16QImode, d->op1);
18314 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18315
18316 op = d->target;
18317 if (d->vmode != V16QImode)
18318 op = gen_reg_rtx (V16QImode);
18319 emit_insn (gen_iorv16qi3 (op, l, h));
18320 if (op != d->target)
18321 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18322
18323 return true;
18324 }
18325
18326 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
18327 with two vpshufb insns, a vpermq and a vpor. We should have already failed
18328 all two or three instruction sequences. */
18329
18330 static bool
18331 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18332 {
18333 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18334 unsigned int i, nelt, eltsz;
18335
18336 if (!TARGET_AVX2
18337 || !d->one_operand_p
18338 || (d->vmode != V32QImode && d->vmode != V16HImode))
18339 return false;
18340
18341 if (d->testing_p)
18342 return true;
18343
18344 nelt = d->nelt;
18345 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18346
18347 /* Generate two permutation masks. If the required element is within
18348 the same lane, it is shuffled in. If the required element is from the
18349 other lane, force a zero by setting bit 7 in the permutation mask.
18350 The other mask has non-negative elements where an element is
18351 requested from the other lane; those elements are also moved to the
18352 other lane, so that the result of vpshufb can have its two V2TImode
18353 halves swapped. */
18354 m128 = GEN_INT (-128);
18355 for (i = 0; i < nelt; ++i)
18356 {
18357 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18358 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18359
18360 for (j = 0; j < eltsz; ++j)
18361 {
18362 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18363 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18364 }
18365 }
18366
18367 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18368 vperm = force_reg (V32QImode, vperm);
18369
18370 h = gen_reg_rtx (V32QImode);
18371 op = gen_lowpart (V32QImode, d->op0);
18372 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18373
18374 /* Swap the 128-bit lanes of h into hp. */
18375 hp = gen_reg_rtx (V4DImode);
18376 op = gen_lowpart (V4DImode, h);
18377 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18378 const1_rtx));
18379
18380 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18381 vperm = force_reg (V32QImode, vperm);
18382
18383 l = gen_reg_rtx (V32QImode);
18384 op = gen_lowpart (V32QImode, d->op0);
18385 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18386
18387 op = d->target;
18388 if (d->vmode != V32QImode)
18389 op = gen_reg_rtx (V32QImode);
18390 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18391 if (op != d->target)
18392 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18393
18394 return true;
18395 }
18396
18397 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18398 and extract-odd permutations of two V32QImode or V16HImode operands
18399 with two vpshufb insns, vpor and vpermq. We should have already
18400 failed all two or three instruction sequences. */
18401
18402 static bool
18403 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18404 {
18405 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18406 unsigned int i, nelt, eltsz;
18407
18408 if (!TARGET_AVX2
18409 || d->one_operand_p
18410 || (d->vmode != V32QImode && d->vmode != V16HImode))
18411 return false;
18412
18413 for (i = 0; i < d->nelt; ++i)
18414 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18415 return false;
18416
18417 if (d->testing_p)
18418 return true;
18419
18420 nelt = d->nelt;
18421 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18422
18423 /* Generate two permutation masks. In the first permutation mask
18424 the first quarter will contain indexes for the first half
18425 of the op0, the second quarter will contain bit 7 set, third quarter
18426 will contain indexes for the second half of the op0 and the
18427 last quarter bit 7 set. In the second permutation mask
18428 the first quarter will contain bit 7 set, the second quarter
18429 indexes for the first half of the op1, the third quarter bit 7 set
18430 and last quarter indexes for the second half of the op1.
18431 I.e. the first mask e.g. for V32QImode extract even will be:
18432 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18433 (all values masked with 0xf except for -128) and second mask
18434 for extract even will be
18435 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18436 m128 = GEN_INT (-128);
18437 for (i = 0; i < nelt; ++i)
18438 {
18439 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18440 unsigned which = d->perm[i] >= nelt;
18441 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18442
18443 for (j = 0; j < eltsz; ++j)
18444 {
18445 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18446 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18447 }
18448 }
18449
18450 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18451 vperm = force_reg (V32QImode, vperm);
18452
18453 l = gen_reg_rtx (V32QImode);
18454 op = gen_lowpart (V32QImode, d->op0);
18455 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18456
18457 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18458 vperm = force_reg (V32QImode, vperm);
18459
18460 h = gen_reg_rtx (V32QImode);
18461 op = gen_lowpart (V32QImode, d->op1);
18462 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18463
18464 ior = gen_reg_rtx (V32QImode);
18465 emit_insn (gen_iorv32qi3 (ior, l, h));
18466
18467 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18468 op = gen_reg_rtx (V4DImode);
18469 ior = gen_lowpart (V4DImode, ior);
18470 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18471 const1_rtx, GEN_INT (3)));
18472 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18473
18474 return true;
18475 }
18476
18477 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18478 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18479 with two "and" and "pack" or two "shift" and "pack" insns. We should
18480 have already failed all two instruction sequences. */
18481
18482 static bool
18483 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18484 {
18485 rtx op, dop0, dop1, t;
18486 unsigned i, odd, c, s, nelt = d->nelt;
18487 bool end_perm = false;
18488 machine_mode half_mode;
18489 rtx (*gen_and) (rtx, rtx, rtx);
18490 rtx (*gen_pack) (rtx, rtx, rtx);
18491 rtx (*gen_shift) (rtx, rtx, rtx);
18492
18493 if (d->one_operand_p)
18494 return false;
18495
18496 switch (d->vmode)
18497 {
18498 case E_V8HImode:
18499 /* Required for "pack". */
18500 if (!TARGET_SSE4_1)
18501 return false;
18502 c = 0xffff;
18503 s = 16;
18504 half_mode = V4SImode;
18505 gen_and = gen_andv4si3;
18506 gen_pack = gen_sse4_1_packusdw;
18507 gen_shift = gen_lshrv4si3;
18508 break;
18509 case E_V16QImode:
18510 /* No check as all instructions are SSE2. */
18511 c = 0xff;
18512 s = 8;
18513 half_mode = V8HImode;
18514 gen_and = gen_andv8hi3;
18515 gen_pack = gen_sse2_packuswb;
18516 gen_shift = gen_lshrv8hi3;
18517 break;
18518 case E_V16HImode:
18519 if (!TARGET_AVX2)
18520 return false;
18521 c = 0xffff;
18522 s = 16;
18523 half_mode = V8SImode;
18524 gen_and = gen_andv8si3;
18525 gen_pack = gen_avx2_packusdw;
18526 gen_shift = gen_lshrv8si3;
18527 end_perm = true;
18528 break;
18529 case E_V32QImode:
18530 if (!TARGET_AVX2)
18531 return false;
18532 c = 0xff;
18533 s = 8;
18534 half_mode = V16HImode;
18535 gen_and = gen_andv16hi3;
18536 gen_pack = gen_avx2_packuswb;
18537 gen_shift = gen_lshrv16hi3;
18538 end_perm = true;
18539 break;
18540 default:
18541 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18542 general shuffles. */
18543 return false;
18544 }
18545
18546 /* Check that permutation is even or odd. */
18547 odd = d->perm[0];
18548 if (odd > 1)
18549 return false;
18550
18551 for (i = 1; i < nelt; ++i)
18552 if (d->perm[i] != 2 * i + odd)
18553 return false;
18554
18555 if (d->testing_p)
18556 return true;
18557
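/* For example, to extract the even elements of two V16QImode vectors we
   view each input as V8HImode, mask every word with 0x00ff so only the
   low byte of each word survives, and let packuswb collect the surviving
   bytes of both operands.  For the odd elements we instead shift each
   word right by 8 before packing.  */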
18558 dop0 = gen_reg_rtx (half_mode);
18559 dop1 = gen_reg_rtx (half_mode);
18560 if (odd == 0)
18561 {
18562 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18563 t = force_reg (half_mode, t);
18564 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18565 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18566 }
18567 else
18568 {
18569 emit_insn (gen_shift (dop0,
18570 gen_lowpart (half_mode, d->op0),
18571 GEN_INT (s)));
18572 emit_insn (gen_shift (dop1,
18573 gen_lowpart (half_mode, d->op1),
18574 GEN_INT (s)));
18575 }
18576 /* In AVX2 for 256 bit case we need to permute pack result. */
18577 if (TARGET_AVX2 && end_perm)
18578 {
18579 op = gen_reg_rtx (d->vmode);
18580 t = gen_reg_rtx (V4DImode);
18581 emit_insn (gen_pack (op, dop0, dop1));
18582 emit_insn (gen_avx2_permv4di_1 (t,
18583 gen_lowpart (V4DImode, op),
18584 const0_rtx,
18585 const2_rtx,
18586 const1_rtx,
18587 GEN_INT (3)));
18588 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18589 }
18590 else
18591 emit_insn (gen_pack (d->target, dop0, dop1));
18592
18593 return true;
18594 }
18595
18596 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18597 and extract-odd permutations of two V64QI operands
18598 with two "shifts", two "truncs" and one "concat" insns for "odd"
18599 and two "truncs" and one concat insn for "even."
18600 We should have already failed all two-instruction sequences. */
18601
18602 static bool
18603 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18604 {
18605 rtx t1, t2, t3, t4;
18606 unsigned i, odd, nelt = d->nelt;
18607
18608 if (!TARGET_AVX512BW
18609 || d->one_operand_p
18610 || d->vmode != V64QImode)
18611 return false;
18612
18613 /* Check that permutation is even or odd. */
18614 odd = d->perm[0];
18615 if (odd > 1)
18616 return false;
18617
18618 for (i = 1; i < nelt; ++i)
18619 if (d->perm[i] != 2 * i + odd)
18620 return false;
18621
18622 if (d->testing_p)
18623 return true;
18624
18625
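/* For the odd bytes we first shift every word of the V32HImode views
   right by 8 so the wanted byte lands in the low byte of its word; for
   the even bytes it is already there.  A truncating vpmovwb of each
   operand followed by a vector concat then produces the result.  */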
18626 if (odd)
18627 {
18628 t1 = gen_reg_rtx (V32HImode);
18629 t2 = gen_reg_rtx (V32HImode);
18630 emit_insn (gen_lshrv32hi3 (t1,
18631 gen_lowpart (V32HImode, d->op0),
18632 GEN_INT (8)));
18633 emit_insn (gen_lshrv32hi3 (t2,
18634 gen_lowpart (V32HImode, d->op1),
18635 GEN_INT (8)));
18636 }
18637 else
18638 {
18639 t1 = gen_lowpart (V32HImode, d->op0);
18640 t2 = gen_lowpart (V32HImode, d->op1);
18641 }
18642
18643 t3 = gen_reg_rtx (V32QImode);
18644 t4 = gen_reg_rtx (V32QImode);
18645 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18646 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18647 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18648
18649 return true;
18650 }
18651
18652 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
18653 and extract-odd permutations. */
18654
18655 static bool
18656 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18657 {
18658 rtx t1, t2, t3, t4, t5;
18659
18660 switch (d->vmode)
18661 {
18662 case E_V4DFmode:
18663 if (d->testing_p)
18664 break;
18665 t1 = gen_reg_rtx (V4DFmode);
18666 t2 = gen_reg_rtx (V4DFmode);
18667
18668 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18669 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18670 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18671
18672 /* Now an unpck[lh]pd will produce the result required. */
18673 if (odd)
18674 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18675 else
18676 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18677 emit_insn (t3);
18678 break;
18679
18680 case E_V8SFmode:
18681 {
18682 int mask = odd ? 0xdd : 0x88;
18683
18684 if (d->testing_p)
18685 break;
18686 t1 = gen_reg_rtx (V8SFmode);
18687 t2 = gen_reg_rtx (V8SFmode);
18688 t3 = gen_reg_rtx (V8SFmode);
18689
18690 /* Shuffle within the 128-bit lanes to produce:
18691 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18692 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18693 GEN_INT (mask)));
18694
18695 /* Shuffle the lanes around to produce:
18696 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18697 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18698 GEN_INT (0x3)));
18699
18700 /* Shuffle within the 128-bit lanes to produce:
18701 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18702 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18703
18704 /* Shuffle within the 128-bit lanes to produce:
18705 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18706 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18707
18708 /* Shuffle the lanes around to produce:
18709 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18710 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18711 GEN_INT (0x20)));
18712 }
18713 break;
18714
18715 case E_V2DFmode:
18716 case E_V4SFmode:
18717 case E_V2DImode:
18718 case E_V4SImode:
18719 /* These are always directly implementable by expand_vec_perm_1. */
18720 gcc_unreachable ();
18721
18722 case E_V8HImode:
18723 if (TARGET_SSE4_1)
18724 return expand_vec_perm_even_odd_pack (d);
18725 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18726 return expand_vec_perm_pshufb2 (d);
18727 else
18728 {
18729 if (d->testing_p)
18730 break;
18731 /* We need 2*log2(N)-1 operations to achieve odd/even
18732 with interleave. */
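/* E.g. for even extraction of { 0 ... 7 } and { 8 ... 15 }: the first
   lo/hi interleaves give { 0 8 1 9 2 10 3 11 } and { 4 12 5 13 6 14 7 15 },
   interleaving those gives { 0 4 8 12 1 5 9 13 } and { 2 6 10 14 3 7 11 15 },
   and one more low (or, for odd, high) interleave yields the result.  */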
18733 t1 = gen_reg_rtx (V8HImode);
18734 t2 = gen_reg_rtx (V8HImode);
18735 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18736 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18737 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18738 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18739 if (odd)
18740 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18741 else
18742 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18743 emit_insn (t3);
18744 }
18745 break;
18746
18747 case E_V16QImode:
18748 return expand_vec_perm_even_odd_pack (d);
18749
18750 case E_V16HImode:
18751 case E_V32QImode:
18752 return expand_vec_perm_even_odd_pack (d);
18753
18754 case E_V64QImode:
18755 return expand_vec_perm_even_odd_trunc (d);
18756
18757 case E_V4DImode:
18758 if (!TARGET_AVX2)
18759 {
18760 struct expand_vec_perm_d d_copy = *d;
18761 d_copy.vmode = V4DFmode;
18762 if (d->testing_p)
18763 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18764 else
18765 d_copy.target = gen_reg_rtx (V4DFmode);
18766 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18767 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18768 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18769 {
18770 if (!d->testing_p)
18771 emit_move_insn (d->target,
18772 gen_lowpart (V4DImode, d_copy.target));
18773 return true;
18774 }
18775 return false;
18776 }
18777
18778 if (d->testing_p)
18779 break;
18780
18781 t1 = gen_reg_rtx (V4DImode);
18782 t2 = gen_reg_rtx (V4DImode);
18783
18784 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18785 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18786 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18787
18788 /* Now a vpunpck[lh]qdq will produce the result required. */
18789 if (odd)
18790 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18791 else
18792 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18793 emit_insn (t3);
18794 break;
18795
18796 case E_V8SImode:
18797 if (!TARGET_AVX2)
18798 {
18799 struct expand_vec_perm_d d_copy = *d;
18800 d_copy.vmode = V8SFmode;
18801 if (d->testing_p)
18802 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18803 else
18804 d_copy.target = gen_reg_rtx (V8SFmode);
18805 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18806 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18807 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18808 {
18809 if (!d->testing_p)
18810 emit_move_insn (d->target,
18811 gen_lowpart (V8SImode, d_copy.target));
18812 return true;
18813 }
18814 return false;
18815 }
18816
18817 if (d->testing_p)
18818 break;
18819
18820 t1 = gen_reg_rtx (V8SImode);
18821 t2 = gen_reg_rtx (V8SImode);
18822 t3 = gen_reg_rtx (V4DImode);
18823 t4 = gen_reg_rtx (V4DImode);
18824 t5 = gen_reg_rtx (V4DImode);
18825
18826 /* Shuffle the lanes around into
18827 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18828 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18829 gen_lowpart (V4DImode, d->op1),
18830 GEN_INT (0x20)));
18831 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18832 gen_lowpart (V4DImode, d->op1),
18833 GEN_INT (0x31)));
18834
18835 /* Swap the 2nd and 3rd position in each lane into
18836 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18837 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18838 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18839 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18840 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18841
18842 /* Now a vpunpck[lh]qdq will produce
18843 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18844 if (odd)
18845 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18846 gen_lowpart (V4DImode, t2));
18847 else
18848 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18849 gen_lowpart (V4DImode, t2));
18850 emit_insn (t3);
18851 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18852 break;
18853
18854 default:
18855 gcc_unreachable ();
18856 }
18857
18858 return true;
18859 }
18860
18861 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18862 extract-even and extract-odd permutations. */
18863
18864 static bool
18865 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18866 {
18867 unsigned i, odd, nelt = d->nelt;
18868
18869 odd = d->perm[0];
18870 if (odd != 0 && odd != 1)
18871 return false;
18872
18873 for (i = 1; i < nelt; ++i)
18874 if (d->perm[i] != 2 * i + odd)
18875 return false;
18876
18877 return expand_vec_perm_even_odd_1 (d, odd);
18878 }
18879
18880 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
18881 permutations. We assume that expand_vec_perm_1 has already failed. */
18882
18883 static bool
18884 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18885 {
18886 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18887 machine_mode vmode = d->vmode;
18888 unsigned char perm2[4];
18889 rtx op0 = d->op0, dest;
18890 bool ok;
18891
18892 switch (vmode)
18893 {
18894 case E_V4DFmode:
18895 case E_V8SFmode:
18896 /* These are special-cased in sse.md so that we can optionally
18897 use the vbroadcast instruction. They expand to two insns
18898 if the input happens to be in a register. */
18899 gcc_unreachable ();
18900
18901 case E_V2DFmode:
18902 case E_V2DImode:
18903 case E_V4SFmode:
18904 case E_V4SImode:
18905 /* These are always implementable using standard shuffle patterns. */
18906 gcc_unreachable ();
18907
18908 case E_V8HImode:
18909 case E_V16QImode:
18910 /* These can be implemented via interleave. We save one insn by
18911 stopping once we have promoted to V4SImode and then use pshufd. */
18912 if (d->testing_p)
18913 return true;
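/* E.g. broadcasting element 5 of a V16QImode vector: an interleave-low
   leaves copies of that byte in word 5 of the V8HImode view, an
   interleave-high then leaves copies in dword 1 of the V4SImode view,
   and the final pshufd replicates that dword across the register.  */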
18914 do
18915 {
18916 rtx dest;
18917 rtx (*gen) (rtx, rtx, rtx)
18918 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18919 : gen_vec_interleave_lowv8hi;
18920
18921 if (elt >= nelt2)
18922 {
18923 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18924 : gen_vec_interleave_highv8hi;
18925 elt -= nelt2;
18926 }
18927 nelt2 /= 2;
18928
18929 dest = gen_reg_rtx (vmode);
18930 emit_insn (gen (dest, op0, op0));
18931 vmode = get_mode_wider_vector (vmode);
18932 op0 = gen_lowpart (vmode, dest);
18933 }
18934 while (vmode != V4SImode);
18935
18936 memset (perm2, elt, 4);
18937 dest = gen_reg_rtx (V4SImode);
18938 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18939 gcc_assert (ok);
18940 if (!d->testing_p)
18941 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18942 return true;
18943
18944 case E_V64QImode:
18945 case E_V32QImode:
18946 case E_V16HImode:
18947 case E_V8SImode:
18948 case E_V4DImode:
18949 /* For AVX2 broadcasts of the first element vpbroadcast* or
18950 vpermq should be used by expand_vec_perm_1. */
18951 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18952 return false;
18953
18954 default:
18955 gcc_unreachable ();
18956 }
18957 }
18958
18959 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18960 broadcast permutations. */
18961
18962 static bool
18963 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18964 {
18965 unsigned i, elt, nelt = d->nelt;
18966
18967 if (!d->one_operand_p)
18968 return false;
18969
18970 elt = d->perm[0];
18971 for (i = 1; i < nelt; ++i)
18972 if (d->perm[i] != elt)
18973 return false;
18974
18975 return expand_vec_perm_broadcast_1 (d);
18976 }
18977
18978 /* Implement arbitrary permutations of two V64QImode operands
18979 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18980 static bool
18981 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18982 {
18983 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
18984 return false;
18985
18986 if (d->testing_p)
18987 return true;
18988
18989 struct expand_vec_perm_d ds[2];
18990 rtx rperm[128], vperm, target0, target1;
18991 unsigned int i, nelt;
18992 machine_mode vmode;
18993
18994 nelt = d->nelt;
18995 vmode = V64QImode;
18996
18997 for (i = 0; i < 2; i++)
18998 {
18999 ds[i] = *d;
19000 ds[i].vmode = V32HImode;
19001 ds[i].nelt = 32;
19002 ds[i].target = gen_reg_rtx (V32HImode);
19003 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19004 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19005 }
19006
19007 /* Prepare permutations such that the first one takes care of
19008 putting the even bytes into the right positions or one position
19009 higher (ds[0]) and the second one takes care of
19010 putting the odd bytes into the right positions or one position
19011 lower (ds[1]). */
19012
19013 for (i = 0; i < nelt; i++)
19014 {
19015 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19016 if (i & 1)
19017 {
19018 rperm[i] = constm1_rtx;
19019 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19020 }
19021 else
19022 {
19023 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19024 rperm[i + 64] = constm1_rtx;
19025 }
19026 }
19027
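/* ds[0] (resp. ds[1]) gathers, for every even (resp. odd) destination
   byte, the source word that contains it.  The rperm masks for the
   vpshufbs below then pick the low or high byte of that word, using
   (i & 14) + (d->perm[i] & 1) as the in-lane byte index, and zero all
   other bytes via -1, so a final vpor merges the two halves.  */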
19028 bool ok = expand_vec_perm_1 (&ds[0]);
19029 gcc_assert (ok);
19030 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19031
19032 ok = expand_vec_perm_1 (&ds[1]);
19033 gcc_assert (ok);
19034 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19035
19036 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19037 vperm = force_reg (vmode, vperm);
19038 target0 = gen_reg_rtx (V64QImode);
19039 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19040
19041 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19042 vperm = force_reg (vmode, vperm);
19043 target1 = gen_reg_rtx (V64QImode);
19044 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19045
19046 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19047 return true;
19048 }
19049
19050 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
19051 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19052 all the shorter instruction sequences. */
19053
19054 static bool
19055 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19056 {
19057 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19058 unsigned int i, nelt, eltsz;
19059 bool used[4];
19060
19061 if (!TARGET_AVX2
19062 || d->one_operand_p
19063 || (d->vmode != V32QImode && d->vmode != V16HImode))
19064 return false;
19065
19066 if (d->testing_p)
19067 return true;
19068
19069 nelt = d->nelt;
19070 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19071
19072 /* Generate 4 permutation masks. If the required element is within
19073 the same lane, it is shuffled in. If the required element is from the
19074 other lane, force a zero by setting bit 7 in the permutation mask.
19075 The other mask has non-negative elements where the element is requested
19076 from the other lane; those elements are also moved to the other lane,
19077 so that the result of vpshufb can have the two V2TImode halves
19078 swapped. */
19079 m128 = GEN_INT (-128);
19080 for (i = 0; i < 32; ++i)
19081 {
19082 rperm[0][i] = m128;
19083 rperm[1][i] = m128;
19084 rperm[2][i] = m128;
19085 rperm[3][i] = m128;
19086 }
19087 used[0] = false;
19088 used[1] = false;
19089 used[2] = false;
19090 used[3] = false;
19091 for (i = 0; i < nelt; ++i)
19092 {
19093 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19094 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19095 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19096
19097 for (j = 0; j < eltsz; ++j)
19098 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19099 used[which] = true;
19100 }
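/* which encodes the mask an element belongs to: bit 1 selects op1 over
   op0, and bit 0 is set when the element has to cross a 128-bit lane.
   Thus rperm[0]/rperm[2] are the in-lane masks and rperm[1]/rperm[3]
   are the masks whose vpshufb results get their lanes swapped below.  */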
19101
19102 for (i = 0; i < 2; ++i)
19103 {
19104 if (!used[2 * i + 1])
19105 {
19106 h[i] = NULL_RTX;
19107 continue;
19108 }
19109 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19110 gen_rtvec_v (32, rperm[2 * i + 1]));
19111 vperm = force_reg (V32QImode, vperm);
19112 h[i] = gen_reg_rtx (V32QImode);
19113 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19114 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19115 }
19116
19117 /* Swap the 128-bit lanes of h[X]. */
19118 for (i = 0; i < 2; ++i)
19119 {
19120 if (h[i] == NULL_RTX)
19121 continue;
19122 op = gen_reg_rtx (V4DImode);
19123 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19124 const2_rtx, GEN_INT (3), const0_rtx,
19125 const1_rtx));
19126 h[i] = gen_lowpart (V32QImode, op);
19127 }
19128
19129 for (i = 0; i < 2; ++i)
19130 {
19131 if (!used[2 * i])
19132 {
19133 l[i] = NULL_RTX;
19134 continue;
19135 }
19136 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19137 vperm = force_reg (V32QImode, vperm);
19138 l[i] = gen_reg_rtx (V32QImode);
19139 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19140 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19141 }
19142
19143 for (i = 0; i < 2; ++i)
19144 {
19145 if (h[i] && l[i])
19146 {
19147 op = gen_reg_rtx (V32QImode);
19148 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19149 l[i] = op;
19150 }
19151 else if (h[i])
19152 l[i] = h[i];
19153 }
19154
19155 gcc_assert (l[0] && l[1]);
19156 op = d->target;
19157 if (d->vmode != V32QImode)
19158 op = gen_reg_rtx (V32QImode);
19159 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19160 if (op != d->target)
19161 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19162 return true;
19163 }
19164
19165 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19166 taken care of, perform the expansion in D and return true on success. */
19167
19168 static bool
19169 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19170 {
19171 /* Try a single instruction expansion. */
19172 if (expand_vec_perm_1 (d))
19173 return true;
19174
19175 /* Try sequences of two instructions. */
19176
19177 if (expand_vec_perm_pshuflw_pshufhw (d))
19178 return true;
19179
19180 if (expand_vec_perm_palignr (d, false))
19181 return true;
19182
19183 if (expand_vec_perm_interleave2 (d))
19184 return true;
19185
19186 if (expand_vec_perm_broadcast (d))
19187 return true;
19188
19189 if (expand_vec_perm_vpermq_perm_1 (d))
19190 return true;
19191
19192 if (expand_vec_perm_vperm2f128 (d))
19193 return true;
19194
19195 if (expand_vec_perm_pblendv (d))
19196 return true;
19197
19198 /* Try sequences of three instructions. */
19199
19200 if (expand_vec_perm_even_odd_pack (d))
19201 return true;
19202
19203 if (expand_vec_perm_2vperm2f128_vshuf (d))
19204 return true;
19205
19206 if (expand_vec_perm_pshufb2 (d))
19207 return true;
19208
19209 if (expand_vec_perm_interleave3 (d))
19210 return true;
19211
19212 if (expand_vec_perm_vperm2f128_vblend (d))
19213 return true;
19214
19215 /* Try sequences of four instructions. */
19216
19217 if (expand_vec_perm_even_odd_trunc (d))
19218 return true;
19219 if (expand_vec_perm_vpshufb2_vpermq (d))
19220 return true;
19221
19222 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19223 return true;
19224
19225 if (expand_vec_perm_vpermt2_vpshub2 (d))
19226 return true;
19227
19228 /* ??? Look for narrow permutations whose element orderings would
19229 allow the promotion to a wider mode. */
19230
19231 /* ??? Look for sequences of interleave or a wider permute that place
19232 the data into the correct lanes for a half-vector shuffle like
19233 pshuf[lh]w or vpermilps. */
19234
19235 /* ??? Look for sequences of interleave that produce the desired results.
19236 The combinatorics of punpck[lh] get pretty ugly... */
19237
19238 if (expand_vec_perm_even_odd (d))
19239 return true;
19240
19241 /* Even longer sequences. */
19242 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19243 return true;
19244
19245 /* See if we can get the same permutation in different vector integer
19246 mode. */
19247 struct expand_vec_perm_d nd;
19248 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19249 {
19250 if (!d->testing_p)
19251 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19252 return true;
19253 }
19254
19255 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19256 if (expand_vec_perm2_vperm2f128_vblend (d))
19257 return true;
19258
19259 return false;
19260 }
19261
19262 /* If a permutation only uses one operand, make it clear. Returns true
19263 if the permutation references both operands. */
19264
19265 static bool
19266 canonicalize_perm (struct expand_vec_perm_d *d)
19267 {
19268 int i, which, nelt = d->nelt;
19269
19270 for (i = which = 0; i < nelt; ++i)
19271 which |= (d->perm[i] < nelt ? 1 : 2);
19272
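/* which is now a two-bit mask: bit 0 means some index selects from op0,
   bit 1 means some index selects from op1.  */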
19273 d->one_operand_p = true;
19274 switch (which)
19275 {
19276 default:
19277 gcc_unreachable ();
19278
19279 case 3:
19280 if (!rtx_equal_p (d->op0, d->op1))
19281 {
19282 d->one_operand_p = false;
19283 break;
19284 }
19285 /* The elements of PERM do not suggest that only the first operand
19286 is used, but both operands are identical. Allow easier matching
19287 of the permutation by folding the permutation into the single
19288 input vector. */
19289 /* FALLTHRU */
19290
19291 case 2:
19292 for (i = 0; i < nelt; ++i)
19293 d->perm[i] &= nelt - 1;
19294 d->op0 = d->op1;
19295 break;
19296
19297 case 1:
19298 d->op1 = d->op0;
19299 break;
19300 }
19301
19302 return (which == 3);
19303 }
19304
19305 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19306
19307 bool
19308 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19309 rtx op1, const vec_perm_indices &sel)
19310 {
19311 struct expand_vec_perm_d d;
19312 unsigned char perm[MAX_VECT_LEN];
19313 unsigned int i, nelt, which;
19314 bool two_args;
19315
19316 d.target = target;
19317 d.op0 = op0;
19318 d.op1 = op1;
19319
19320 d.vmode = vmode;
19321 gcc_assert (VECTOR_MODE_P (d.vmode));
19322 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19323 d.testing_p = !target;
19324
19325 gcc_assert (sel.length () == nelt);
19326 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19327
19328 /* Given sufficient ISA support we can just return true here
19329 for selected vector modes. */
19330 switch (d.vmode)
19331 {
19332 case E_V16SFmode:
19333 case E_V16SImode:
19334 case E_V8DImode:
19335 case E_V8DFmode:
19336 if (!TARGET_AVX512F)
19337 return false;
19338 /* All implementable with a single vperm[it]2 insn. */
19339 if (d.testing_p)
19340 return true;
19341 break;
19342 case E_V32HImode:
19343 if (!TARGET_AVX512BW)
19344 return false;
19345 if (d.testing_p)
19346 /* All implementable with a single vperm[it]2 insn. */
19347 return true;
19348 break;
19349 case E_V64QImode:
19350 if (!TARGET_AVX512BW)
19351 return false;
19352 if (d.testing_p)
19353 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19354 return true;
19355 break;
19356 case E_V8SImode:
19357 case E_V8SFmode:
19358 case E_V4DFmode:
19359 case E_V4DImode:
19360 if (!TARGET_AVX)
19361 return false;
19362 if (d.testing_p && TARGET_AVX512VL)
19363 /* All implementable with a single vperm[it]2 insn. */
19364 return true;
19365 break;
19366 case E_V16HImode:
19367 if (!TARGET_SSE2)
19368 return false;
19369 if (d.testing_p && TARGET_AVX2)
19370 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19371 return true;
19372 break;
19373 case E_V32QImode:
19374 if (!TARGET_SSE2)
19375 return false;
19376 if (d.testing_p && TARGET_AVX2)
19377 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19378 return true;
19379 break;
19380 case E_V8HImode:
19381 case E_V16QImode:
19382 if (!TARGET_SSE2)
19383 return false;
19384 /* Fall through. */
19385 case E_V4SImode:
19386 case E_V4SFmode:
19387 if (!TARGET_SSE)
19388 return false;
19389 /* All implementable with a single vpperm insn. */
19390 if (d.testing_p && TARGET_XOP)
19391 return true;
19392 /* All implementable with 2 pshufb + 1 ior. */
19393 if (d.testing_p && TARGET_SSSE3)
19394 return true;
19395 break;
19396 case E_V2DImode:
19397 case E_V2DFmode:
19398 if (!TARGET_SSE)
19399 return false;
19400 /* All implementable with shufpd or unpck[lh]pd. */
19401 if (d.testing_p)
19402 return true;
19403 break;
19404 default:
19405 return false;
19406 }
19407
19408 for (i = which = 0; i < nelt; ++i)
19409 {
19410 unsigned char e = sel[i];
19411 gcc_assert (e < 2 * nelt);
19412 d.perm[i] = e;
19413 perm[i] = e;
19414 which |= (e < nelt ? 1 : 2);
19415 }
19416
19417 if (d.testing_p)
19418 {
19419 /* For all elements from second vector, fold the elements to first. */
19420 if (which == 2)
19421 for (i = 0; i < nelt; ++i)
19422 d.perm[i] -= nelt;
19423
19424 /* Check whether the mask can be applied to the vector type. */
19425 d.one_operand_p = (which != 3);
19426
19427 /* Implementable with shufps or pshufd. */
19428 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
19429 return true;
19430
19431 /* Otherwise we have to go through the motions and see if we can
19432 figure out how to generate the requested permutation. */
19433 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19434 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19435 if (!d.one_operand_p)
19436 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19437
19438 start_sequence ();
19439 bool ret = ix86_expand_vec_perm_const_1 (&d);
19440 end_sequence ();
19441
19442 return ret;
19443 }
19444
19445 two_args = canonicalize_perm (&d);
19446
19447 if (ix86_expand_vec_perm_const_1 (&d))
19448 return true;
19449
19450 /* If the selector says both arguments are needed, but the operands are the
19451 same, the above tried to expand with one_operand_p and flattened selector.
19452 If that didn't work, retry without one_operand_p; we succeeded with that
19453 during testing. */
19454 if (two_args && d.one_operand_p)
19455 {
19456 d.one_operand_p = false;
19457 memcpy (d.perm, perm, sizeof (perm));
19458 return ix86_expand_vec_perm_const_1 (&d);
19459 }
19460
19461 return false;
19462 }
19463
19464 void
19465 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19466 {
19467 struct expand_vec_perm_d d;
19468 unsigned i, nelt;
19469
19470 d.target = targ;
19471 d.op0 = op0;
19472 d.op1 = op1;
19473 d.vmode = GET_MODE (targ);
19474 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19475 d.one_operand_p = false;
19476 d.testing_p = false;
19477
19478 for (i = 0; i < nelt; ++i)
19479 d.perm[i] = i * 2 + odd;
19480
19481 /* We'll either be able to implement the permutation directly... */
19482 if (expand_vec_perm_1 (&d))
19483 return;
19484
19485 /* ... or we use the special-case patterns. */
19486 expand_vec_perm_even_odd_1 (&d, odd);
19487 }
19488
19489 static void
19490 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19491 {
19492 struct expand_vec_perm_d d;
19493 unsigned i, nelt, base;
19494 bool ok;
19495
19496 d.target = targ;
19497 d.op0 = op0;
19498 d.op1 = op1;
19499 d.vmode = GET_MODE (targ);
19500 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19501 d.one_operand_p = false;
19502 d.testing_p = false;
19503
19504 base = high_p ? nelt / 2 : 0;
19505 for (i = 0; i < nelt / 2; ++i)
19506 {
19507 d.perm[i * 2] = i + base;
19508 d.perm[i * 2 + 1] = i + base + nelt;
19509 }
19510
19511 /* Note that for AVX this isn't one instruction. */
19512 ok = ix86_expand_vec_perm_const_1 (&d);
19513 gcc_assert (ok);
19514 }
19515
19516
19517 /* Expand a vector operation CODE for a V*QImode in terms of the
19518 same operation on V*HImode. */
19519
19520 void
19521 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19522 {
19523 machine_mode qimode = GET_MODE (dest);
19524 machine_mode himode;
19525 rtx (*gen_il) (rtx, rtx, rtx);
19526 rtx (*gen_ih) (rtx, rtx, rtx);
19527 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19528 struct expand_vec_perm_d d;
19529 bool ok, full_interleave;
19530 bool uns_p = false;
19531 int i;
19532
19533 switch (qimode)
19534 {
19535 case E_V16QImode:
19536 himode = V8HImode;
19537 gen_il = gen_vec_interleave_lowv16qi;
19538 gen_ih = gen_vec_interleave_highv16qi;
19539 break;
19540 case E_V32QImode:
19541 himode = V16HImode;
19542 gen_il = gen_avx2_interleave_lowv32qi;
19543 gen_ih = gen_avx2_interleave_highv32qi;
19544 break;
19545 case E_V64QImode:
19546 himode = V32HImode;
19547 gen_il = gen_avx512bw_interleave_lowv64qi;
19548 gen_ih = gen_avx512bw_interleave_highv64qi;
19549 break;
19550 default:
19551 gcc_unreachable ();
19552 }
19553
19554 op2_l = op2_h = op2;
19555 switch (code)
19556 {
19557 case MULT:
19558 /* Unpack data such that we've got a source byte in each low byte of
19559 each word. We don't care what goes into the high byte of each word.
19560 Rather than trying to get zero in there, most convenient is to let
19561 it be a copy of the low byte. */
19562 op2_l = gen_reg_rtx (qimode);
19563 op2_h = gen_reg_rtx (qimode);
19564 emit_insn (gen_il (op2_l, op2, op2));
19565 emit_insn (gen_ih (op2_h, op2, op2));
19566
19567 op1_l = gen_reg_rtx (qimode);
19568 op1_h = gen_reg_rtx (qimode);
19569 emit_insn (gen_il (op1_l, op1, op1));
19570 emit_insn (gen_ih (op1_h, op1, op1));
19571 full_interleave = qimode == V16QImode;
19572 break;
19573
19574 case ASHIFT:
19575 case LSHIFTRT:
19576 uns_p = true;
19577 /* FALLTHRU */
19578 case ASHIFTRT:
19579 op1_l = gen_reg_rtx (himode);
19580 op1_h = gen_reg_rtx (himode);
19581 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19582 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19583 full_interleave = true;
19584 break;
19585 default:
19586 gcc_unreachable ();
19587 }
19588
19589 /* Perform the operation. */
19590 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19591 1, OPTAB_DIRECT);
19592 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19593 1, OPTAB_DIRECT);
19594 gcc_assert (res_l && res_h);
19595
19596 /* Merge the data back into the right place. */
19597 d.target = dest;
19598 d.op0 = gen_lowpart (qimode, res_l);
19599 d.op1 = gen_lowpart (qimode, res_h);
19600 d.vmode = qimode;
19601 d.nelt = GET_MODE_NUNITS (qimode);
19602 d.one_operand_p = false;
19603 d.testing_p = false;
19604
19605 if (full_interleave)
19606 {
19607 /* For SSE2, we used a full interleave, so the desired
19608 results are in the even elements. */
19609 for (i = 0; i < d.nelt; ++i)
19610 d.perm[i] = i * 2;
19611 }
19612 else
19613 {
19614 /* For AVX, the interleave used above was not cross-lane. So the
19615 extraction is of the even elements, but with the second and third
19616 quarters swapped. Happily, that is even one insn shorter than plain even extraction.
19617 For AVX512BW we have 4 lanes. We extract evens from within a lane,
19618 always first from the first and then from the second source operand,
19619 the index bits above the low 4 bits remain the same.
19620 Thus, for d.nelt == 32 we want permutation
19621 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19622 and for d.nelt == 64 we want permutation
19623 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19624 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
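/* In the formula below, (i * 2) & 14 walks the even positions inside a
   16-element group, (i & 8) switches between the low-half and high-half
   results (hence the + d.nelt), and (i & ~15) keeps the 128-bit lane.  */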
19625 for (i = 0; i < d.nelt; ++i)
19626 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19627 }
19628
19629 ok = ix86_expand_vec_perm_const_1 (&d);
19630 gcc_assert (ok);
19631
19632 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19633 gen_rtx_fmt_ee (code, qimode, op1, op2));
19634 }
19635
19636 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
19637 if op is CONST_VECTOR with all odd elements equal to their
19638 preceding element. */
19639
19640 static bool
19641 const_vector_equal_evenodd_p (rtx op)
19642 {
19643 machine_mode mode = GET_MODE (op);
19644 int i, nunits = GET_MODE_NUNITS (mode);
19645 if (GET_CODE (op) != CONST_VECTOR
19646 || nunits != CONST_VECTOR_NUNITS (op))
19647 return false;
19648 for (i = 0; i < nunits; i += 2)
19649 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19650 return false;
19651 return true;
19652 }
19653
19654 void
19655 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19656 bool uns_p, bool odd_p)
19657 {
19658 machine_mode mode = GET_MODE (op1);
19659 machine_mode wmode = GET_MODE (dest);
19660 rtx x;
19661 rtx orig_op1 = op1, orig_op2 = op2;
19662
19663 if (!nonimmediate_operand (op1, mode))
19664 op1 = force_reg (mode, op1);
19665 if (!nonimmediate_operand (op2, mode))
19666 op2 = force_reg (mode, op2);
19667
19668 /* We only play even/odd games with vectors of SImode. */
19669 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19670
19671 /* If we're looking for the odd results, shift those members down to
19672 the even slots. For some cpus this is faster than a PSHUFD. */
19673 if (odd_p)
19674 {
19675 /* For XOP use vpmacsdqh, but only for smult, as it is only
19676 signed. */
19677 if (TARGET_XOP && mode == V4SImode && !uns_p)
19678 {
19679 x = force_reg (wmode, CONST0_RTX (wmode));
19680 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19681 return;
19682 }
19683
19684 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19685 if (!const_vector_equal_evenodd_p (orig_op1))
19686 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19687 x, NULL, 1, OPTAB_DIRECT);
19688 if (!const_vector_equal_evenodd_p (orig_op2))
19689 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19690 x, NULL, 1, OPTAB_DIRECT);
19691 op1 = gen_lowpart (mode, op1);
19692 op2 = gen_lowpart (mode, op2);
19693 }
19694
19695 if (mode == V16SImode)
19696 {
19697 if (uns_p)
19698 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19699 else
19700 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19701 }
19702 else if (mode == V8SImode)
19703 {
19704 if (uns_p)
19705 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19706 else
19707 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19708 }
19709 else if (uns_p)
19710 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19711 else if (TARGET_SSE4_1)
19712 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19713 else
19714 {
19715 rtx s1, s2, t0, t1, t2;
19716
19717 /* The easiest way to implement this without PMULDQ is to go through
19718 the motions as if we are performing a full 64-bit multiply, except
19719 that we need to do less shuffling of the elements. */
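/* For signed 32-bit a and b with unsigned low parts alo and blo,
   a * b == alo * blo - 2^32 * ((a < 0) * blo + (b < 0) * alo) (mod 2^64).
   The all-ones compare results s1/s2 below play the role of those
   (a < 0) / (b < 0) highparts: multiplying them by the other operand,
   adding and shifting left by 32 yields exactly that correction term.  */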
19720
19721 /* Compute the sign-extension, aka highparts, of the two operands. */
19722 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19723 op1, pc_rtx, pc_rtx);
19724 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19725 op2, pc_rtx, pc_rtx);
19726
19727 /* Multiply LO(A) * HI(B), and vice-versa. */
19728 t1 = gen_reg_rtx (wmode);
19729 t2 = gen_reg_rtx (wmode);
19730 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19731 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19732
19733 /* Multiply LO(A) * LO(B). */
19734 t0 = gen_reg_rtx (wmode);
19735 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19736
19737 /* Combine and shift the highparts into place. */
19738 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19739 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19740 1, OPTAB_DIRECT);
19741
19742 /* Combine high and low parts. */
19743 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19744 return;
19745 }
19746 emit_insn (x);
19747 }
19748
19749 void
19750 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19751 bool uns_p, bool high_p)
19752 {
19753 machine_mode wmode = GET_MODE (dest);
19754 machine_mode mode = GET_MODE (op1);
19755 rtx t1, t2, t3, t4, mask;
19756
19757 switch (mode)
19758 {
19759 case E_V4SImode:
19760 t1 = gen_reg_rtx (mode);
19761 t2 = gen_reg_rtx (mode);
19762 if (TARGET_XOP && !uns_p)
19763 {
19764 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19765 shuffle the elements once so that all elements are in the right
19766 place for immediate use: { A C B D }. */
19767 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19768 const1_rtx, GEN_INT (3)));
19769 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19770 const1_rtx, GEN_INT (3)));
19771 }
19772 else
19773 {
19774 /* Put the elements into place for the multiply. */
19775 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19776 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19777 high_p = false;
19778 }
19779 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19780 break;
19781
19782 case E_V8SImode:
19783 /* Shuffle the elements between the lanes. After this we
19784 have { A B E F | C D G H } for each operand. */
19785 t1 = gen_reg_rtx (V4DImode);
19786 t2 = gen_reg_rtx (V4DImode);
19787 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19788 const0_rtx, const2_rtx,
19789 const1_rtx, GEN_INT (3)));
19790 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19791 const0_rtx, const2_rtx,
19792 const1_rtx, GEN_INT (3)));
19793
19794 /* Shuffle the elements within the lanes. After this we
19795 have { A A B B | C C D D } or { E E F F | G G H H }. */
19796 t3 = gen_reg_rtx (V8SImode);
19797 t4 = gen_reg_rtx (V8SImode);
19798 mask = GEN_INT (high_p
19799 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19800 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19801 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19802 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19803
19804 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19805 break;
19806
19807 case E_V8HImode:
19808 case E_V16HImode:
19809 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19810 uns_p, OPTAB_DIRECT);
19811 t2 = expand_binop (mode,
19812 uns_p ? umul_highpart_optab : smul_highpart_optab,
19813 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19814 gcc_assert (t1 && t2);
19815
19816 t3 = gen_reg_rtx (mode);
19817 ix86_expand_vec_interleave (t3, t1, t2, high_p);
19818 emit_move_insn (dest, gen_lowpart (wmode, t3));
19819 break;
19820
19821 case E_V16QImode:
19822 case E_V32QImode:
19823 case E_V32HImode:
19824 case E_V16SImode:
19825 case E_V64QImode:
19826 t1 = gen_reg_rtx (wmode);
19827 t2 = gen_reg_rtx (wmode);
19828 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19829 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19830
19831 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19832 break;
19833
19834 default:
19835 gcc_unreachable ();
19836 }
19837 }
19838
19839 void
19840 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19841 {
19842 rtx res_1, res_2, res_3, res_4;
19843
19844 res_1 = gen_reg_rtx (V4SImode);
19845 res_2 = gen_reg_rtx (V4SImode);
19846 res_3 = gen_reg_rtx (V2DImode);
19847 res_4 = gen_reg_rtx (V2DImode);
19848 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19849 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19850
19851 /* Move the results in element 2 down to element 1; we don't care
19852 what goes in elements 2 and 3. Then we can merge the parts
19853 back together with an interleave.
19854
19855 Note that two other sequences were tried:
19856 (1) Use interleaves at the start instead of psrldq, which allows
19857 us to use a single shufps to merge things back at the end.
19858 (2) Use shufps here to combine the two vectors, then pshufd to
19859 put the elements in the correct order.
19860 In both cases the cost of the reformatting stall was too high
19861 and the overall sequence slower. */
19862
19863 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19864 const0_rtx, const2_rtx,
19865 const0_rtx, const0_rtx));
19866 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19867 const0_rtx, const2_rtx,
19868 const0_rtx, const0_rtx));
19869 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19870
19871 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19872 }
19873
19874 void
19875 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19876 {
19877 machine_mode mode = GET_MODE (op0);
19878 rtx t1, t2, t3, t4, t5, t6;
19879
19880 if (TARGET_AVX512DQ && mode == V8DImode)
19881 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19882 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19883 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19884 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19885 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19886 else if (TARGET_XOP && mode == V2DImode)
19887 {
19888 /* op1: A,B,C,D, op2: E,F,G,H */
19889 op1 = gen_lowpart (V4SImode, op1);
19890 op2 = gen_lowpart (V4SImode, op2);
19891
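/* Same lo1 * lo2 + ((hi1 * lo2 + hi2 * lo1) << 32) decomposition as the
   generic path below; the pshufd swaps each dword pair so a single
   mulv4si forms all four cross products and phadddq sums them per
   qword.  */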
19892 t1 = gen_reg_rtx (V4SImode);
19893 t2 = gen_reg_rtx (V4SImode);
19894 t3 = gen_reg_rtx (V2DImode);
19895 t4 = gen_reg_rtx (V2DImode);
19896
19897 /* t1: B,A,D,C */
19898 emit_insn (gen_sse2_pshufd_1 (t1, op1,
19899 GEN_INT (1),
19900 GEN_INT (0),
19901 GEN_INT (3),
19902 GEN_INT (2)));
19903
19904 /* t2: (B*E),(A*F),(D*G),(C*H) */
19905 emit_insn (gen_mulv4si3 (t2, t1, op2));
19906
19907 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19908 emit_insn (gen_xop_phadddq (t3, t2));
19909
19910 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19911 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19912
19913 /* Multiply the low parts and add everything together. */
19914 t5 = gen_reg_rtx (V2DImode);
19915 emit_insn (gen_vec_widen_umult_even_v4si (t5,
19916 gen_lowpart (V4SImode, op1),
19917 gen_lowpart (V4SImode, op2)));
19918 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19919 }
19920 else
19921 {
19922 machine_mode nmode;
19923 rtx (*umul) (rtx, rtx, rtx);
19924
19925 if (mode == V2DImode)
19926 {
19927 umul = gen_vec_widen_umult_even_v4si;
19928 nmode = V4SImode;
19929 }
19930 else if (mode == V4DImode)
19931 {
19932 umul = gen_vec_widen_umult_even_v8si;
19933 nmode = V8SImode;
19934 }
19935 else if (mode == V8DImode)
19936 {
19937 umul = gen_vec_widen_umult_even_v16si;
19938 nmode = V16SImode;
19939 }
19940 else
19941 gcc_unreachable ();
19942
19943
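/* Standard 64x64 -> 64 decomposition: with each qword split into 32-bit
   halves, lo1 * lo2 + ((hi1 * lo2 + hi2 * lo1) << 32) gives the full
   product modulo 2^64; the hi1 * hi2 term only affects bits >= 64.  */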
19944 /* Multiply low parts. */
19945 t1 = gen_reg_rtx (mode);
19946 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19947
19948 /* Shift input vectors right 32 bits so we can multiply high parts. */
19949 t6 = GEN_INT (32);
19950 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19951 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19952
19953 /* Multiply high parts by low parts. */
19954 t4 = gen_reg_rtx (mode);
19955 t5 = gen_reg_rtx (mode);
19956 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19957 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19958
19959 /* Combine and shift the highparts back. */
19960 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19961 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19962
19963 /* Combine high and low parts. */
19964 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19965 }
19966
19967 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19968 gen_rtx_MULT (mode, op1, op2));
19969 }
19970
19971 /* Return true if the control transfer instruction INSN
19972 should be encoded with the notrack prefix. */
19973
19974 bool
19975 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
19976 {
19977 if (!insn || !((flag_cf_protection & CF_BRANCH)))
19978 return false;
19979
19980 if (CALL_P (insn))
19981 {
19982 rtx call = get_call_rtx_from (insn);
19983 gcc_assert (call != NULL_RTX);
19984 rtx addr = XEXP (call, 0);
19985
19986 /* Do not emit 'notrack' if it's not an indirect call. */
19987 if (MEM_P (addr)
19988 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19989 return false;
19990 else
19991 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19992 }
19993
19994 if (JUMP_P (insn) && !flag_cet_switch)
19995 {
19996 rtx target = JUMP_LABEL (insn);
19997 if (target == NULL_RTX || ANY_RETURN_P (target))
19998 return false;
19999
20000 /* Check whether the jump targets a switch table. */
20001 rtx_insn *label = as_a<rtx_insn *> (target);
20002 rtx_insn *table = next_insn (label);
20003 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20004 return false;
20005 else
20006 return true;
20007 }
20008 return false;
20009 }
20010
20011 /* Calculate integer abs() using only SSE2 instructions. */
20012
20013 void
20014 ix86_expand_sse2_abs (rtx target, rtx input)
20015 {
20016 machine_mode mode = GET_MODE (target);
20017 rtx tmp0, tmp1, x;
20018
20019 switch (mode)
20020 {
20021 case E_V2DImode:
20022 case E_V4DImode:
20023 /* For 64-bit signed integer X, with SSE4.2 use
20024 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20025 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
20026 32, use a logical instead of an arithmetic right shift (which is
20027 unimplemented for these modes) and negate the result to form the mask. */
20028 if (TARGET_SSE4_2)
20029 {
20030 tmp0 = gen_reg_rtx (mode);
20031 tmp1 = gen_reg_rtx (mode);
20032 emit_move_insn (tmp1, CONST0_RTX (mode));
20033 if (mode == E_V2DImode)
20034 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20035 else
20036 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20037 }
20038 else
20039 {
20040 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20041 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20042 - 1), NULL, 0, OPTAB_DIRECT);
20043 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20044 }
20045
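/* Either way tmp0 is now 0 for non-negative elements and -1 for negative
   ones, so (X ^ tmp0) - tmp0 below computes the absolute value.  */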
20046 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20047 NULL, 0, OPTAB_DIRECT);
20048 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20049 target, 0, OPTAB_DIRECT);
20050 break;
20051
20052 case E_V4SImode:
20053 /* For 32-bit signed integer X, the best way to calculate the absolute
20054 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
20055 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20056 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20057 NULL, 0, OPTAB_DIRECT);
20058 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20059 NULL, 0, OPTAB_DIRECT);
20060 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20061 target, 0, OPTAB_DIRECT);
20062 break;
20063
20064 case E_V8HImode:
20065 /* For 16-bit signed integer X, the best way to calculate the absolute
20066 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20067 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20068
20069 x = expand_simple_binop (mode, SMAX, tmp0, input,
20070 target, 0, OPTAB_DIRECT);
20071 break;
20072
20073 case E_V16QImode:
20074 /* For 8-bit signed integer X, the best way to calculate the absolute
20075 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20076 as SSE2 provides the PMINUB insn. */
20077 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20078
20079 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20080 target, 0, OPTAB_DIRECT);
20081 break;
20082
20083 default:
20084 gcc_unreachable ();
20085 }
20086
20087 if (x != target)
20088 emit_move_insn (target, x);
20089 }
20090
20091 /* Expand an extract from a vector register through pextr insn.
20092 Return true if successful. */
20093
20094 bool
20095 ix86_expand_pextr (rtx *operands)
20096 {
20097 rtx dst = operands[0];
20098 rtx src = operands[1];
20099
20100 unsigned int size = INTVAL (operands[2]);
20101 unsigned int pos = INTVAL (operands[3]);
20102
20103 if (SUBREG_P (dst))
20104 {
20105 /* Reject non-lowpart subregs. */
20106 if (SUBREG_BYTE (dst) > 0)
20107 return false;
20108 dst = SUBREG_REG (dst);
20109 }
20110
20111 if (SUBREG_P (src))
20112 {
20113 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20114 src = SUBREG_REG (src);
20115 }
20116
20117 switch (GET_MODE (src))
20118 {
20119 case E_V16QImode:
20120 case E_V8HImode:
20121 case E_V4SImode:
20122 case E_V2DImode:
20123 case E_V1TImode:
20124 {
20125 machine_mode srcmode, dstmode;
20126 rtx d, pat;
20127
20128 if (!int_mode_for_size (size, 0).exists (&dstmode))
20129 return false;
20130
20131 switch (dstmode)
20132 {
20133 case E_QImode:
20134 if (!TARGET_SSE4_1)
20135 return false;
20136 srcmode = V16QImode;
20137 break;
20138
20139 case E_HImode:
20140 if (!TARGET_SSE2)
20141 return false;
20142 srcmode = V8HImode;
20143 break;
20144
20145 case E_SImode:
20146 if (!TARGET_SSE4_1)
20147 return false;
20148 srcmode = V4SImode;
20149 break;
20150
20151 case E_DImode:
20152 gcc_assert (TARGET_64BIT);
20153 if (!TARGET_SSE4_1)
20154 return false;
20155 srcmode = V2DImode;
20156 break;
20157
20158 default:
20159 return false;
20160 }
20161
20162 /* Reject extractions from misaligned positions. */
20163 if (pos & (size-1))
20164 return false;
20165
20166 if (GET_MODE (dst) == dstmode)
20167 d = dst;
20168 else
20169 d = gen_reg_rtx (dstmode);
20170
20171 /* Construct insn pattern. */
20172 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20173 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20174
20175 /* Let the rtl optimizers know about the zero extension performed. */
20176 if (dstmode == QImode || dstmode == HImode)
20177 {
20178 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20179 d = gen_lowpart (SImode, d);
20180 }
20181
20182 emit_insn (gen_rtx_SET (d, pat));
20183
20184 if (d != dst)
20185 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20186 return true;
20187 }
20188
20189 default:
20190 return false;
20191 }
20192 }
20193
20194 /* Expand an insert into a vector register through pinsr insn.
20195 Return true if successful. */
20196
20197 bool
20198 ix86_expand_pinsr (rtx *operands)
20199 {
20200 rtx dst = operands[0];
20201 rtx src = operands[3];
20202
20203 unsigned int size = INTVAL (operands[1]);
20204 unsigned int pos = INTVAL (operands[2]);
20205
20206 if (SUBREG_P (dst))
20207 {
20208 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20209 dst = SUBREG_REG (dst);
20210 }
20211
20212 switch (GET_MODE (dst))
20213 {
20214 case E_V16QImode:
20215 case E_V8HImode:
20216 case E_V4SImode:
20217 case E_V2DImode:
20218 case E_V1TImode:
20219 {
20220 machine_mode srcmode, dstmode;
20221 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20222 rtx d;
20223
20224 if (!int_mode_for_size (size, 0).exists (&srcmode))
20225 return false;
20226
20227 switch (srcmode)
20228 {
20229 case E_QImode:
20230 if (!TARGET_SSE4_1)
20231 return false;
20232 dstmode = V16QImode;
20233 pinsr = gen_sse4_1_pinsrb;
20234 break;
20235
20236 case E_HImode:
20237 if (!TARGET_SSE2)
20238 return false;
20239 dstmode = V8HImode;
20240 pinsr = gen_sse2_pinsrw;
20241 break;
20242
20243 case E_SImode:
20244 if (!TARGET_SSE4_1)
20245 return false;
20246 dstmode = V4SImode;
20247 pinsr = gen_sse4_1_pinsrd;
20248 break;
20249
20250 case E_DImode:
20251 gcc_assert (TARGET_64BIT);
20252 if (!TARGET_SSE4_1)
20253 return false;
20254 dstmode = V2DImode;
20255 pinsr = gen_sse4_1_pinsrq;
20256 break;
20257
20258 default:
20259 return false;
20260 }
20261
20262 /* Reject insertions to misaligned positions. */
20263 if (pos & (size-1))
20264 return false;
20265
20266 if (SUBREG_P (src))
20267 {
20268 unsigned int srcpos = SUBREG_BYTE (src);
20269
20270 if (srcpos > 0)
20271 {
20272 rtx extr_ops[4];
20273
20274 extr_ops[0] = gen_reg_rtx (srcmode);
20275 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20276 extr_ops[2] = GEN_INT (size);
20277 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20278
20279 if (!ix86_expand_pextr (extr_ops))
20280 return false;
20281
20282 src = extr_ops[0];
20283 }
20284 else
20285 src = gen_lowpart (srcmode, SUBREG_REG (src));
20286 }
20287
20288 if (GET_MODE (dst) == dstmode)
20289 d = dst;
20290 else
20291 d = gen_reg_rtx (dstmode);
20292
20293 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20294 gen_lowpart (srcmode, src),
20295 GEN_INT (1 << (pos / size))));
20296 if (d != dst)
20297 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20298 return true;
20299 }
20300
20301 default:
20302 return false;
20303 }
20304 }
20305
20306 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
20307 of the upper against the lower halves down to SSE register size. */
20308
20309 machine_mode
20310 ix86_split_reduction (machine_mode mode)
20311 {
20312 /* Reduce lowpart against highpart until we reach SSE reg width to
20313 avoid cross-lane operations. */
20314 switch (mode)
20315 {
20316 case E_V8DImode:
20317 case E_V4DImode:
20318 return V2DImode;
20319 case E_V16SImode:
20320 case E_V8SImode:
20321 return V4SImode;
20322 case E_V32HImode:
20323 case E_V16HImode:
20324 return V8HImode;
20325 case E_V64QImode:
20326 case E_V32QImode:
20327 return V16QImode;
20328 case E_V16SFmode:
20329 case E_V8SFmode:
20330 return V4SFmode;
20331 case E_V8DFmode:
20332 case E_V4DFmode:
20333 return V2DFmode;
20334 default:
20335 return mode;
20336 }
20337 }
20338
20339 /* Generate call to __divmoddi4. */
20340
20341 void
20342 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20343 rtx op0, rtx op1,
20344 rtx *quot_p, rtx *rem_p)
20345 {
20346 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20347
20348 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20349 mode, op0, mode, op1, mode,
20350 XEXP (rem, 0), Pmode);
20351 *quot_p = quot;
20352 *rem_p = rem;
20353 }
20354
20355 #include "gt-i386-expand.h"
20356