; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512
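; Check-prefix scheme: the shared SSE/AVX/AVX1/AVX512 prefixes cover output
; common to both triples, while the X86-*/X64-* prefixes capture the 32-bit vs
; 64-bit differences (stack vs register arguments, retl vs retq).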

@g16 = external global i16

define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X86-SSE-LABEL: pinsrd_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x44,0x24,0x04,0x01]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrd_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrd_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrd_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrd $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0xc7,0x01]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrd_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrd_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}
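
; Note: pinsrd's immediate selects the destination dword lane ($1 = element 1).
; Under the i386 calling convention %s arrives on the stack, so the load folds
; into pinsrd's memory operand; on x86-64 it arrives in %edi and is inserted
; directly from the GPR.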

define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X86-SSE-LABEL: pinsrb_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0x44,0x24,0x04,0x01]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrb_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrb_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrb_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrb $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0xc7,0x01]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrb_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrb_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

define <2 x i64> @pmovzxbq_1() nounwind {
; X86-SSE-LABEL: pmovzxbq_1:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-SSE-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-SSE-NEXT:    pmovzxbq (%eax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-SSE-NEXT:    ## encoding: [0x66,0x0f,0x38,0x32,0x00]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pmovzxbq_1:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-AVX1-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-AVX1-NEXT:    vpmovzxbq (%eax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pmovzxbq_1:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-AVX512-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-AVX512-NEXT:    vpmovzxbq (%eax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pmovzxbq_1:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-SSE-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-SSE-NEXT:    pmovzxbq (%rax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT:    ## encoding: [0x66,0x0f,0x38,0x32,0x00]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pmovzxbq_1:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-AVX1-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-AVX1-NEXT:    vpmovzxbq (%rax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX1-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pmovzxbq_1:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-AVX512-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-AVX512-NEXT:    vpmovzxbq (%rax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
	%0 = load i16, ptr @g16, align 2		; <i16> [#uses=1]
	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
	ret <2 x i64> %3
}

declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
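
; pmovzxbq zero-extends the two lowest source bytes to two i64 lanes (the
; mem[0],zero,...,mem[1],zero,... pattern decoded above), so the 16-bit load
; of @g16 is folded straight into the instruction's memory operand.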

define i32 @extractps_1(<4 x float> %v) nounwind {
; SSE-LABEL: extractps_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: extractps_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: extractps_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
define i32 @extractps_2(<4 x float> %v) nounwind {
; SSE-LABEL: extractps_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: extractps_2:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: extractps_2:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcasted to i32, but unsuitable for much of anything else.
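; Below, ext_1 and ext_2 keep the element as a float, so codegen uses shufps
; (plus addss in ext_1) instead; ext_3 returns the bits as an i32, so a single
; extractps into %eax suffices.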

define float @ext_1(<4 x float> %v) nounwind {
; X86-SSE-LABEL: ext_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
; X86-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X86-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
; X86-SSE-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: ext_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX1-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: ext_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX512-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: ext_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X64-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
; X64-SSE-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: ext_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: ext_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}

define float @ext_2(<4 x float> %v) nounwind {
; X86-SSE-LABEL: ext_2:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
; X86-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X86-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: ext_2:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX1-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: ext_2:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX512-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: ext_2:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
; X64-SSE-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: ext_2:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: ext_2:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}

define i32 @ext_3(<4 x i32> %v) nounwind {
; SSE-LABEL: ext_3:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: ext_3:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: ext_3:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; SSE-LABEL: insertps_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $21, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x15]
; SSE-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
; AVX1-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
; AVX512-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
  ret <4 x float> %tmp1
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
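
; insertps immediate reference (Intel SDM): bits [7:6] select the source
; element (COUNT_S), bits [5:4] the destination lane (COUNT_D), and bits [3:0]
; zero destination lanes (ZMASK). Decoding $21 = 0b00010101 above: xmm1[0] is
; written to lane 1 and lanes 0 and 2 are zeroed, matching the shuffle comment.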

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
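; (On many cores blendps can issue on more execution ports than insertps,
; which is typically restricted to the shuffle port, so the extra movss is
; still a throughput win.)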
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X86-SSE-LABEL: blendps_not_insertps_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: blendps_not_insertps_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: blendps_not_insertps_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: blendps_not_insertps_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX-LABEL: blendps_not_insertps_1:
; X64-AVX:       ## %bb.0:
; X64-AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X64-AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X86 but not for X64!
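; (Under the i386 calling convention %t2 arrives on the stack, giving a load
; to fold; under x86-64 SysV it is already in %xmm1. The checks below currently
; realize both sides as movss blends, with X86 paying an extra movss load.)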
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X86-SSE-LABEL: insertps_or_blendps:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_or_blendps:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_or_blendps:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_or_blendps:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_or_blendps:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_or_blendps:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
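; (blendps's immediate is one bit per dword lane taken from the second source:
; $1 sets only bit 0, i.e. xmm0 = xmm1[0],xmm0[1,2,3] as decoded below.)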
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; SSE-LABEL: blendps_not_insertps_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: blendps_not_insertps_2:
; AVX:       ## %bb.0:
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}

define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_1:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_2:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_3:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_3:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
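; insertps $0 merely copies src[0] into dst[0] with no zeroing, which a plain
; movss register move already does; the surviving insertps $16 places the sum
; of the high elements into lane 1 (COUNT_D = 1, ZMASK = 0).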
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
; SSE-LABEL: buildvector:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0]
; SSE-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup %xmm1, %xmm3 ## encoding: [0xf3,0x0f,0x16,0xd9]
; SSE-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm2, %xmm3 ## encoding: [0xf3,0x0f,0x58,0xda]
; SSE-NEXT:    addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
; SSE-NEXT:    insertps $16, %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc3,0x10]
; SSE-NEXT:    ## xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: buildvector:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vmovshdup %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x16,0xd0]
; AVX1-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup %xmm1, %xmm3 ## encoding: [0xc5,0xfa,0x16,0xd9]
; AVX1-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xea,0x58,0xd3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
; AVX1-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: buildvector:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]
; AVX512-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd9]
; AVX512-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x58,0xd3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
; AVX512-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, ptr nocapture readonly %pb) {
; X86-SSE-LABEL: insertps_from_shufflevector_1:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_shufflevector_1:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_shufflevector_1:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_shufflevector_1:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_shufflevector_1:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_shufflevector_1:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load <4 x float>, ptr %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insertps_from_shufflevector_2:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_2:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_2:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps.
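; (For this splat-able load the checks below actually show pshufd $0 + pblendw
; on SSE and vbroadcastss + vblendps on AVX; the load-folding pinsrd form shows
; up in insertps_from_load_ins_elt_undef_i32 further down.)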
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture readonly %pb) {
; X86-SSE-LABEL: pinsrd_from_shufflevector_i32:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    pshufd $0, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x00]
; X86-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
; X86-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x08]
; X86-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
; X86-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrd_from_shufflevector_i32:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    pshufd $0, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x00]
; X64-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
; X64-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x0f]
; X64-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
; X64-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load <4 x i32>, ptr %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: insertps_from_shufflevector_i32_2:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
; SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_i32_2:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_i32_2:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, ptr %b) {
; X86-SSE-LABEL: insertps_from_load_ins_elt_undef:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    insertps $16, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x10]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vinsertps $16, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vinsertps $16, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_load_ins_elt_undef:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    insertps $16, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x10]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load float, ptr %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, ptr %b) {
; X86-SSE-LABEL: insertps_from_load_ins_elt_undef_i32:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    pinsrd $2, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x00,0x02]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrd $2, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x07,0x02]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrd $2, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrd $2, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load i32, ptr %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}

;;;;;; Shuffles optimizable with a single insertps or blend instruction
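; Test naming convention: X/Y/Z/W stand for lanes 0-3 of %x, A/B/C/D for lanes
; 0-3 of %a, and 0 for a zeroed lane in the expected result.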
803define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
804; SSE-LABEL: shuf_XYZ0:
805; SSE:       ## %bb.0:
806; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
807; SSE-NEXT:    blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08]
808; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
809; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
810;
811; AVX1-LABEL: shuf_XYZ0:
812; AVX1:       ## %bb.0:
813; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
814; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
815; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
816; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
817;
818; AVX512-LABEL: shuf_XYZ0:
819; AVX512:       ## %bb.0:
820; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
821; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
822; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
823; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
824  %vecext = extractelement <4 x float> %x, i32 0
825  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
826  %vecext1 = extractelement <4 x float> %x, i32 1
827  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
828  %vecext3 = extractelement <4 x float> %x, i32 2
829  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
830  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
831  ret <4 x float> %vecinit5
832}
833
834define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
835; SSE-LABEL: shuf_XY00:
836; SSE:       ## %bb.0:
837; SSE-NEXT:    movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
838; SSE-NEXT:    ## xmm0 = xmm0[0],zero
839; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
840;
841; AVX1-LABEL: shuf_XY00:
842; AVX1:       ## %bb.0:
843; AVX1-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
844; AVX1-NEXT:    ## xmm0 = xmm0[0],zero
845; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
846;
847; AVX512-LABEL: shuf_XY00:
848; AVX512:       ## %bb.0:
849; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
850; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
851; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
852  %vecext = extractelement <4 x float> %x, i32 0
853  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
854  %vecext1 = extractelement <4 x float> %x, i32 1
855  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
856  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
857  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
858  ret <4 x float> %vecinit4
859}
860
861define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
862; SSE-LABEL: shuf_XYY0:
863; SSE:       ## %bb.0:
864; SSE-NEXT:    insertps $104, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x68]
865; SSE-NEXT:    ## xmm0 = xmm0[0,1,1],zero
866; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
867;
868; AVX1-LABEL: shuf_XYY0:
869; AVX1:       ## %bb.0:
870; AVX1-NEXT:    vinsertps $104, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68]
871; AVX1-NEXT:    ## xmm0 = xmm0[0,1,1],zero
872; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
873;
874; AVX512-LABEL: shuf_XYY0:
875; AVX512:       ## %bb.0:
876; AVX512-NEXT:    vinsertps $104, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68]
877; AVX512-NEXT:    ## xmm0 = xmm0[0,1,1],zero
878; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
879  %vecext = extractelement <4 x float> %x, i32 0
880  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
881  %vecext1 = extractelement <4 x float> %x, i32 1
882  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
883  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
884  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
885  ret <4 x float> %vecinit5
886}
887
888define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
889; SSE-LABEL: shuf_XYW0:
890; SSE:       ## %bb.0:
891; SSE-NEXT:    insertps $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xe8]
892; SSE-NEXT:    ## xmm0 = xmm0[0,1,3],zero
893; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
894;
895; AVX1-LABEL: shuf_XYW0:
896; AVX1:       ## %bb.0:
897; AVX1-NEXT:    vinsertps $232, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8]
898; AVX1-NEXT:    ## xmm0 = xmm0[0,1,3],zero
899; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
900;
901; AVX512-LABEL: shuf_XYW0:
902; AVX512:       ## %bb.0:
903; AVX512-NEXT:    vinsertps $232, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8]
904; AVX512-NEXT:    ## xmm0 = xmm0[0,1,3],zero
905; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
906  %vecext = extractelement <4 x float> %x, i32 0
907  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
908  %vecext1 = extractelement <4 x float> %x, i32 1
909  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
910  %vecext2 = extractelement <4 x float> %x, i32 3
911  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
912  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
913  ret <4 x float> %vecinit4
914}
915
916define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
917; SSE-LABEL: shuf_W00W:
918; SSE:       ## %bb.0:
919; SSE-NEXT:    insertps $198, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xc6]
920; SSE-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
921; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
922;
923; AVX1-LABEL: shuf_W00W:
924; AVX1:       ## %bb.0:
925; AVX1-NEXT:    vinsertps $198, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6]
926; AVX1-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
927; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
928;
929; AVX512-LABEL: shuf_W00W:
930; AVX512:       ## %bb.0:
931; AVX512-NEXT:    vinsertps $198, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6]
932; AVX512-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
933; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
934  %vecext = extractelement <4 x float> %x, i32 3
935  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
936  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
937  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
938  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
939  ret <4 x float> %vecinit4
940}
941
942define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
943; SSE-LABEL: shuf_X00A:
944; SSE:       ## %bb.0:
945; SSE-NEXT:    insertps $54, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x36]
946; SSE-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
947; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
948;
949; AVX1-LABEL: shuf_X00A:
950; AVX1:       ## %bb.0:
951; AVX1-NEXT:    vinsertps $54, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36]
952; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
953; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
954;
955; AVX512-LABEL: shuf_X00A:
956; AVX512:       ## %bb.0:
957; AVX512-NEXT:    vinsertps $54, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36]
958; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
959; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
960  %vecext = extractelement <4 x float> %x, i32 0
961  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
962  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
963  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
964  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
965  ret <4 x float> %vecinit4
966}
967
968define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
969; SSE-LABEL: shuf_X00X:
970; SSE:       ## %bb.0:
971; SSE-NEXT:    insertps $54, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x36]
972; SSE-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
973; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
974;
975; AVX1-LABEL: shuf_X00X:
976; AVX1:       ## %bb.0:
977; AVX1-NEXT:    vinsertps $54, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36]
978; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
979; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
980;
981; AVX512-LABEL: shuf_X00X:
982; AVX512:       ## %bb.0:
983; AVX512-NEXT:    vinsertps $54, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36]
984; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
985; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
986  %vecext = extractelement <4 x float> %x, i32 0
987  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
988  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
989  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
990  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
991  ret <4 x float> %vecinit4
992}
993
994define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
995; SSE-LABEL: shuf_X0YC:
996; SSE:       ## %bb.0:
997; SSE-NEXT:    xorps %xmm2, %xmm2 ## encoding: [0x0f,0x57,0xd2]
998; SSE-NEXT:    unpcklps %xmm2, %xmm0 ## encoding: [0x0f,0x14,0xc2]
999; SSE-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1000; SSE-NEXT:    insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0]
1001; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
1002; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1003;
1004; AVX1-LABEL: shuf_X0YC:
1005; AVX1:       ## %bb.0:
1006; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
1007; AVX1-NEXT:    vunpcklps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc2]
1008; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1009; AVX1-NEXT:    vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
1010; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
1011; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1012;
1013; AVX512-LABEL: shuf_X0YC:
1014; AVX512:       ## %bb.0:
1015; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
1016; AVX512-NEXT:    vunpcklps %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc2]
1017; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1018; AVX512-NEXT:    vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
1019; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
1020; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1021  %vecext = extractelement <4 x float> %x, i32 0
1022  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1023  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
1024  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
1025  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
1026  ret <4 x float> %vecinit5
1027}
1028
1029define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
1030; SSE-LABEL: i32_shuf_XYZ0:
1031; SSE:       ## %bb.0:
1032; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
1033; SSE-NEXT:    blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08]
1034; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1035; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1036;
1037; AVX1-LABEL: i32_shuf_XYZ0:
1038; AVX1:       ## %bb.0:
1039; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1040; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1041; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1042; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1043;
1044; AVX512-LABEL: i32_shuf_XYZ0:
1045; AVX512:       ## %bb.0:
1046; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1047; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1048; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1049; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1050  %vecext = extractelement <4 x i32> %x, i32 0
1051  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1052  %vecext1 = extractelement <4 x i32> %x, i32 1
1053  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
1054  %vecext3 = extractelement <4 x i32> %x, i32 2
1055  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
1056  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
1057  ret <4 x i32> %vecinit5
1058}
1059
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_XY00:
; SSE:       ## %bb.0:
; SSE-NEXT:    movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
; SSE-NEXT:    ## xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_XY00:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_XY00:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_XYY0:
; SSE:       ## %bb.0:
; SSE-NEXT:    pshufd $212, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xd4]
; SSE-NEXT:    ## xmm1 = xmm0[0,1,1,3]
; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
; SSE-NEXT:    pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f]
; SSE-NEXT:    ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_XYY0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,1,3]
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_XYY0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vshufps $212, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xd4]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,1,3]
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_XYW0:
; SSE:       ## %bb.0:
; SSE-NEXT:    pshufd $244, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xf4]
; SSE-NEXT:    ## xmm1 = xmm0[0,1,3,3]
; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
; SSE-NEXT:    pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f]
; SSE-NEXT:    ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_XYW0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vshufps $244, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xf4]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,3,3]
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_XYW0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vshufps $244, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xf4]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,3,3]
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_W00W:
; SSE:       ## %bb.0:
; SSE-NEXT:    pshufd $255, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xff]
; SSE-NEXT:    ## xmm1 = xmm0[3,3,3,3]
; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
; SSE-NEXT:    pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3]
; SSE-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_W00W:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; AVX1-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_W00W:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; AVX512-NEXT:    ## xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 3
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_X00A:
; SSE:       ## %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2 ## encoding: [0x66,0x0f,0xef,0xd2]
; SSE-NEXT:    pblendw $252, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0xfc]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE-NEXT:    pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00]
; SSE-NEXT:    ## xmm1 = xmm1[0,0,0,0]
; SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
; SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_X00A:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
; AVX1-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
; AVX1-NEXT:    vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00]
; AVX1-NEXT:    ## xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_X00A:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
; AVX512-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_X00X:
; SSE:       ## %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
; SSE-NEXT:    pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00]
; SSE-NEXT:    ## xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_X00X:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; AVX1-NEXT:    ## xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_X00X:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_X0YC:
; SSE:       ## %bb.0:
; SSE-NEXT:    pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0]
; SSE-NEXT:    ## xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa]
; SSE-NEXT:    ## xmm0 = xmm1[2,2,2,2]
; SSE-NEXT:    pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f]
; SSE-NEXT:    ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_X0YC:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
; AVX1-NEXT:    ## xmm1 = xmm1[2,2,2,2]
; AVX1-NEXT:    vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_X0YC:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
; AVX512-NEXT:    ## xmm1 = xmm1[2,2,2,2]
; AVX512-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}

;; Test for a bug in the first implementation of LowerBuildVectorv4x32
define <4 x float> @test_insertps_no_undef(<4 x float> %x) {
; SSE-LABEL: test_insertps_no_undef:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $7, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc8,0x07]
; SSE-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
; SSE-NEXT:    maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: test_insertps_no_undef:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08]
; AVX1-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: test_insertps_no_undef:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08]
; AVX512-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select <4 x i1> %mask, <4 x float> %x, <4 x float> %vecinit5
  ret <4 x float> %res
}

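; There is no variable word-blend instruction, so the <8 x i1> mask has to be
; widened first: psllw $15 + psraw $15 sign-extend bit 0 of each word into a
; full-width mask for pblendvb. AVX512 can instead move the word sign bits
; into a k-register (vpmovw2m) and use a masked blend.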
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; SSE-LABEL: blendvb_fallback:
; SSE:       ## %bb.0:
; SSE-NEXT:    psllw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x0f]
; SSE-NEXT:    psraw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x0f]
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x38,0x10,0xd1]
; SSE-NEXT:    movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: blendvb_fallback:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x0f]
; AVX1-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: blendvb_fallback:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
; AVX512-NEXT:    vpmovw2m %xmm0, %k1 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc8]
; AVX512-NEXT:    vpblendmw %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x66,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}

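; For the insertps tests that follow, the immediate decodes (per the SSE4.1
; definition) as: bits [7:6] = CountS (source lane), bits [5:4] = CountD
; (destination lane), bits [3:0] = ZMask (destination lanes forced to zero).
; With a 32-bit memory source, CountS is ignored and the loaded element is
; used directly.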
; On X86, account for the pointer argument's move from the stack into a register
define <4 x float> @insertps_from_vector_load(<4 x float> %a, ptr nocapture readonly %pb) {
; X86-SSE-LABEL: insertps_from_vector_load:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_vector_load:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_vector_load:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_vector_load:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_vector_load:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_vector_load:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load <4 x float>, ptr %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocapture readonly %pb) {
; X86-SSE-LABEL: insertps_from_vector_load_offset:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_vector_load_offset:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_vector_load_offset:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_vector_load_offset:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_vector_load_offset:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_vector_load_offset:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load <4 x float>, ptr %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}

;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocapture readonly %pb, i64 %index) {
; X86-SSE-LABEL: insertps_from_vector_load_offset_2:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; X86-SSE-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
; X86-SSE-NEXT:    movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08]
; X86-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
; X86-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_vector_load_offset_2:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; X86-AVX1-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08]
; X86-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_vector_load_offset_2:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; X86-AVX512-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08]
; X86-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_vector_load_offset_2:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
; X64-SSE-NEXT:    movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37]
; X64-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
; X64-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_vector_load_offset_2:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
; X64-AVX1-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37]
; X64-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X64-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_vector_load_offset_2:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
; X64-AVX512-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37]
; X64-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X64-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index
  %2 = load <4 x float>, ptr %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}

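; Only lane 0 of the splat feeds the insertps (CountS = 0 in imm 48), so the
; broadcast should fold into a single insertps with a 32-bit memory operand.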
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, ptr nocapture readonly %fb, i64 %index) {
; X86-SSE-LABEL: insertps_from_broadcast_loadf32:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-SSE-NEXT:    insertps $48, (%ecx,%eax,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0x81,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_broadcast_loadf32:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_broadcast_loadf32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadf32:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    insertps $48, (%rdi,%rsi,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0xb7,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_broadcast_loadf32:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadf32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = getelementptr inbounds float, ptr %fb, i64 %index
  %2 = load float, ptr %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapture readonly %b) {
; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load <4 x float>, ptr %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

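; With several uses of the same splat the load should stay in a register:
; one movss (SSE) or vbroadcastss (AVX) feeding all four inserts/blends.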
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr nocapture readonly %fb, i64 %index) {
; X86-SSE-LABEL: insertps_from_broadcast_multiple_use:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-SSE-NEXT:    movss (%ecx,%eax,4), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x24,0x81]
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
; X86-SSE-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X86-SSE-NEXT:    addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
; X86-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
; X86-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
; X86-SSE-NEXT:    addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
; X86-SSE-NEXT:    addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_multiple_use:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movss (%rdi,%rsi,4), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x24,0xb7]
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
; X64-SSE-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X64-SSE-NEXT:    addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
; X64-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
; X64-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
; X64-SSE-NEXT:    addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
; X64-SSE-NEXT:    addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
; X64-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
; X64-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = getelementptr inbounds float, ptr %fb, i64 %index
  %2 = load float, ptr %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
}

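; Mask <4, undef, 0, 7> only needs the loaded scalar in lane 0 and %a[0] in
; lane 2, so movlhps (merging the two low quadwords) should cover it.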
define <4 x float> @insertps_with_undefs(<4 x float> %a, ptr %b) {
; X86-SSE-LABEL: insertps_with_undefs:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x08]
; X86-SSE-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
; X86-SSE-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
; X86-SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_with_undefs:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x08]
; X86-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_with_undefs:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
; X86-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_with_undefs:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x0f]
; X64-SSE-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
; X64-SSE-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_with_undefs:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x0f]
; X64-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_with_undefs:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X64-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
; X64-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load float, ptr %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
  ret <4 x float> %result
}

; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
; the destination index to change the load, instead of the source index.
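; Here imm = 178 = 0b10110010: CountS = 2 (source lane), CountD = 3
; (destination lane), ZMask = 0b0010, so the load must be indexed by the
; source lane, not the destination lane.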
define <4 x float> @pr20087(<4 x float> %a, ptr %ptr) {
; X86-SSE-LABEL: pr20087:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pr20087:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pr20087:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pr20087:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pr20087:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pr20087:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %load = load <4 x float>, ptr %ptr
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
}

; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
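; That is, lane 0 comes from %shuffle109 and lane 1 from lane 3 of
; %shuffle116; the upper two lanes are undef.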
define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, ptr noalias nocapture %RET) #1 {
; X86-SSE-LABEL: insertps_pr20411:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
; X86-SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; X86-SSE-NEXT:    pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
; X86-SSE-NEXT:    ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X86-SSE-NEXT:    movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_pr20411:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X86-AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; X86-AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_pr20411:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X86-AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; X86-AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_pr20411:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
; X64-SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; X64-SSE-NEXT:    pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
; X64-SSE-NEXT:    ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-SSE-NEXT:    movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_pr20411:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X64-AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; X64-AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_pr20411:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X64-AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
; X64-AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
  store <4 x i32> %shuffle117, ptr %RET, align 4
  ret void
}

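; insertps_4 through insertps_10 exercise ZMask combinations; e.g. here
; imm = 170 = 0b10101010: CountS = 2, CountD = 2, and the ZMask zeroes
; lanes 1 and 3.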
1931define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
1932; SSE-LABEL: insertps_4:
1933; SSE:       ## %bb.0:
1934; SSE-NEXT:    insertps $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xaa]
1935; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1936; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1937;
1938; AVX1-LABEL: insertps_4:
1939; AVX1:       ## %bb.0:
1940; AVX1-NEXT:    vinsertps $170, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa]
1941; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1942; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1943;
1944; AVX512-LABEL: insertps_4:
1945; AVX512:       ## %bb.0:
1946; AVX512-NEXT:    vinsertps $170, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa]
1947; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1948; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1949  %vecext = extractelement <4 x float> %A, i32 0
1950  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1951  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1952  %vecext2 = extractelement <4 x float> %B, i32 2
1953  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1954  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1955  ret <4 x float> %vecinit4
1956}
1957
1958define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
1959; SSE-LABEL: insertps_5:
1960; SSE:       ## %bb.0:
1961; SSE-NEXT:    insertps $92, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x5c]
1962; SSE-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1963; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1964;
1965; AVX1-LABEL: insertps_5:
1966; AVX1:       ## %bb.0:
1967; AVX1-NEXT:    vinsertps $92, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c]
1968; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1969; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1970;
1971; AVX512-LABEL: insertps_5:
1972; AVX512:       ## %bb.0:
1973; AVX512-NEXT:    vinsertps $92, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c]
1974; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1975; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1976  %vecext = extractelement <4 x float> %A, i32 0
1977  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1978  %vecext1 = extractelement <4 x float> %B, i32 1
1979  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1980  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1981  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1982  ret <4 x float> %vecinit4
1983}
1984
1985define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
1986; SSE-LABEL: insertps_6:
1987; SSE:       ## %bb.0:
1988; SSE-NEXT:    insertps $169, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xa9]
1989; SSE-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
1990; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1991;
1992; AVX1-LABEL: insertps_6:
1993; AVX1:       ## %bb.0:
1994; AVX1-NEXT:    vinsertps $169, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9]
1995; AVX1-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
1996; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1997;
1998; AVX512-LABEL: insertps_6:
1999; AVX512:       ## %bb.0:
2000; AVX512-NEXT:    vinsertps $169, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9]
2001; AVX512-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
2002; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2003  %vecext = extractelement <4 x float> %A, i32 1
2004  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
2005  %vecext1 = extractelement <4 x float> %B, i32 2
2006  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
2007  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
2008  ret <4 x float> %vecinit3
2009}
2010
2011define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
2012; SSE-LABEL: insertps_7:
2013; SSE:       ## %bb.0:
2014; SSE-NEXT:    insertps $106, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x6a]
2015; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2016; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2017;
2018; AVX1-LABEL: insertps_7:
2019; AVX1:       ## %bb.0:
2020; AVX1-NEXT:    vinsertps $106, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a]
2021; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2022; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2023;
2024; AVX512-LABEL: insertps_7:
2025; AVX512:       ## %bb.0:
2026; AVX512-NEXT:    vinsertps $106, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a]
2027; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2028; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2029  %vecext = extractelement <4 x float> %A, i32 0
2030  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
2031  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
2032  %vecext2 = extractelement <4 x float> %B, i32 1
2033  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
2034  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
2035  ret <4 x float> %vecinit4
2036}
2037
2038define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
2039; SSE-LABEL: insertps_8:
2040; SSE:       ## %bb.0:
2041; SSE-NEXT:    insertps $28, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x1c]
2042; SSE-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
2043; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2044;
2045; AVX1-LABEL: insertps_8:
2046; AVX1:       ## %bb.0:
2047; AVX1-NEXT:    vinsertps $28, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c]
2048; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
2049; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2050;
2051; AVX512-LABEL: insertps_8:
2052; AVX512:       ## %bb.0:
2053; AVX512-NEXT:    vinsertps $28, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c]
2054; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
2055; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2056  %vecext = extractelement <4 x float> %A, i32 0
2057  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
2058  %vecext1 = extractelement <4 x float> %B, i32 0
2059  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
2060  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
2061  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
2062  ret <4 x float> %vecinit4
2063}
2064
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: insertps_9:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $25, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xc8,0x19]
; SSE-NEXT:    ## xmm1 = zero,xmm0[0],xmm1[2],zero
; SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_9:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $25, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19]
; AVX1-NEXT:    ## xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_9:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $25, %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19]
; AVX512-NEXT:    ## xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

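; Splat A[0] into elts 0 and 2 with elts 1 and 3 zeroed: a single insertps
; (imm 0x2a, zmask 0b1010) performs the duplicate and the zeroing at once.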
define <4 x float> @insertps_10(<4 x float> %A) {
; SSE-LABEL: insertps_10:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $42, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x2a]
; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_10:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $42, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_10:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $42, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
}

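; A build_vector of zeros fed into a shufflevector, selecting
; (0.0, A[1], 0.0, A[3]): this lowers to xorps plus a single blendps against
; the zeroed register instead of a chain of insertps.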
define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; SSE-LABEL: build_vector_to_shuffle_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $5, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x05]
; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: build_vector_to_shuffle_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
}

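; Same pattern without the trailing shuffle, producing (0.0, A[1], 0.0, 0.0):
; again one blendps against a zeroed register, with only the mask changing.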
define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; SSE-LABEL: build_vector_to_shuffle_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $13, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x0d]
; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: build_vector_to_shuffle_2:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_2:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
}
