; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512

; Tests for SSE2 and below, without SSE3+.

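; Insert a scalar argument into the low double of a loaded vector; lowers
; to movlps (i386), shufpd (x86-64 SSE) or blendps (AVX).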
define void @test1(ptr %r, ptr %A, double %B) nounwind  {
; X86-SSE-LABEL: test1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test1:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, ptr %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
  store <2 x double> %tmp9, ptr %r, align 16
  ret void
}

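; Insert a scalar argument into the high double of a loaded vector; lowers
; to movhps (i386) or movlhps (x86-64).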
define void @test2(ptr %r, ptr %A, double %B) nounwind  {
; X86-SSE-LABEL: test2:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test2:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test2:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movaps %xmm1, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test2:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm1
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, ptr %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
  store <2 x double> %tmp9, ptr %r, align 16
  ret void
}


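; Interleave the low two floats of two loaded vectors; a single unpcklps.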
define void @test3(ptr %res, ptr %A, ptr %B) nounwind {
; X86-SSE-LABEL: test3:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test3:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test3:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test3:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, ptr %B		; <<4 x float>> [#uses=2]
  %tmp3 = load <4 x float>, ptr %A		; <<4 x float>> [#uses=2]
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0		; <float> [#uses=1]
  %tmp7 = extractelement <4 x float> %tmp, i32 0		; <float> [#uses=1]
  %tmp8 = extractelement <4 x float> %tmp3, i32 1		; <float> [#uses=1]
  %tmp9 = extractelement <4 x float> %tmp, i32 1		; <float> [#uses=1]
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0		; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1		; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2		; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3		; <<4 x float>> [#uses=1]
  store <4 x float> %tmp13, ptr %res
  ret void
}

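; A shuffle whose second operand is undef folds to a single shufps.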
define void @test4(<4 x float> %X, ptr %res) nounwind {
; X86-SSE-LABEL: test4:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test4:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test4:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test4:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
  store <4 x float> %tmp5, ptr %res
  ret void
}

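; Byte/word interleaves with zero on a scalar float load; SSE2 keeps the
; punpcklbw+punpcklwd pair, AVX folds the byte interleave into pmovzxbw.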
define <4 x i32> @test5(ptr %ptr) nounwind {
; X86-SSE-LABEL: test5:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl (%eax), %eax
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm0, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test5:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl (%eax), %eax
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test5:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq (%rdi), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm0, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test5:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movq (%rdi), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-AVX-NEXT:    retq
  %tmp = load ptr, ptr %ptr		; <ptr> [#uses=1]
  %tmp.upgrd.2 = load float, ptr %tmp		; <float> [#uses=1]
  %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0		; <<4 x float>> [#uses=1]
  %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
  %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>		; <<16 x i8>> [#uses=1]
  %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >		; <<16 x i8>> [#uses=1]
  %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>		; <<8 x i16>> [#uses=1]
  %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >		; <<8 x i16>> [#uses=1]
  %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>		; <<4 x i32>> [#uses=1]
  ret <4 x i32> %tmp36
}

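; A shuffle whose non-lane-0 results are undef is a plain vector copy.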
define void @test6(ptr %res, ptr %A) nounwind {
; X86-SSE-LABEL: test6:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test6:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test6:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test6:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp1 = load <4 x float>, ptr %A            ; <<4 x float>> [#uses=1]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp2, ptr %res
  ret void
}

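; Store of a zero splat to a null address; just xorps + movaps to absolute 0.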
define void @test7() nounwind {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, 0
; AVX-NEXT:    ret{{[l|q]}}
  bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
  store <4 x float> %2, ptr null
  ret void
}

@x = external dso_local global [4 x i32]

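; Four consecutive scalar loads from @x merge into one unaligned vector load.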
define <2 x i64> @test8() nounwind {
; X86-SSE-LABEL: test8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups x, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups x, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups x(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups x(%rip), %xmm0
; X64-AVX-NEXT:    retq
  %tmp = load i32, ptr @x		; <i32> [#uses=1]
  %tmp3 = load i32, ptr getelementptr ([4 x i32], ptr @x, i32 0, i32 1)		; <i32> [#uses=1]
  %tmp5 = load i32, ptr getelementptr ([4 x i32], ptr @x, i32 0, i32 2)		; <i32> [#uses=1]
  %tmp7 = load i32, ptr getelementptr ([4 x i32], ptr @x, i32 0, i32 3)		; <i32> [#uses=1]
  %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0		; <<4 x i32>> [#uses=1]
  %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
  %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2		; <<4 x i32>> [#uses=1]
  %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3		; <<4 x i32>> [#uses=1]
  %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp16
}

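; Build a vector from scalar FP arguments: a single movups from the stack
; on i386; unpcklps/movlhps (SSE) or insertps (AVX) on x86-64, where the
; arguments arrive in XMM registers.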
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test9:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test9:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test9:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test9:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

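; Same as test9, without the leading dummy argument.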
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test10:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test10:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test10:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test10:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

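; Build a <2 x double> from two scalars; movlhps on x86-64, a stack movups on i386.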
define <2 x double> @test11(double %a, double %b) nounwind {
; X86-SSE-LABEL: test11:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test11:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test11:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test11:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <2 x double> undef, double %a, i32 0		; <<2 x double>> [#uses=1]
  %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1		; <<2 x double>> [#uses=1]
  ret <2 x double> %tmp7
}

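; Blend the loaded vector with a constant (movsd/blendps), extract its high
; half against zero (movhlps/unpckhpd), then add.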
define void @test12() nounwind {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd 0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    addps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps 0, %xmm0
; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, 0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, 0
; AVX512-NEXT:    ret{{[l|q]}}
  %tmp1 = load <4 x float>, ptr null          ; <<4 x float>> [#uses=2]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
  %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp4, ptr null
  ret void
}

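; Cross-vector shuffle <1,4,1,5>; lowered as two shufps.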
define void @test13(ptr %res, ptr %A, ptr %B, ptr %C) nounwind {
; X86-SSE-LABEL: test13:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test13:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test13:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdx), %xmm0
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test13:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdx), %xmm0
; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <4 x float>, ptr %B            ; <<4 x float>> [#uses=1]
  %tmp5 = load <4 x float>, ptr %C            ; <<4 x float>> [#uses=1]
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp11, ptr %res
  ret void
}

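; Concatenate the low halves of an add and a sub; a single movlhps.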
define <4 x float> @test14(ptr %x, ptr %y) nounwind {
; X86-SSE-LABEL: test14:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm1
; X86-SSE-NEXT:    movaps (%eax), %xmm2
; X86-SSE-NEXT:    movaps %xmm2, %xmm0
; X86-SSE-NEXT:    addps %xmm1, %xmm0
; X86-SSE-NEXT:    subps %xmm1, %xmm2
; X86-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test14:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps (%eax), %xmm1
; X86-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test14:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movaps (%rdi), %xmm2
; X64-SSE-NEXT:    movaps %xmm2, %xmm0
; X64-SSE-NEXT:    addps %xmm1, %xmm0
; X64-SSE-NEXT:    subps %xmm1, %xmm2
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test14:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm1
; X64-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, ptr %y             ; <<4 x float>> [#uses=2]
  %tmp5 = load <4 x float>, ptr %x            ; <<4 x float>> [#uses=2]
  %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
  %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp27
}

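; Take the high halves of both loaded vectors; a single unpckhpd.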
define <4 x float> @test15(ptr %x, ptr %y) nounwind {
; X86-SSE-LABEL: test15:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test15:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test15:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test15:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-AVX-NEXT:    retq
entry:
  %tmp = load <4 x float>, ptr %y             ; <<4 x float>> [#uses=1]
  %tmp3 = load <4 x float>, ptr %x            ; <<4 x float>> [#uses=1]
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp4
}

; PR8900
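; Extract the even doubles of a 256-bit load using only 128-bit operations.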
define  <2 x double> @test16(ptr nocapture %srcA, ptr nocapture %dst) {
; X86-SSE-LABEL: test16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps 96(%eax), %xmm0
; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps 96(%eax), %xmm0
; X86-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps 96(%rdi), %xmm0
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps 96(%rdi), %xmm0
; X64-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-AVX-NEXT:    retq
  %i5 = getelementptr inbounds <4 x double>, ptr %srcA, i32 3
  %i6 = load <4 x double>, ptr %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
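; Constant store whose low lanes are undef; SSE keeps the undef lanes in the
; constant pool, AVX broadcasts the splat value.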
define fastcc void @test17() nounwind {
; X86-SSE-LABEL: test17:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [u,u,32768,32768]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test17:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test17:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = [u,u,32768,32768]
; X64-SSE-NEXT:    movaps %xmm0, (%rax)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test17:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX-NEXT:    retq
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, ptr undef
  ret void
}

; PR9210
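; fptrunc <4 x double> to <4 x float>: two cvtpd2ps + unpcklpd with SSE,
; a single 256-bit vcvtpd2ps with AVX.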
define <4 x float> @f(<4 x double>) nounwind {
; SSE-LABEL: f:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    cvtpd2ps %xmm1, %xmm1
; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vcvtpd2ps %ymm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    ret{{[l|q]}}
entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
}

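; Shuffling in a zero upper element is a movq, which zeroes the high 64 bits.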
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; SSE-LABEL: test_insert_64_zext:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_insert_64_zext:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %1
}

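; Clearing the low 32 bits through an i128 bitcast; andps with a constant
; mask on SSE, xor + blend on AVX.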
define <4 x i32> @PR19721(<4 x i32> %i) {
; X86-SSE-LABEL: PR19721:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; AVX-LABEL: PR19721:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
;
; X64-SSE-LABEL: PR19721:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
  %bc = bitcast <4 x i32> %i to i128
  %insert = and i128 %bc, -4294967296
  %bc2 = bitcast i128 %insert to <4 x i32>
  ret <4 x i32> %bc2
}

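; v4i32 multiply: SSE2 has no pmulld, so it is expanded into two pmuludq on
; the even/odd lanes plus shuffles; AVX uses vpmulld directly.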
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: test_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %m = mul <4 x i32> %x, %y
  ret <4 x i32> %m
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; X64-AVX1: {{.*}}
; X64-AVX512: {{.*}}
; X86-AVX1: {{.*}}
; X86-AVX512: {{.*}}