; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
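; For reference, each test below mirrors the IR clang emits for a small
; intrinsic wrapper in that file, roughly of the form (C, illustrative only):
;   __m256d test_mm256_add_pd(__m256d A, __m256d B) { return _mm256_add_pd(A, B); }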

define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = and <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = and <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

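; Note: AVX1 has no 256-bit integer NOT, so in the andnot tests below the
; all-ones constant is materialized with vcmptrueps and applied with vxorps
; before the final vandps.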
define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %3, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <8 x i32> %3, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_broadcast_pd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_pd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <2 x double>, ptr %a0
  %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_broadcast_ps(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ps:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <4 x float>, ptr %a0
  %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %res
}

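; Note: the scalar load plus full insertelement splat chains below are
; expected to fold into a single vbroadcastsd/vbroadcastss from memory.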
define <4 x double> @test_mm256_broadcast_sd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_sd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_sd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load double, ptr %a0
  %ins0 = insertelement <4 x double> undef, double %ld, i32 0
  %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
  %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
  %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
  ret <4 x double> %ins3
}

define <4 x float> @test_mm_broadcast_ss(ptr %a0) nounwind {
; X86-LABEL: test_mm_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
  %ld = load float, ptr %a0
  %ins0 = insertelement <4 x float> undef, float %ld, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
  ret <4 x float> %ins3
}

define <8 x float> @test_mm256_broadcast_ss(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load float, ptr %a0
  %ins0 = insertelement <8 x float> undef, float %ld, i32 0
  %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
  %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
  %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
  %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
  %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
  ret <8 x float> %ins7
}

define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <8 x float>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <4 x i64>
  ret <4 x i64> %res
}

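; Note: the "# kill" annotations in the cast tests below are register
; liveness comments, not instructions; the 128-bit value is implicitly
; widened to the ymm register and the upper 128 bits are left undefined.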
define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd128_pd256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %a1 = freeze <2 x double> poison
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd256_pd128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_castps128_ps256_freeze(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps128_ps256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %a1 = freeze <4 x float> poison
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps256_ps128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_castsi128_si256_freeze(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi128_si256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %a1 = freeze <2 x i64> poison
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_si128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = sitofp <4 x i32> %arg0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = sitofp <8 x i32> %arg0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fpext <4 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_dp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrb $15, %xmm0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %ext = extractelement <32 x i8> %arg0, i32 31
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
; CHECK-NEXT:    movzwl %ax, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %ext = extractelement <16 x i16> %arg0, i32 11
  %res = zext i16 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vextractps $1, %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = extractelement <8 x i32> %arg0, i32 5
  ret i32 %res
}

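; Note: on the 32-bit target an i64 is returned in edx:eax, so the 64-bit
; extract below is split into two 32-bit vextractps.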
define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
; X86-LABEL: test_mm256_extract_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vextractps $2, %xmm0, %eax
; X86-NEXT:    vextractps $3, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = extractelement <4 x i64> %a0, i32 3
  ret i64 %res
}

define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %res
}

define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x float> %res
}

define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
  ret <8 x float> %res
}

define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi8:
; X64:       # %bb.0:
; X64-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrw $6, %edi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

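; Note: vpinsrq is unavailable in 32-bit mode, so the X86 version of the
; 64-bit insert below is performed as two 32-bit vpinsrd operations.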
define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_lddqu_si256(ptr %a0) nounwind {
; X86-LABEL: test_mm256_lddqu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vlddqu (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_lddqu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vlddqu (%rdi), %ymm0
; X64-NEXT:    retq
  %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr %a0)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readnone

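; Note: the align 32 loads below select aligned vmovaps, while the align 1
; (loadu) variants select unaligned vmovups, regardless of element type.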
define <4 x double> @test_mm256_load_pd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_load_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x double>, ptr %a0, align 32
  ret <4 x double> %res
}

define <8 x float> @test_mm256_load_ps(ptr %a0) nounwind {
; X86-LABEL: test_mm256_load_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <8 x float>, ptr %a0, align 32
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_load_si256(ptr %a0) nounwind {
; X86-LABEL: test_mm256_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, ptr %a0, align 32
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_loadu_pd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_loadu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x double>, ptr %a0, align 1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_loadu_ps(ptr %a0) nounwind {
; X86-LABEL: test_mm256_loadu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <8 x float>, ptr %a0, align 1
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_loadu_si256(ptr %a0) nounwind {
; X86-LABEL: test_mm256_loadu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, ptr %a0, align 1
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_loadu2_m128(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %hi4 = load <4 x float>, ptr %a0, align 1
  %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %lo4 = load <4 x float>, ptr %a1, align 1
  %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_loadu2_m128d(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %hi2 = load <2 x double>, ptr %a0, align 1
  %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %lo2 = load <2 x double>, ptr %a1, align 1
  %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_loadu2_m128i(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %hi2 = load <2 x i64>, ptr %a0, align 1
  %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %lo2 = load <2 x i64>, ptr %a1, align 1
  %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

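; Note: for the maskload/maskstore tests below, only the sign bit of each
; mask element determines whether the corresponding lane is loaded or stored.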
define <2 x double> @test_mm_maskload_pd(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx.maskload.pd(ptr %a0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readnone

define <4 x double> @test_mm256_maskload_pd(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind readnone

define <4 x float> @test_mm_maskload_ps(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readnone

define <8 x float> @test_mm256_maskload_ps(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_pd(ptr %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %a1, <2 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind readnone

define void @test_mm256_maskstore_pd(ptr %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %a1, <4 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwind readnone

define void @test_mm_maskstore_ps(ptr %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %arg1, <4 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind readnone

define void @test_mm256_maskstore_ps(ptr %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %arg1, <8 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_max_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_max_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_min_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_min_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_movedup_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovmskpd %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovmskps %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_mul_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fmul <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_mul_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fmul <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = or <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = or <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %res
}

define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test2_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

; PR26667
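; Both shuffle operands below are %a1 and the mask selects an identity
; permutation of it, so the permute is expected to fold to a register copy.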
1313define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1314; CHECK-LABEL: test_mm256_permute2f128_ps:
1315; CHECK:       # %bb.0:
1316; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1317; CHECK-NEXT:    ret{{[l|q]}}
1318  %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
1319  ret <8 x float> %res
1320}
1321declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
1322
1323define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1324; CHECK-LABEL: test_mm256_permute2f128_si256:
1325; CHECK:       # %bb.0:
1326; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
1327; CHECK-NEXT:    ret{{[l|q]}}
1328  %1 = bitcast <4 x i64> %a0 to <8 x i32>
1329  %2 = bitcast <4 x i64> %a1 to <8 x i32>
1330  %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
1331  %bc = bitcast <8 x i32> %res to <4 x i64>
1332  ret <4 x i64> %bc
1333}
1334declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
1335
1336define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
1337; CHECK-LABEL: test_mm_permutevar_pd:
1338; CHECK:       # %bb.0:
1339; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1340; CHECK-NEXT:    ret{{[l|q]}}
1341  %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
1342  ret <2 x double> %res
1343}
1344declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
1345
1346define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
1347; CHECK-LABEL: test_mm256_permutevar_pd:
1348; CHECK:       # %bb.0:
1349; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
1350; CHECK-NEXT:    ret{{[l|q]}}
1351  %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
1352  ret <4 x double> %res
1353}
1354declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
1355
1356define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
1357; CHECK-LABEL: test_mm_permutevar_ps:
1358; CHECK:       # %bb.0:
1359; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1360; CHECK-NEXT:    ret{{[l|q]}}
1361  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1362  %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
1363  ret <4 x float> %res
1364}
1365declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
1366
1367define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
1368; CHECK-LABEL: test_mm256_permutevar_ps:
1369; CHECK:       # %bb.0:
1370; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
1371; CHECK-NEXT:    ret{{[l|q]}}
1372  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1373  %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
1374  ret <8 x float> %res
1375}
1376declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
1377
1378define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
1379; CHECK-LABEL: test_mm256_rcp_ps:
1380; CHECK:       # %bb.0:
1381; CHECK-NEXT:    vrcpps %ymm0, %ymm0
1382; CHECK-NEXT:    ret{{[l|q]}}
1383  %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
1384  ret <8 x float> %res
1385}
1386declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
1387
define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_rsqrt_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

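; The _mm256_set_* tests below build the vector from the last argument up:
; %a31 (or %a15/%a7/%a3) lands in element 0, matching the
; highest-element-first argument order of the set intrinsics. On X64 this is
; why the register-passed leading arguments feed the final vpinsr* slots.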
define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_set_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $11, %r8d, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $14, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $15, %edi, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a31, i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a30, i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a29, i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a28, i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a27, i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a26, i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a25, i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a24, i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a23, i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a22, i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a21, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_set_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi16:
; X64:       # %bb.0:
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a15, i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a14, i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a13, i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a12, i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a11, i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a10, i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a9 , i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a8 , i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a7 , i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a6 , i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a5 , i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_set_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %ecx, %xmm0
; X64-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
; X64-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %r8d, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_set_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %rcx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}

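; In the set_m128* tests, %a1 supplies the low half and %a0 the high half, so
; the shufflevector lists %a1's elements first. The '# kill' line below is
; register-allocator bookkeeping noting that %xmm1 is implicitly widened to
; %ymm1 before the vinsertf128.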
define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128i:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_set_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a3, i32 0
  %res1 = insertelement <4 x double> %res0, double %a2, i32 1
  %res2 = insertelement <4 x double> %res1, double %a1, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_set_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a7, i32 0
  %res1 = insertelement <8 x float> %res0, float %a6, i32 1
  %res2 = insertelement <8 x float> %res1, float %a5, i32 2
  %res3 = insertelement <8 x float> %res2, float %a4, i32 3
  %res4 = insertelement <8 x float> %res3, float %a3, i32 4
  %res5 = insertelement <8 x float> %res4, float %a2, i32 5
  %res6 = insertelement <8 x float> %res5, float %a1, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}

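; The set1 tests broadcast one scalar to every element. Without AVX2 there is
; no vpbroadcastb/w, so the splat is done in an xmm register (vpshufb with a
; zero mask, or vpshuflw+vpshufd) and then mirrored into the high lane with
; vinsertf128.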
define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi8:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a0, i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a0, i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a0, i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a0, i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a0, i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a0, i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a0, i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a0, i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a0, i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a0, i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a0, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a0, i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a0, i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a0, i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a0, i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a0, i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a0, i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a0, i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a0, i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a0, i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a0, i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a0, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}

define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
; X86-LABEL: test_mm256_set1_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a0, i32 1
  %res2 = insertelement <4 x double> %res1, double %a0, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
; X86-LABEL: test_mm256_set1_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_ps:
; X64:       # %bb.0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a0, i32 1
  %res2 = insertelement <8 x float> %res1, float %a0, i32 2
  %res3 = insertelement <8 x float> %res2, float %a0, i32 3
  %res4 = insertelement <8 x float> %res3, float %a0, i32 4
  %res5 = insertelement <8 x float> %res4, float %a0, i32 5
  %res6 = insertelement <8 x float> %res5, float %a0, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}

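; The _mm256_setr_* (reversed) tests mirror the set tests above but insert the
; arguments in source order: %a0 goes into element 0, so on X64 the register
; arguments feed the low xmm's first vpinsr* slots instead of the last.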
define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_setr_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrb $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $4, %r8d, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $5, %r9d, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a0 , i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a1 , i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a2 , i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a3 , i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a4 , i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a5 , i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a6 , i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a7 , i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a8 , i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a9 , i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a10, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_setr_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi16:
; X64:       # %bb.0:
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a0 , i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a1 , i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a2 , i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a3 , i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a4 , i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a5 , i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a6 , i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a7 , i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a8 , i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a9 , i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a10, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_setr_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %r8d, %xmm0
; X64-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_setr_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rcx, %xmm0
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vmovq %rdi, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
  ret <4 x i64> %res3
}

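; In the setr_m128* tests the concatenation order is a0 (low), a1 (high), so
; %a0 is already in the low half and a single vinsertf128 $1 of %xmm1 over
; %ymm0 should be all that is needed.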
define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128i:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_setr_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a1, i32 1
  %res2 = insertelement <4 x double> %res1, double %a2, i32 2
  %res3 = insertelement <4 x double> %res2, double %a3, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_setr_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a1, i32 1
  %res2 = insertelement <8 x float> %res1, float %a2, i32 2
  %res3 = insertelement <8 x float> %res2, float %a3, i32 3
  %res4 = insertelement <8 x float> %res3, float %a4, i32 4
  %res5 = insertelement <8 x float> %res4, float %a5, i32 5
  %res6 = insertelement <8 x float> %res5, float %a6, i32 6
  %res7 = insertelement <8 x float> %res6, float %a7, i32 7
  ret <8 x float> %res7
}

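; All three setzero tests return zeroinitializer; xor-zeroing the xmm register
; is sufficient because VEX-encoded writes to an xmm register zero the upper
; 128 bits of the corresponding ymm register.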
define <4 x double> @test_mm256_setzero_pd() nounwind {
; CHECK-LABEL: test_mm256_setzero_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> zeroinitializer
}

define <8 x float> @test_mm256_setzero_ps() nounwind {
; CHECK-LABEL: test_mm256_setzero_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> zeroinitializer
}

define <4 x i64> @test_mm256_setzero_si256() nounwind {
; CHECK-LABEL: test_mm256_setzero_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> zeroinitializer
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
  ret <8 x float> %res
}

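; Unlike most tests in this file, the sqrt tests call the generic llvm.sqrt.*
; intrinsics, presumably matching what clang now emits for _mm256_sqrt_pd and
; _mm256_sqrt_ps, rather than an x86-specific intrinsic.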
define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
  ret <4 x double> %0
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1

define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
  ret <8 x float> %0
}

declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1

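; The store tests differ from the storeu tests that follow only in alignment:
; `align 32` permits the aligned vmovaps form, while `align 1` should force
; unaligned vmovups. A vzeroupper is expected before every return from a
; function that has touched ymm state.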
2518define void @test_mm256_store_pd(ptr %a0, <4 x double> %a1) nounwind {
2519; X86-LABEL: test_mm256_store_pd:
2520; X86:       # %bb.0:
2521; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2522; X86-NEXT:    vmovaps %ymm0, (%eax)
2523; X86-NEXT:    vzeroupper
2524; X86-NEXT:    retl
2525;
2526; X64-LABEL: test_mm256_store_pd:
2527; X64:       # %bb.0:
2528; X64-NEXT:    vmovaps %ymm0, (%rdi)
2529; X64-NEXT:    vzeroupper
2530; X64-NEXT:    retq
2531  store <4 x double> %a1, ptr %a0, align 32
2532  ret void
2533}
2534
2535define void @test_mm256_store_ps(ptr %a0, <8 x float> %a1) nounwind {
2536; X86-LABEL: test_mm256_store_ps:
2537; X86:       # %bb.0:
2538; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2539; X86-NEXT:    vmovaps %ymm0, (%eax)
2540; X86-NEXT:    vzeroupper
2541; X86-NEXT:    retl
2542;
2543; X64-LABEL: test_mm256_store_ps:
2544; X64:       # %bb.0:
2545; X64-NEXT:    vmovaps %ymm0, (%rdi)
2546; X64-NEXT:    vzeroupper
2547; X64-NEXT:    retq
2548  store <8 x float> %a1, ptr %a0, align 32
2549  ret void
2550}
2551
2552define void @test_mm256_store_si256(ptr %a0, <4 x i64> %a1) nounwind {
2553; X86-LABEL: test_mm256_store_si256:
2554; X86:       # %bb.0:
2555; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2556; X86-NEXT:    vmovaps %ymm0, (%eax)
2557; X86-NEXT:    vzeroupper
2558; X86-NEXT:    retl
2559;
2560; X64-LABEL: test_mm256_store_si256:
2561; X64:       # %bb.0:
2562; X64-NEXT:    vmovaps %ymm0, (%rdi)
2563; X64-NEXT:    vzeroupper
2564; X64-NEXT:    retq
2565  store <4 x i64> %a1, ptr %a0, align 32
2566  ret void
2567}
2568
define void @test_mm256_storeu_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 1
  ret void
}

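; The storeu2 tests store each 128-bit half of a 256-bit value to an
; independent, possibly unaligned address: a shufflevector extracts the halves
; in IR, and vextractf128 plus two vmovups appear in the assembly. A hedged C
; sketch (hi/lo are hypothetical pointers; argument order per the usual
; intrinsic signature):
;   _mm256_storeu2_m128(hi, lo, v);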
define void @test_mm256_storeu2_m128(ptr %a0, ptr %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %lo, ptr %a0, align 1
  %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_storeu2_m128d(ptr %a0, ptr %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x double> %lo, ptr %a0, align 1
  %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x double> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_storeu2_m128i(ptr %a0, ptr %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x i64> %lo, ptr %a0, align 1
  %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x i64> %hi, ptr %a1, align 1
  ret void
}

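; The stream tests are ordinary aligned stores tagged with the !nontemporal
; metadata defined at the bottom of this file, and should select vmovntps.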
define void @test_mm256_stream_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_stream_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_stream_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_stream_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <8 x float> %a0, %a1
  ret <8 x float> %res
}

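; The testc/testnzc/testz tests check how the EFLAGS results of
; vtestpd/vtestps/vptest are materialized into an i32: testc reads CF (setb),
; testz reads ZF (sete), and testnzc is true only when both CF and ZF are
; clear (seta). E.g. in C, _mm256_testz_si256(a, a) returns 1 iff a is all
; zeroes.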
define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone

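; The _mm256_undefined_* tests return undef, so nothing beyond the return
; should be emitted.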
define <4 x double> @test_mm256_undefined_pd() nounwind {
; CHECK-LABEL: test_mm256_undefined_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> undef
}

define <8 x float> @test_mm256_undefined_ps() nounwind {
; CHECK-LABEL: test_mm256_undefined_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> undef
}

define <4 x i64> @test_mm256_undefined_si256() nounwind {
; CHECK-LABEL: test_mm256_undefined_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> undef
}

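; As the shuffle masks below show, the 256-bit unpack operations interleave
; elements within each 128-bit lane rather than across the whole vector.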
define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ret <8 x float> %res
}

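; Floating-point bitwise logic is expressed as an integer xor on bitcast
; values; the backend is free to stay in the float domain and emit vxorps.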
define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = xor <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = xor <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

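; vzeroall/vzeroupper come directly from their intrinsics here (distinct from
; the vzeroupper instructions inserted automatically before returns above).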
define void @test_mm256_zeroall() nounwind {
; CHECK-LABEL: test_mm256_zeroall:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroall
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroall()
  ret void
}
declare void @llvm.x86.avx.vzeroall() nounwind readnone

define void @test_mm256_zeroupper() nounwind {
; CHECK-LABEL: test_mm256_zeroupper:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroupper()
  ret void
}
declare void @llvm.x86.avx.vzeroupper() nounwind readnone

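; The 128-to-256-bit zero-extends rely on VEX semantics: writing an xmm
; register implicitly zeroes the upper ymm bits, so a bare
; vmovaps %xmm0, %xmm0 is enough.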
define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}

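; Non-temporal hint metadata referenced by the stream store tests above.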
!0 = !{i32 1}