; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c

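; Most tests below follow the same shape as the clang-generated IR: the
; __m256i argument (<4 x i64>) is bitcast to the element type the intrinsic
; operates on, the operation is applied, and the result is bitcast back to
; <4 x i64>. As an assumed C-level reference (mirroring
; clang/test/CodeGen/avx2-builtins.c), the first test corresponds to:
;   __m256i test_mm256_abs_epi8(__m256i a) { return _mm256_abs_epi8(a); }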
define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false)
  %res = bitcast <32 x i8> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
  %res = bitcast <16 x i16> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false)
  %res = bitcast <8 x i32> %abs to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)

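; The alignr tests express _mm256_alignr_epi8's per-128-bit-lane byte
; concatenation directly as a shufflevector mask, which should be matched
; back to a single vpalignr.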
define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

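; There is no single IR operation for andnot, so the test builds it as an
; xor with all-ones followed by an and; under fast-isel this stays as
; vpcmpeqd (materializing the all-ones constant) + vpxor + vpand rather
; than folding to vpandn.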
define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

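; For the 128-bit i64/f64 splats below the backend prefers vmovddup,
; while the 256-bit variants select vbroadcastsd.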
define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

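; When the 128-bit source is loaded from memory, the same subvector splat
; folds into a single vbroadcastf128 instead of vinsertf128.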
define <4 x i64> @test_mm256_broadcastsi128_si256_mem(ptr %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, ptr %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

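; _mm256_bslli_epi128 and _mm256_bsrli_epi128 shift bytes within each
; 128-bit lane; the tests model them as shuffles against a zero vector,
; which should select vpslldq/vpsrldq.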
define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

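; The cvtepi* tests use plain sext (preceded by a shufflevector when only
; a low subvector is widened) and should select the matching vpmovsx*
; instruction.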
define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

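; The cvtepu* tests mirror the sext tests above with zext and should
; select the matching vpmovzx* instruction.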
define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone

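; The gather tests call the x86-specific gather intrinsics directly. The
; non-masked forms pass an undef passthrough and an all-ones mask (which
; the backend materializes with vpcmpeqd, or a vcmpeqpd/vcmpeqps self
; compare for the 256-bit fp forms), and every test uses a scale of 2.
; As an assumed C-level reference, the first test corresponds to:
;   __m128i test_mm_i32gather_epi32(int const *b, __m128i i) {
;     return _mm_i32gather_epi32(b, i, 2);
;   }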
define <2 x i64> @test_mm_i32gather_epi32(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, ptr %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr, <4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, ptr %a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, ptr %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <4 x i64> @test_mm256_i32gather_epi32(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, ptr %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr, <8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, ptr %a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, ptr %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}

define <2 x i64> @test_mm_i32gather_epi64(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, ptr %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, ptr, <4 x i32>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, ptr %a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_i32gather_epi64(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, ptr %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr, <4 x i32>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, ptr %a1, <2 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %res
}

define <2 x double> @test_mm_i32gather_pd(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, ptr %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, ptr, <4 x i32>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, ptr %a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, ptr %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
  ret <2 x double> %res
}

define <4 x double> @test_mm256_i32gather_pd(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, ptr %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, ptr, <4 x i32>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, ptr %a1, <2 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, ptr %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
  ret <4 x double> %res
}

define <4 x float> @test_mm_i32gather_ps(ptr %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, ptr %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, ptr, <4 x i32>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, ptr %a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, ptr %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <8 x float> @test_mm256_i32gather_ps(ptr %a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast ptr %a0 to ptr
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
  ret <8 x float> %call
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr, <8 x i32>, <8 x float>, i8) nounwind readonly

define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, ptr %a1, <4 x i64> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_i32gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i32gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast ptr %a1 to ptr
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, ptr %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
  ret <8 x float> %call
}

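; Qword-indexed gathers that return a 128-bit result (the epi32 variants
; here) end with vzeroupper when the index vector is 256-bit.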
1093define <2 x i64> @test_mm_i64gather_epi32(ptr%a0, <2 x i64> %a1) {
1094; X86-LABEL: test_mm_i64gather_epi32:
1095; X86:       # %bb.0:
1096; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1097; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1098; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1099; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1100; X86-NEXT:    vmovdqa %xmm1, %xmm0
1101; X86-NEXT:    retl
1102;
1103; X64-LABEL: test_mm_i64gather_epi32:
1104; X64:       # %bb.0:
1105; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1106; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1107; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1108; X64-NEXT:    vmovdqa %xmm1, %xmm0
1109; X64-NEXT:    retq
1110  %arg0 = bitcast ptr%a0 to ptr
1111  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1112  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, ptr %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1113  %bc = bitcast <4 x i32> %call to <2 x i64>
1114  ret <2 x i64> %bc
1115}
1116declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, ptr, <2 x i64>, <4 x i32>, i8) nounwind readonly
1117
1118define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) {
1119; X86-LABEL: test_mm_mask_i64gather_epi32:
1120; X86:       # %bb.0:
1121; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1122; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1123; X86-NEXT:    retl
1124;
1125; X64-LABEL: test_mm_mask_i64gather_epi32:
1126; X64:       # %bb.0:
1127; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1128; X64-NEXT:    retq
1129  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1130  %arg1 = bitcast ptr%a1 to ptr
1131  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1132  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, ptr %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1133  %bc = bitcast <4 x i32> %call to <2 x i64>
1134  ret <2 x i64> %bc
1135}
1136
1137define <2 x i64> @test_mm256_i64gather_epi32(ptr%a0, <4 x i64> %a1) {
1138; X86-LABEL: test_mm256_i64gather_epi32:
1139; X86:       # %bb.0:
1140; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1141; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1142; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1143; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1144; X86-NEXT:    vmovdqa %xmm1, %xmm0
1145; X86-NEXT:    vzeroupper
1146; X86-NEXT:    retl
1147;
1148; X64-LABEL: test_mm256_i64gather_epi32:
1149; X64:       # %bb.0:
1150; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1151; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1152; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1153; X64-NEXT:    vmovdqa %xmm1, %xmm0
1154; X64-NEXT:    vzeroupper
1155; X64-NEXT:    retq
1156  %arg0 = bitcast ptr%a0 to ptr
1157  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1158  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, ptr %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1159  %bc = bitcast <4 x i32> %call to <2 x i64>
1160  ret <2 x i64> %bc
1161}
1162declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr, <4 x i64>, <4 x i32>, i8) nounwind readonly
1163
1164define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, ptr%a1, <4 x i64> %a2, <2 x i64> %a3) {
1165; X86-LABEL: test_mm256_mask_i64gather_epi32:
1166; X86:       # %bb.0:
1167; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1168; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1169; X86-NEXT:    vzeroupper
1170; X86-NEXT:    retl
1171;
1172; X64-LABEL: test_mm256_mask_i64gather_epi32:
1173; X64:       # %bb.0:
1174; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1175; X64-NEXT:    vzeroupper
1176; X64-NEXT:    retq
1177  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1178  %arg1 = bitcast ptr%a1 to ptr
1179  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1180  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, ptr %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1181  %bc = bitcast <4 x i32> %call to <2 x i64>
1182  ret <2 x i64> %bc
1183}
1184
1185define <2 x i64> @test_mm_i64gather_epi64(ptr%a0, <2 x i64> %a1) {
1186; X86-LABEL: test_mm_i64gather_epi64:
1187; X86:       # %bb.0:
1188; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1189; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1190; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1191; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1192; X86-NEXT:    vmovdqa %xmm1, %xmm0
1193; X86-NEXT:    retl
1194;
1195; X64-LABEL: test_mm_i64gather_epi64:
1196; X64:       # %bb.0:
1197; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1198; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1199; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1200; X64-NEXT:    vmovdqa %xmm1, %xmm0
1201; X64-NEXT:    retq
1202  %arg0 = bitcast ptr%a0 to ptr
1203  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, ptr %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1204  ret <2 x i64> %call
1205}
1206declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, ptr, <2 x i64>, <2 x i64>, i8) nounwind readonly
1207
1208define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) {
1209; X86-LABEL: test_mm_mask_i64gather_epi64:
1210; X86:       # %bb.0:
1211; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1212; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1213; X86-NEXT:    retl
1214;
1215; X64-LABEL: test_mm_mask_i64gather_epi64:
1216; X64:       # %bb.0:
1217; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1218; X64-NEXT:    retq
1219  %arg1 = bitcast ptr%a1 to ptr
1220  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, ptr %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1221  ret <2 x i64> %call
1222}
1223
1224define <4 x i64> @test_mm256_i64gather_epi64(ptr%a0, <4 x i64> %a1) {
1225; X86-LABEL: test_mm256_i64gather_epi64:
1226; X86:       # %bb.0:
1227; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1228; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1229; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1230; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1231; X86-NEXT:    vmovdqa %ymm1, %ymm0
1232; X86-NEXT:    retl
1233;
1234; X64-LABEL: test_mm256_i64gather_epi64:
1235; X64:       # %bb.0:
1236; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1237; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1238; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1239; X64-NEXT:    vmovdqa %ymm1, %ymm0
1240; X64-NEXT:    retq
1241  %arg0 = bitcast ptr%a0 to ptr
1242  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, ptr %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1243  ret <4 x i64> %call
1244}
1245declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr, <4 x i64>, <4 x i64>, i8) nounwind readonly
1246
1247define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, ptr%a1, <4 x i64> %a2, <4 x i64> %a3) {
1248; X86-LABEL: test_mm256_mask_i64gather_epi64:
1249; X86:       # %bb.0:
1250; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1251; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1252; X86-NEXT:    retl
1253;
1254; X64-LABEL: test_mm256_mask_i64gather_epi64:
1255; X64:       # %bb.0:
1256; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1257; X64-NEXT:    retq
1258  %arg1 = bitcast ptr%a1 to ptr
1259  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, ptr %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1260  ret <4 x i64> %call
1261}
1262
1263define <2 x double> @test_mm_i64gather_pd(ptr%a0, <2 x i64> %a1) {
1264; X86-LABEL: test_mm_i64gather_pd:
1265; X86:       # %bb.0:
1266; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1267; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1268; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1269; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1270; X86-NEXT:    vmovapd %xmm1, %xmm0
1271; X86-NEXT:    retl
1272;
1273; X64-LABEL: test_mm_i64gather_pd:
1274; X64:       # %bb.0:
1275; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1276; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1277; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1278; X64-NEXT:    vmovapd %xmm1, %xmm0
1279; X64-NEXT:    retq
1280  %arg0 = bitcast ptr%a0 to ptr
1281  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1282  %sext = sext <2 x i1> %cmp to <2 x i64>
1283  %mask = bitcast <2 x i64> %sext to <2 x double>
1284  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, ptr %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1285  ret <2 x double> %call
1286}
1287declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, ptr, <2 x i64>, <2 x double>, i8) nounwind readonly
1288
1289define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, ptr%a1, <2 x i64> %a2, <2 x double> %a3) {
1290; X86-LABEL: test_mm_mask_i64gather_pd:
1291; X86:       # %bb.0:
1292; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1293; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1294; X86-NEXT:    retl
1295;
1296; X64-LABEL: test_mm_mask_i64gather_pd:
1297; X64:       # %bb.0:
1298; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1299; X64-NEXT:    retq
1300  %arg1 = bitcast ptr%a1 to ptr
1301  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, ptr %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1302  ret <2 x double> %call
1303}
1304
1305define <4 x double> @test_mm256_i64gather_pd(ptr%a0, <4 x i64> %a1) {
1306; X86-LABEL: test_mm256_i64gather_pd:
1307; X86:       # %bb.0:
1308; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1309; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1310; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1311; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1312; X86-NEXT:    vmovapd %ymm1, %ymm0
1313; X86-NEXT:    retl
1314;
1315; X64-LABEL: test_mm256_i64gather_pd:
1316; X64:       # %bb.0:
1317; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1318; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1319; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1320; X64-NEXT:    vmovapd %ymm1, %ymm0
1321; X64-NEXT:    retq
1322  %arg0 = bitcast ptr %a0 to ptr
1323  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1324  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, ptr %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1325  ret <4 x double> %call
1326}
1327declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, ptr, <4 x i64>, <4 x double>, i8) nounwind readonly
1328
1329define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, ptr %a1, <4 x i64> %a2, <4 x double> %a3) {
1330; X86-LABEL: test_mm256_mask_i64gather_pd:
1331; X86:       # %bb.0:
1332; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1333; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1334; X86-NEXT:    retl
1335;
1336; X64-LABEL: test_mm256_mask_i64gather_pd:
1337; X64:       # %bb.0:
1338; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1339; X64-NEXT:    retq
1340  %arg1 = bitcast ptr %a1 to ptr
1341  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, ptr %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
1342  ret <4 x double> %call
1343}
1344
1345define <4 x float> @test_mm_i64gather_ps(ptr %a0, <2 x i64> %a1) {
1346; X86-LABEL: test_mm_i64gather_ps:
1347; X86:       # %bb.0:
1348; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1349; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1350; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1351; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1352; X86-NEXT:    vmovaps %xmm1, %xmm0
1353; X86-NEXT:    retl
1354;
1355; X64-LABEL: test_mm_i64gather_ps:
1356; X64:       # %bb.0:
1357; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1358; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1359; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1360; X64-NEXT:    vmovaps %xmm1, %xmm0
1361; X64-NEXT:    retq
1362  %arg0 = bitcast ptr %a0 to ptr
1363  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1364  %sext = sext <4 x i1> %cmp to <4 x i32>
1365  %mask = bitcast <4 x i32> %sext to <4 x float>
1366  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, ptr %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
1367  ret <4 x float> %call
1368}
1369declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, ptr, <2 x i64>, <4 x float>, i8) nounwind readonly
1370
1371define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, ptr %a1, <2 x i64> %a2, <4 x float> %a3) {
1372; X86-LABEL: test_mm_mask_i64gather_ps:
1373; X86:       # %bb.0:
1374; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1375; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1376; X86-NEXT:    retl
1377;
1378; X64-LABEL: test_mm_mask_i64gather_ps:
1379; X64:       # %bb.0:
1380; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1381; X64-NEXT:    retq
1382  %arg1 = bitcast ptr %a1 to ptr
1383  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, ptr %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
1384  ret <4 x float> %call
1385}
1386
1387define <4 x float> @test_mm256_i64gather_ps(ptr %a0, <4 x i64> %a1) {
1388; X86-LABEL: test_mm256_i64gather_ps:
1389; X86:       # %bb.0:
1390; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1391; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1392; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1393; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1394; X86-NEXT:    vmovaps %xmm1, %xmm0
1395; X86-NEXT:    vzeroupper
1396; X86-NEXT:    retl
1397;
1398; X64-LABEL: test_mm256_i64gather_ps:
1399; X64:       # %bb.0:
1400; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1401; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1402; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1403; X64-NEXT:    vmovaps %xmm1, %xmm0
1404; X64-NEXT:    vzeroupper
1405; X64-NEXT:    retq
1406  %arg0 = bitcast ptr %a0 to ptr
1407  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1408  %sext = sext <4 x i1> %cmp to <4 x i32>
1409  %mask = bitcast <4 x i32> %sext to <4 x float>
1410  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, ptr %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
1411  ret <4 x float> %call
1412}
1413declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, ptr, <4 x i64>, <4 x float>, i8) nounwind readonly
1414
1415define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, ptr %a1, <4 x i64> %a2, <4 x float> %a3) {
1416; X86-LABEL: test_mm256_mask_i64gather_ps:
1417; X86:       # %bb.0:
1418; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1419; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1420; X86-NEXT:    vzeroupper
1421; X86-NEXT:    retl
1422;
1423; X64-LABEL: test_mm256_mask_i64gather_ps:
1424; X64:       # %bb.0:
1425; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1426; X64-NEXT:    vzeroupper
1427; X64-NEXT:    retq
1428  %arg1 = bitcast ptr %a1 to ptr
1429  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, ptr %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
1430  ret <4 x float> %call
1431}
1432
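; Both inserti128 tests express the insertion as shufflevectors: inserting into
; the low 128-bit half is matched as a simple blend (test0), while inserting
; into the high half selects vinsertf128 (test1).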
1433define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1434; CHECK-LABEL: test0_mm256_inserti128_si256:
1435; CHECK:       # %bb.0:
1436; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1437; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1438; CHECK-NEXT:    ret{{[l|q]}}
1439  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1440  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1441  ret <4 x i64> %res
1442}
1443
1444define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1445; CHECK-LABEL: test1_mm256_inserti128_si256:
1446; CHECK:       # %bb.0:
1447; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1448; CHECK-NEXT:    ret{{[l|q]}}
1449  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1450  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1451  ret <4 x i64> %res
1452}
1453
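; pmaddwd multiplies adjacent pairs of signed 16-bit elements and horizontally
; adds each pair into a signed 32-bit result.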
1454define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1455; CHECK-LABEL: test_mm256_madd_epi16:
1456; CHECK:       # %bb.0:
1457; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1458; CHECK-NEXT:    ret{{[l|q]}}
1459  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1460  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1461  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1462  %bc = bitcast <8 x i32> %res to <4 x i64>
1463  ret <4 x i64> %bc
1464}
1465declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1466
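; pmaddubsw multiplies unsigned bytes from the first operand by signed bytes
; from the second, adding adjacent pairs into 16-bit results with signed
; saturation.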
1467define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1468; CHECK-LABEL: test_mm256_maddubs_epi16:
1469; CHECK:       # %bb.0:
1470; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
1471; CHECK-NEXT:    ret{{[l|q]}}
1472  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1473  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1474  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1475  %bc = bitcast <16 x i16> %res to <4 x i64>
1476  ret <4 x i64> %bc
1477}
1478declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1479
1480define <2 x i64> @test_mm_maskload_epi32(ptr %a0, <2 x i64> %a1) nounwind {
1481; X86-LABEL: test_mm_maskload_epi32:
1482; X86:       # %bb.0:
1483; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1484; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
1485; X86-NEXT:    retl
1486;
1487; X64-LABEL: test_mm_maskload_epi32:
1488; X64:       # %bb.0:
1489; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
1490; X64-NEXT:    retq
1491  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1492  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %arg1)
1493  %bc = bitcast <4 x i32> %call to <2 x i64>
1494  ret <2 x i64> %bc
1495}
1496declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly
1497
1498define <4 x i64> @test_mm256_maskload_epi32(ptr %a0, <4 x i64> %a1) nounwind {
1499; X86-LABEL: test_mm256_maskload_epi32:
1500; X86:       # %bb.0:
1501; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1502; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
1503; X86-NEXT:    retl
1504;
1505; X64-LABEL: test_mm256_maskload_epi32:
1506; X64:       # %bb.0:
1507; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
1508; X64-NEXT:    retq
1509  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1510  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %arg1)
1511  %bc = bitcast <8 x i32> %call to <4 x i64>
1512  ret <4 x i64> %bc
1513}
1514declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonly
1515
1516define <2 x i64> @test_mm_maskload_epi64(ptr %a0, <2 x i64> %a1) nounwind {
1517; X86-LABEL: test_mm_maskload_epi64:
1518; X86:       # %bb.0:
1519; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1520; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
1521; X86-NEXT:    retl
1522;
1523; X64-LABEL: test_mm_maskload_epi64:
1524; X64:       # %bb.0:
1525; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
1526; X64-NEXT:    retq
1527  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1)
1528  ret <2 x i64> %res
1529}
1530declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly
1531
1532define <4 x i64> @test_mm256_maskload_epi64(ptr %a0, <4 x i64> %a1) nounwind {
1533; X86-LABEL: test_mm256_maskload_epi64:
1534; X86:       # %bb.0:
1535; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1536; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
1537; X86-NEXT:    retl
1538;
1539; X64-LABEL: test_mm256_maskload_epi64:
1540; X64:       # %bb.0:
1541; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
1542; X64-NEXT:    retq
1543  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1)
1544  ret <4 x i64> %res
1545}
1546declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonly
1547
1548define void @test_mm_maskstore_epi32(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1549; X86-LABEL: test_mm_maskstore_epi32:
1550; X86:       # %bb.0:
1551; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1552; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
1553; X86-NEXT:    retl
1554;
1555; X64-LABEL: test_mm_maskstore_epi32:
1556; X64:       # %bb.0:
1557; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
1558; X64-NEXT:    retq
1559  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1560  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1561  call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %arg1, <4 x i32> %arg2)
1562  ret void
1563}
1564declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind readnone
1565
1566define void @test_mm256_maskstore_epi32(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1567; X86-LABEL: test_mm256_maskstore_epi32:
1568; X86:       # %bb.0:
1569; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1570; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
1571; X86-NEXT:    vzeroupper
1572; X86-NEXT:    retl
1573;
1574; X64-LABEL: test_mm256_maskstore_epi32:
1575; X64:       # %bb.0:
1576; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
1577; X64-NEXT:    vzeroupper
1578; X64-NEXT:    retq
1579  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1580  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1581  call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %arg1, <8 x i32> %arg2)
1582  ret void
1583}
1584declare void @llvm.x86.avx2.maskstore.d.256(ptr, <8 x i32>, <8 x i32>) nounwind readnone
1585
1586define void @test_mm_maskstore_epi64(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1587; X86-LABEL: test_mm_maskstore_epi64:
1588; X86:       # %bb.0:
1589; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1590; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
1591; X86-NEXT:    retl
1592;
1593; X64-LABEL: test_mm_maskstore_epi64:
1594; X64:       # %bb.0:
1595; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
1596; X64-NEXT:    retq
1597  call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2)
1598  ret void
1599}
1600declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind readnone
1601
1602define void @test_mm256_maskstore_epi64(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1603; X86-LABEL: test_mm256_maskstore_epi64:
1604; X86:       # %bb.0:
1605; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1606; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
1607; X86-NEXT:    vzeroupper
1608; X86-NEXT:    retl
1609;
1610; X64-LABEL: test_mm256_maskstore_epi64:
1611; X64:       # %bb.0:
1612; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
1613; X64-NEXT:    vzeroupper
1614; X64-NEXT:    retq
1615  call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2)
1616  ret void
1617}
1618declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind readnone
1619
1620define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1621; CHECK-LABEL: test_mm256_max_epi8:
1622; CHECK:       # %bb.0:
1623; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
1624; CHECK-NEXT:    ret{{[l|q]}}
1625  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1626  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1627  %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1628  %bc = bitcast <32 x i8> %sel to <4 x i64>
1629  ret <4 x i64> %bc
1630}
1631declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
1632
1633define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1634; CHECK-LABEL: test_mm256_max_epi16:
1635; CHECK:       # %bb.0:
1636; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
1637; CHECK-NEXT:    ret{{[l|q]}}
1638  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1639  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1640  %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1641  %bc = bitcast <16 x i16> %sel to <4 x i64>
1642  ret <4 x i64> %bc
1643}
1644declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
1645
1646define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1647; CHECK-LABEL: test_mm256_max_epi32:
1648; CHECK:       # %bb.0:
1649; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
1650; CHECK-NEXT:    ret{{[l|q]}}
1651  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1652  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1653  %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1654  %bc = bitcast <8 x i32> %sel to <4 x i64>
1655  ret <4 x i64> %bc
1656}
1657declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
1658
1659define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1660; CHECK-LABEL: test_mm256_max_epu8:
1661; CHECK:       # %bb.0:
1662; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
1663; CHECK-NEXT:    ret{{[l|q]}}
1664  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1665  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1666  %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1667  %bc = bitcast <32 x i8> %sel to <4 x i64>
1668  ret <4 x i64> %bc
1669}
1670declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
1671
1672define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1673; CHECK-LABEL: test_mm256_max_epu16:
1674; CHECK:       # %bb.0:
1675; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
1676; CHECK-NEXT:    ret{{[l|q]}}
1677  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1678  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1679  %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1680  %bc = bitcast <16 x i16> %sel to <4 x i64>
1681  ret <4 x i64> %bc
1682}
1683declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
1684
1685define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1686; CHECK-LABEL: test_mm256_max_epu32:
1687; CHECK:       # %bb.0:
1688; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
1689; CHECK-NEXT:    ret{{[l|q]}}
1690  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1691  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1692  %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1693  %bc = bitcast <8 x i32> %sel to <4 x i64>
1694  ret <4 x i64> %bc
1695}
1696declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
1697
1698define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1699; CHECK-LABEL: test_mm256_min_epi8:
1700; CHECK:       # %bb.0:
1701; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
1702; CHECK-NEXT:    ret{{[l|q]}}
1703  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1704  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1705  %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1706  %bc = bitcast <32 x i8> %sel to <4 x i64>
1707  ret <4 x i64> %bc
1708}
1709declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
1710
1711define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1712; CHECK-LABEL: test_mm256_min_epi16:
1713; CHECK:       # %bb.0:
1714; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
1715; CHECK-NEXT:    ret{{[l|q]}}
1716  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1717  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1718  %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1719  %bc = bitcast <16 x i16> %sel to <4 x i64>
1720  ret <4 x i64> %bc
1721}
1722declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
1723
1724define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1725; CHECK-LABEL: test_mm256_min_epi32:
1726; CHECK:       # %bb.0:
1727; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
1728; CHECK-NEXT:    ret{{[l|q]}}
1729  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1730  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1731  %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1732  %bc = bitcast <8 x i32> %sel to <4 x i64>
1733  ret <4 x i64> %bc
1734}
1735declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
1736
1737define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1738; CHECK-LABEL: test_mm256_min_epu8:
1739; CHECK:       # %bb.0:
1740; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
1741; CHECK-NEXT:    ret{{[l|q]}}
1742  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1743  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1744  %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1745  %bc = bitcast <32 x i8> %sel to <4 x i64>
1746  ret <4 x i64> %bc
1747}
1748declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
1749
1750define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1751; CHECK-LABEL: test_mm256_min_epu16:
1752; CHECK:       # %bb.0:
1753; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
1754; CHECK-NEXT:    ret{{[l|q]}}
1755  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1756  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1757  %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1758  %bc = bitcast <16 x i16> %sel to <4 x i64>
1759  ret <4 x i64> %bc
1760}
1761declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
1762
1763define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1764; CHECK-LABEL: test_mm256_min_epu32:
1765; CHECK:       # %bb.0:
1766; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
1767; CHECK-NEXT:    ret{{[l|q]}}
1768  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1769  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1770  %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1771  %bc = bitcast <8 x i32> %sel to <4 x i64>
1772  ret <4 x i64> %bc
1773}
1774declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
1775
1776define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
1777; CHECK-LABEL: test_mm256_movemask_epi8:
1778; CHECK:       # %bb.0:
1779; CHECK-NEXT:    vpmovmskb %ymm0, %eax
1780; CHECK-NEXT:    vzeroupper
1781; CHECK-NEXT:    ret{{[l|q]}}
1782  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1783  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
1784  ret i32 %res
1785}
1786declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
1787
1788define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1789; CHECK-LABEL: test_mm256_mpsadbw_epu8:
1790; CHECK:       # %bb.0:
1791; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
1792; CHECK-NEXT:    ret{{[l|q]}}
1793  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1794  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1795  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
1796  %bc = bitcast <16 x i16> %call to <4 x i64>
1797  ret <4 x i64> %bc
1798}
1799declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
1800
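; The shl-by-32/ashr-exact-by-32 pairs below sign-extend the low 32 bits of
; each 64-bit element, so the multiply is recognized and lowered as vpmuldq.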
1801define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1802; CHECK-LABEL: test_mm256_mul_epi32:
1803; CHECK:       # %bb.0:
1804; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
1805; CHECK-NEXT:    ret{{[l|q]}}
1806  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
1807  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
1808  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
1809  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
1810  %res = mul nsw <4 x i64> %A1, %B1
1811  ret <4 x i64> %res
1812}
1813declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
1814
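; Masking each 64-bit element with 4294967295 (0xffffffff) zero-extends its low
; 32 bits, so the multiply is recognized and lowered as vpmuludq.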
1815define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1816; CHECK-LABEL: test_mm256_mul_epu32:
1817; CHECK:       # %bb.0:
1818; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1819; CHECK-NEXT:    ret{{[l|q]}}
1820  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1821  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1822  %res = mul nuw <4 x i64> %A, %B
1823  ret <4 x i64> %res
1824}
1825declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
1826
1827define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1828; CHECK-LABEL: test_mm256_mulhi_epi16:
1829; CHECK:       # %bb.0:
1830; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1831; CHECK-NEXT:    ret{{[l|q]}}
1832  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1833  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1834  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
1835  %bc = bitcast <16 x i16> %res to <4 x i64>
1836  ret <4 x i64> %bc
1837}
1838declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
1839
1840define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1841; CHECK-LABEL: test_mm256_mulhi_epu16:
1842; CHECK:       # %bb.0:
1843; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
1844; CHECK-NEXT:    ret{{[l|q]}}
1845  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1846  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1847  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
1848  %bc = bitcast <16 x i16> %res to <4 x i64>
1849  ret <4 x i64> %bc
1850}
1851declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
1852
1853define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1854; CHECK-LABEL: test_mm256_mulhrs_epi16:
1855; CHECK:       # %bb.0:
1856; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
1857; CHECK-NEXT:    ret{{[l|q]}}
1858  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1859  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1860  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1861  %bc = bitcast <16 x i16> %res to <4 x i64>
1862  ret <4 x i64> %bc
1863}
1864declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
1865
1866define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1867; CHECK-LABEL: test_mm256_mullo_epi16:
1868; CHECK:       # %bb.0:
1869; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1870; CHECK-NEXT:    ret{{[l|q]}}
1871  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1872  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1873  %res = mul <16 x i16> %arg0, %arg1
1874  %bc = bitcast <16 x i16> %res to <4 x i64>
1875  ret <4 x i64> %bc
1876}
1877
1878define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1879; CHECK-LABEL: test_mm256_mullo_epi32:
1880; CHECK:       # %bb.0:
1881; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1882; CHECK-NEXT:    ret{{[l|q]}}
1883  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1884  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1885  %res = mul <8 x i32> %arg0, %arg1
1886  %bc = bitcast <8 x i32> %res to <4 x i64>
1887  ret <4 x i64> %bc
1888}
1889
1890define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1891; CHECK-LABEL: test_mm256_or_si256:
1892; CHECK:       # %bb.0:
1893; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1894; CHECK-NEXT:    ret{{[l|q]}}
1895  %res = or <4 x i64> %a0, %a1
1896  ret <4 x i64> %res
1897}
1898
1899define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1900; CHECK-LABEL: test_mm256_packs_epi16:
1901; CHECK:       # %bb.0:
1902; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
1903; CHECK-NEXT:    ret{{[l|q]}}
1904  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1905  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1906  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
1907  %res = bitcast <32 x i8> %call to <4 x i64>
1908  ret <4 x i64> %res
1909}
1910declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
1911
1912define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1913; CHECK-LABEL: test_mm256_packs_epi32:
1914; CHECK:       # %bb.0:
1915; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
1916; CHECK-NEXT:    ret{{[l|q]}}
1917  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1918  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1919  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
1920  %res = bitcast <16 x i16> %call to <4 x i64>
1921  ret <4 x i64> %res
1922}
1923declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
1924
1925define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1926; CHECK-LABEL: test_mm256_packus_epi16:
1927; CHECK:       # %bb.0:
1928; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1929; CHECK-NEXT:    ret{{[l|q]}}
1930  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1931  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1932  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
1933  %res = bitcast <32 x i8> %call to <4 x i64>
1934  ret <4 x i64> %res
1935}
1936declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
1937
1938define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1939; CHECK-LABEL: test_mm256_packus_epi32:
1940; CHECK:       # %bb.0:
1941; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1942; CHECK-NEXT:    ret{{[l|q]}}
1943  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1944  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1945  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
1946  %res = bitcast <16 x i16> %call to <4 x i64>
1947  ret <4 x i64> %res
1948}
1949declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
1950
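; The <2,3,6,7> mask selects the upper 128-bit halves of both sources; the
; cross-lane shuffle is lowered in the floating-point domain as vperm2f128.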
1951define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
1952; CHECK-LABEL: test_mm256_permute2x128_si256:
1953; CHECK:       # %bb.0:
1954; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1955; CHECK-NEXT:    ret{{[l|q]}}
1956  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1957  ret <4 x i64> %res
1958}
1959declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
1960
1961define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
1962; CHECK-LABEL: test_mm256_permute4x64_epi64:
1963; CHECK:       # %bb.0:
1964; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
1965; CHECK-NEXT:    ret{{[l|q]}}
1966  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
1967  ret <4 x i64> %res
1968}
1969
1970define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
1971; CHECK-LABEL: test_mm256_permute4x64_pd:
1972; CHECK:       # %bb.0:
1973; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
1974; CHECK-NEXT:    ret{{[l|q]}}
1975  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
1976  ret <4 x double> %res
1977}
1978
1979define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1980; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
1981; CHECK:       # %bb.0:
1982; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1983; CHECK-NEXT:    ret{{[l|q]}}
1984  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1985  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1986  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
1987  %res = bitcast <8 x i32> %call to <4 x i64>
1988  ret <4 x i64> %res
1989}
1990declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
1991
1992define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
1993; CHECK-LABEL: test_mm256_permutevar8x32_ps:
1994; CHECK:       # %bb.0:
1995; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1996; CHECK-NEXT:    ret{{[l|q]}}
1997  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1998  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
1999  ret <8 x float> %res
2000}
2001declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2002
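; vpsadbw computes the absolute differences of the 32 unsigned byte pairs and
; sums each group of eight into the low 16 bits of the corresponding 64-bit
; element.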
2003define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2004; CHECK-LABEL: test_mm256_sad_epu8:
2005; CHECK:       # %bb.0:
2006; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2007; CHECK-NEXT:    ret{{[l|q]}}
2008  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2009  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2010  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2011  ret <4 x i64> %res
2012}
2013declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2014
2015define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2016; CHECK-LABEL: test_mm256_shuffle_epi32:
2017; CHECK:       # %bb.0:
2018; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2019; CHECK-NEXT:    ret{{[l|q]}}
2020  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2021  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2022  %res = bitcast <8 x i32> %shuf to <4 x i64>
2023  ret <4 x i64> %res
2024}
2025
2026define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2027; CHECK-LABEL: test_mm256_shuffle_epi8:
2028; CHECK:       # %bb.0:
2029; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2030; CHECK-NEXT:    ret{{[l|q]}}
2031  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2032  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2033  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2034  %res = bitcast <32 x i8> %shuf to <4 x i64>
2035  ret <4 x i64> %res
2036}
2037declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
2038
2039define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2040; CHECK-LABEL: test_mm256_shufflehi_epi16:
2041; CHECK:       # %bb.0:
2042; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2043; CHECK-NEXT:    ret{{[l|q]}}
2044  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2045  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2046  %res = bitcast <16 x i16> %shuf to <4 x i64>
2047  ret <4 x i64> %res
2048}
2049
2050define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2051; CHECK-LABEL: test_mm256_shufflelo_epi16:
2052; CHECK:       # %bb.0:
2053; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2054; CHECK-NEXT:    ret{{[l|q]}}
2055  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2056  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2057  %res = bitcast <16 x i16> %shuf to <4 x i64>
2058  ret <4 x i64> %res
2059}
2060
2061define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2062; CHECK-LABEL: test_mm256_sign_epi8:
2063; CHECK:       # %bb.0:
2064; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2065; CHECK-NEXT:    ret{{[l|q]}}
2066  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2067  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2068  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2069  %res = bitcast <32 x i8> %call to <4 x i64>
2070  ret <4 x i64> %res
2071}
2072declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
2073
2074define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2075; CHECK-LABEL: test_mm256_sign_epi16:
2076; CHECK:       # %bb.0:
2077; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2078; CHECK-NEXT:    ret{{[l|q]}}
2079  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2080  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2081  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2082  %res = bitcast <16 x i16> %call to <4 x i64>
2083  ret <4 x i64> %res
2084}
2085declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
2086
2087define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2088; CHECK-LABEL: test_mm256_sign_epi32:
2089; CHECK:       # %bb.0:
2090; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2091; CHECK-NEXT:    ret{{[l|q]}}
2092  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2093  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2094  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2095  %res = bitcast <8 x i32> %call to <4 x i64>
2096  ret <4 x i64> %res
2097}
2098declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
2099
2100define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2101; CHECK-LABEL: test_mm256_sll_epi16:
2102; CHECK:       # %bb.0:
2103; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2104; CHECK-NEXT:    ret{{[l|q]}}
2105  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2106  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2107  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2108  %bc = bitcast <16 x i16> %res to <4 x i64>
2109  ret <4 x i64> %bc
2110}
2111declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
2112
2113define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2114; CHECK-LABEL: test_mm256_sll_epi32:
2115; CHECK:       # %bb.0:
2116; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2117; CHECK-NEXT:    ret{{[l|q]}}
2118  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2119  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2120  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2121  %bc = bitcast <8 x i32> %res to <4 x i64>
2122  ret <4 x i64> %bc
2123}
2124declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
2125
2126define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2127; CHECK-LABEL: test_mm256_sll_epi64:
2128; CHECK:       # %bb.0:
2129; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2130; CHECK-NEXT:    ret{{[l|q]}}
2131  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
2132  ret <4 x i64> %res
2133}
2134declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
2135
2136define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
2137; CHECK-LABEL: test_mm256_slli_epi16:
2138; CHECK:       # %bb.0:
2139; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
2140; CHECK-NEXT:    ret{{[l|q]}}
2141  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2142  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
2143  %bc = bitcast <16 x i16> %res to <4 x i64>
2144  ret <4 x i64> %bc
2145}
2146declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
2147
2148define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
2149; CHECK-LABEL: test_mm256_slli_epi32:
2150; CHECK:       # %bb.0:
2151; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
2152; CHECK-NEXT:    ret{{[l|q]}}
2153  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2154  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
2155  %bc = bitcast <8 x i32> %res to <4 x i64>
2156  ret <4 x i64> %bc
2157}
2158declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
2159
2160define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
2161; CHECK-LABEL: test_mm256_slli_epi64:
2162; CHECK:       # %bb.0:
2163; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
2164; CHECK-NEXT:    ret{{[l|q]}}
2165  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
2166  ret <4 x i64> %res
2167}
2168declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
2169
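; _mm256_slli_si256 shifts each 128-bit lane left independently, so the shuffle
; pulls zeros into the low three bytes of each lane; vpslldq never moves bytes
; across the lane boundary.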
2170define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
2171; CHECK-LABEL: test_mm256_slli_si256:
2172; CHECK:       # %bb.0:
2173; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2174; CHECK-NEXT:    ret{{[l|q]}}
2175  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2176  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
2177  %res = bitcast <32 x i8> %shuf to <4 x i64>
2178  ret <4 x i64> %res
2179}
2180
2181define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2182; CHECK-LABEL: test_mm_sllv_epi32:
2183; CHECK:       # %bb.0:
2184; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2185; CHECK-NEXT:    ret{{[l|q]}}
2186  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2187  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2188  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2189  %bc = bitcast <4 x i32> %res to <2 x i64>
2190  ret <2 x i64> %bc
2191}
2192declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
2193
2194define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2195; CHECK-LABEL: test_mm256_sllv_epi32:
2196; CHECK:       # %bb.0:
2197; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2198; CHECK-NEXT:    ret{{[l|q]}}
2199  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2200  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2201  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2202  %bc = bitcast <8 x i32> %res to <4 x i64>
2203  ret <4 x i64> %bc
2204}
2205declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2206
2207define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2208; CHECK-LABEL: test_mm_sllv_epi64:
2209; CHECK:       # %bb.0:
2210; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2211; CHECK-NEXT:    ret{{[l|q]}}
2212  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
2213  ret <2 x i64> %res
2214}
2215declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
2216
2217define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2218; CHECK-LABEL: test_mm256_sllv_epi64:
2219; CHECK:       # %bb.0:
2220; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2221; CHECK-NEXT:    ret{{[l|q]}}
2222  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2223  ret <4 x i64> %res
2224}
2225declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2226
2227define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2228; CHECK-LABEL: test_mm256_sra_epi16:
2229; CHECK:       # %bb.0:
2230; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2231; CHECK-NEXT:    ret{{[l|q]}}
2232  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2233  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2234  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
2235  %bc = bitcast <16 x i16> %res to <4 x i64>
2236  ret <4 x i64> %bc
2237}
2238declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
2239
2240define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2241; CHECK-LABEL: test_mm256_sra_epi32:
2242; CHECK:       # %bb.0:
2243; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2244; CHECK-NEXT:    ret{{[l|q]}}
2245  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2246  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2247  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
2248  %bc = bitcast <8 x i32> %res to <4 x i64>
2249  ret <4 x i64> %bc
2250}
2251declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
2252
2253define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
2254; CHECK-LABEL: test_mm256_srai_epi16:
2255; CHECK:       # %bb.0:
2256; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
2257; CHECK-NEXT:    ret{{[l|q]}}
2258  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2259  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
2260  %bc = bitcast <16 x i16> %res to <4 x i64>
2261  ret <4 x i64> %bc
2262}
2263declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
2264
2265define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
2266; CHECK-LABEL: test_mm256_srai_epi32:
2267; CHECK:       # %bb.0:
2268; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
2269; CHECK-NEXT:    ret{{[l|q]}}
2270  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2271  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
2272  %bc = bitcast <8 x i32> %res to <4 x i64>
2273  ret <4 x i64> %bc
2274}
2275declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
2276
2277define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2278; CHECK-LABEL: test_mm_srav_epi32:
2279; CHECK:       # %bb.0:
2280; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2281; CHECK-NEXT:    ret{{[l|q]}}
2282  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2283  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2284  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
2285  %bc = bitcast <4 x i32> %res to <2 x i64>
2286  ret <2 x i64> %bc
2287}
2288declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
2289
2290define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2291; CHECK-LABEL: test_mm256_srav_epi32:
2292; CHECK:       # %bb.0:
2293; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2294; CHECK-NEXT:    ret{{[l|q]}}
2295  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2296  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2297  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2298  %bc = bitcast <8 x i32> %res to <4 x i64>
2299  ret <4 x i64> %bc
2300}
2301declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2302
2303define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2304; CHECK-LABEL: test_mm256_srl_epi16:
2305; CHECK:       # %bb.0:
2306; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2307; CHECK-NEXT:    ret{{[l|q]}}
2308  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2309  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2310  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
2311  %bc = bitcast <16 x i16> %res to <4 x i64>
2312  ret <4 x i64> %bc
2313}
2314declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
2315
2316define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2317; CHECK-LABEL: test_mm256_srl_epi32:
2318; CHECK:       # %bb.0:
2319; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2320; CHECK-NEXT:    ret{{[l|q]}}
2321  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2322  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2323  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
2324  %bc = bitcast <8 x i32> %res to <4 x i64>
2325  ret <4 x i64> %bc
2326}
2327declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
2328
2329define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2330; CHECK-LABEL: test_mm256_srl_epi64:
2331; CHECK:       # %bb.0:
2332; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2333; CHECK-NEXT:    ret{{[l|q]}}
2334  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
2335  ret <4 x i64> %res
2336}
2337declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
2338
2339define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
2340; CHECK-LABEL: test_mm256_srli_epi16:
2341; CHECK:       # %bb.0:
2342; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
2343; CHECK-NEXT:    ret{{[l|q]}}
2344  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2345  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
2346  %bc = bitcast <16 x i16> %res to <4 x i64>
2347  ret <4 x i64> %bc
2348}
2349declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
2350
2351define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
2352; CHECK-LABEL: test_mm256_srli_epi32:
2353; CHECK:       # %bb.0:
2354; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
2355; CHECK-NEXT:    ret{{[l|q]}}
2356  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2357  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
2358  %bc = bitcast <8 x i32> %res to <4 x i64>
2359  ret <4 x i64> %bc
2360}
2361declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
2362
2363define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
2364; CHECK-LABEL: test_mm256_srli_epi64:
2365; CHECK:       # %bb.0:
2366; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
2367; CHECK-NEXT:    ret{{[l|q]}}
2368  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
2369  ret <4 x i64> %res
2370}
2371declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
2372
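; As with the byte-wise left shift above, the right shift operates on each
; 128-bit lane independently, shifting zeros into the high three bytes of each
; lane via vpsrldq.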
2373define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
2374; CHECK-LABEL: test_mm256_srli_si256:
2375; CHECK:       # %bb.0:
2376; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2377; CHECK-NEXT:    ret{{[l|q]}}
2378  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2379  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
2380  %res = bitcast <32 x i8> %shuf to <4 x i64>
2381  ret <4 x i64> %res
2382}
2383
2384define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2385; CHECK-LABEL: test_mm_srlv_epi32:
2386; CHECK:       # %bb.0:
2387; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
2388; CHECK-NEXT:    ret{{[l|q]}}
2389  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2390  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2391  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2392  %bc = bitcast <4 x i32> %res to <2 x i64>
2393  ret <2 x i64> %bc
2394}
2395declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
2396
2397define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2398; CHECK-LABEL: test_mm256_srlv_epi32:
2399; CHECK:       # %bb.0:
2400; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
2401; CHECK-NEXT:    ret{{[l|q]}}
2402  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2403  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2404  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2405  %bc = bitcast <8 x i32> %res to <4 x i64>
2406  ret <4 x i64> %bc
2407}
2408declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2409
2410define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2411; CHECK-LABEL: test_mm_srlv_epi64:
2412; CHECK:       # %bb.0:
2413; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
2414; CHECK-NEXT:    ret{{[l|q]}}
2415  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
2416  ret <2 x i64> %res
2417}
2418declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
2419
2420define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2421; CHECK-LABEL: test_mm256_srlv_epi64:
2422; CHECK:       # %bb.0:
2423; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
2424; CHECK-NEXT:    ret{{[l|q]}}
2425  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2426  ret <4 x i64> %res
2427}
2428declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2429
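; vmovntdqa is a non-temporal load intended to bypass the cache hierarchy; the
; source address is expected to be 32-byte aligned.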
2430define <4 x i64> @test_mm256_stream_load_si256(ptr %a0) {
2431; X86-LABEL: test_mm256_stream_load_si256:
2432; X86:       # %bb.0:
2433; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2434; X86-NEXT:    vmovntdqa (%eax), %ymm0
2435; X86-NEXT:    retl
2436;
2437; X64-LABEL: test_mm256_stream_load_si256:
2438; X64:       # %bb.0:
2439; X64-NEXT:    vmovntdqa (%rdi), %ymm0
2440; X64-NEXT:    retq
2441  %arg0 = bitcast ptr %a0 to ptr
2442  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(ptr %arg0)
2443  ret <4 x i64> %res
2444}
2445declare <4 x i64> @llvm.x86.avx2.movntdqa(ptr) nounwind readonly
2446
2447define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2448; CHECK-LABEL: test_mm256_sub_epi8:
2449; CHECK:       # %bb.0:
2450; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
2451; CHECK-NEXT:    ret{{[l|q]}}
2452  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2453  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2454  %res = sub <32 x i8> %arg0, %arg1
2455  %bc = bitcast <32 x i8> %res to <4 x i64>
2456  ret <4 x i64> %bc
2457}
2458
2459define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2460; CHECK-LABEL: test_mm256_sub_epi16:
2461; CHECK:       # %bb.0:
2462; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
2463; CHECK-NEXT:    ret{{[l|q]}}
2464  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2465  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2466  %res = sub <16 x i16> %arg0, %arg1
2467  %bc = bitcast <16 x i16> %res to <4 x i64>
2468  ret <4 x i64> %bc
2469}
2470
2471define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2472; CHECK-LABEL: test_mm256_sub_epi32:
2473; CHECK:       # %bb.0:
2474; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
2475; CHECK-NEXT:    ret{{[l|q]}}
2476  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2477  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2478  %res = sub <8 x i32> %arg0, %arg1
2479  %bc = bitcast <8 x i32> %res to <4 x i64>
2480  ret <4 x i64> %bc
2481}
2482
2483define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2484; CHECK-LABEL: test_mm256_sub_epi64:
2485; CHECK:       # %bb.0:
2486; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
2487; CHECK-NEXT:    ret{{[l|q]}}
2488  %res = sub <4 x i64> %a0, %a1
2489  ret <4 x i64> %res
2490}
2491
2492define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2493; CHECK-LABEL: test_mm256_subs_epi8:
2494; CHECK:       # %bb.0:
2495; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
2496; CHECK-NEXT:    ret{{[l|q]}}
2497  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2498  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2499  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2500  %bc = bitcast <32 x i8> %res to <4 x i64>
2501  ret <4 x i64> %bc
2502}
2503declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
2504
2505define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2506; CHECK-LABEL: test_mm256_subs_epi16:
2507; CHECK:       # %bb.0:
2508; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
2509; CHECK-NEXT:    ret{{[l|q]}}
2510  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2511  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2512  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2513  %bc = bitcast <16 x i16> %res to <4 x i64>
2514  ret <4 x i64> %bc
2515}
2516declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
2517
2518define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2519; CHECK-LABEL: test_mm256_subs_epu8:
2520; CHECK:       # %bb.0:
2521; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
2522; CHECK-NEXT:    ret{{[l|q]}}
2523  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2524  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2525  %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2526  %bc = bitcast <32 x i8> %res to <4 x i64>
2527  ret <4 x i64> %bc
2528}
2529declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
2530
2531define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
2532; CHECK-LABEL: test_mm256_subs_epu16:
2533; CHECK:       # %bb.0:
2534; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
2535; CHECK-NEXT:    ret{{[l|q]}}
2536  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2537  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2538  %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2539  %bc = bitcast <16 x i16> %res to <4 x i64>
2540  ret <4 x i64> %bc
2541}
2542declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
2543
2544define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2545; CHECK-LABEL: test_mm256_unpackhi_epi8:
2546; CHECK:       # %bb.0:
2547; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2548; CHECK-NEXT:    ret{{[l|q]}}
2549  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2550  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2551  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
2552  %bc = bitcast <32 x i8> %res to <4 x i64>
2553  ret <4 x i64> %bc
2554}
2555
2556define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2557; CHECK-LABEL: test_mm256_unpackhi_epi16:
2558; CHECK:       # %bb.0:
2559; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2560; CHECK-NEXT:    ret{{[l|q]}}
2561  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2562  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2563  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2564  %bc = bitcast <16 x i16> %res to <4 x i64>
2565  ret <4 x i64> %bc
2566}
2567
2568define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2569; CHECK-LABEL: test_mm256_unpackhi_epi32:
2570; CHECK:       # %bb.0:
2571; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2572; CHECK-NEXT:    ret{{[l|q]}}
2573  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2574  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2575  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
2576  %bc = bitcast <8 x i32> %res to <4 x i64>
2577  ret <4 x i64> %bc
2578}
2579
2580define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2581; CHECK-LABEL: test_mm256_unpackhi_epi64:
2582; CHECK:       # %bb.0:
2583; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2584; CHECK-NEXT:    ret{{[l|q]}}
2585  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
2586  ret <4 x i64> %res
2587}
2588
2589define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2590; CHECK-LABEL: test_mm256_unpacklo_epi8:
2591; CHECK:       # %bb.0:
2592; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2593; CHECK-NEXT:    ret{{[l|q]}}
2594  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2595  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2596  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
2597  %bc = bitcast <32 x i8> %res to <4 x i64>
2598  ret <4 x i64> %bc
2599}
2600
2601define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2602; CHECK-LABEL: test_mm256_unpacklo_epi16:
2603; CHECK:       # %bb.0:
2604; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
2605; CHECK-NEXT:    ret{{[l|q]}}
2606  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2607  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2608  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
2609  %bc = bitcast <16 x i16> %res to <4 x i64>
2610  ret <4 x i64> %bc
2611}
2612
2613define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2614; CHECK-LABEL: test_mm256_unpacklo_epi32:
2615; CHECK:       # %bb.0:
2616; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2617; CHECK-NEXT:    ret{{[l|q]}}
2618  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2619  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2620  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
2621  %bc = bitcast <8 x i32> %res to <4 x i64>
2622  ret <4 x i64> %bc
2623}
2624
2625define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2626; CHECK-LABEL: test_mm256_unpacklo_epi64:
2627; CHECK:       # %bb.0:
2628; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2629; CHECK-NEXT:    ret{{[l|q]}}
2630  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
2631  ret <4 x i64> %res
2632}
2633
2634define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2635; CHECK-LABEL: test_mm256_xor_si256:
2636; CHECK:       # %bb.0:
2637; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
2638; CHECK-NEXT:    ret{{[l|q]}}
2639  %res = xor <4 x i64> %a0, %a1
2640  ret <4 x i64> %res
2641}
2642
2643declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
2644
2645declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
2646