xref: /llvm-project/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll (revision aca34da46da41792614799a8b6a8b31a5a6e23d9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL
4;
5; Just one 32-bit run to make sure we do reasonable things.
6; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X86-AVX512F
7
; Loads three <2 x double> values at vector indices 1, 2 and 4 of %ptr and
; concatenates them as [val0, val1, undef, val3] into an <8 x double>.
; With the third quarter undef, both the 64-bit and the 32-bit runs expect a
; single unaligned 64-byte load (vmovups) from byte offset 16.
8define <8 x double> @merge_8f64_2f64_12u4(ptr %ptr) nounwind uwtable noinline ssp {
9; ALL-LABEL: merge_8f64_2f64_12u4:
10; ALL:       # %bb.0:
11; ALL-NEXT:    vmovups 16(%rdi), %zmm0
12; ALL-NEXT:    retq
13;
14; X86-AVX512F-LABEL: merge_8f64_2f64_12u4:
15; X86-AVX512F:       # %bb.0:
16; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
17; X86-AVX512F-NEXT:    vmovups 16(%eax), %zmm0
18; X86-AVX512F-NEXT:    retl
19  %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 1
20  %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2
21  %ptr3 = getelementptr inbounds <2 x double>, ptr %ptr, i64 4
22  %val0 = load <2 x double>, ptr %ptr0
23  %val1 = load <2 x double>, ptr %ptr1
24  %val3 = load <2 x double>, ptr %ptr3
; Identity-mask shuffles concatenate the pieces; the first operand of %res23
; is undef, so result elements 4-5 are unconstrained.
25  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
26  %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
27  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
28  ret <8 x double> %res
29}
30
; Loads three <2 x double> values at vector indices 2, 3 and 5 of %ptr and
; concatenates them as [val0, val1, zero, val3] into an <8 x double>.
; Both runs expect one unaligned 64-byte load (vmovdqu64) from byte offset 32
; followed by a vpandq with a constant-pool operand (presumably the mask that
; clears the elements required to be zero).
31define <8 x double> @merge_8f64_2f64_23z5(ptr %ptr) nounwind uwtable noinline ssp {
32; ALL-LABEL: merge_8f64_2f64_23z5:
33; ALL:       # %bb.0:
34; ALL-NEXT:    vmovdqu64 32(%rdi), %zmm0
35; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
36; ALL-NEXT:    retq
37;
38; X86-AVX512F-LABEL: merge_8f64_2f64_23z5:
39; X86-AVX512F:       # %bb.0:
40; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
41; X86-AVX512F-NEXT:    vmovdqu64 32(%eax), %zmm0
42; X86-AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
43; X86-AVX512F-NEXT:    retl
44  %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2
45  %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 3
46  %ptr3 = getelementptr inbounds <2 x double>, ptr %ptr, i64 5
47  %val0 = load <2 x double>, ptr %ptr0
48  %val1 = load <2 x double>, ptr %ptr1
49  %val3 = load <2 x double>, ptr %ptr3
; Identity-mask shuffles concatenate the pieces; the first operand of %res23
; is zeroinitializer, so result elements 4-5 must be zero.
50  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
51  %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
52  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
53  ret <8 x double> %res
54}
55
; Builds an <8 x double> whose lower half is zero and whose upper half is the
; <4 x double> loaded from vector index 2 of %ptr (byte offset 64 in the
; expected output). Both runs expect a zeroing vxorps followed by a
; vinsertf64x4 that places the loaded 256 bits into the upper half.
56define <8 x double> @merge_8f64_4f64_z2(ptr %ptr) nounwind uwtable noinline ssp {
57; ALL-LABEL: merge_8f64_4f64_z2:
58; ALL:       # %bb.0:
59; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
60; ALL-NEXT:    vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
61; ALL-NEXT:    retq
62;
63; X86-AVX512F-LABEL: merge_8f64_4f64_z2:
64; X86-AVX512F:       # %bb.0:
65; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
66; X86-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
67; X86-AVX512F-NEXT:    vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
68; X86-AVX512F-NEXT:    retl
69  %ptr1 = getelementptr inbounds <4 x double>, ptr %ptr, i64 2
70  %val1 = load <4 x double>, ptr %ptr1
; Identity-mask shuffle: elements 0-3 come from zeroinitializer, 4-7 from %val1.
71  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
72  ret <8 x double> %res
73}
74
; Loads scalar doubles at indices 2, 3 and 9 of %ptr and inserts them into
; lanes 0, 1 and 7 of an otherwise undef <8 x double>.
; Both runs expect the three scalar loads to be replaced by one unaligned
; 64-byte load (vmovups) from byte offset 16.
75define <8 x double> @merge_8f64_f64_23uuuuu9(ptr %ptr) nounwind uwtable noinline ssp {
76; ALL-LABEL: merge_8f64_f64_23uuuuu9:
77; ALL:       # %bb.0:
78; ALL-NEXT:    vmovups 16(%rdi), %zmm0
79; ALL-NEXT:    retq
80;
81; X86-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
82; X86-AVX512F:       # %bb.0:
83; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
84; X86-AVX512F-NEXT:    vmovups 16(%eax), %zmm0
85; X86-AVX512F-NEXT:    retl
86  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 2
87  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 3
88  %ptr7 = getelementptr inbounds double, ptr %ptr, i64 9
89  %val0 = load double, ptr %ptr0
90  %val1 = load double, ptr %ptr1
91  %val7 = load double, ptr %ptr7
; Lanes 2-6 are never written and therefore stay undef.
92  %res0 = insertelement <8 x double> undef, double %val0, i32 0
93  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
94  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
95  ret <8 x double> %res7
96}
97
; Loads scalar doubles at indices 1 and 2 of %ptr into lanes 0 and 1 of an
; <8 x double>; lanes 2, 3, 6 and 7 are set to 0.0 and lanes 4-5 stay undef.
; Both runs expect only a 16-byte unaligned load (vmovups into xmm0) from byte
; offset 8 and no explicit zeroing instruction (VEX-encoded xmm loads clear the
; upper bits of the destination vector register).
98define <8 x double> @merge_8f64_f64_12zzuuzz(ptr %ptr) nounwind uwtable noinline ssp {
99; ALL-LABEL: merge_8f64_f64_12zzuuzz:
100; ALL:       # %bb.0:
101; ALL-NEXT:    vmovups 8(%rdi), %xmm0
102; ALL-NEXT:    retq
103;
104; X86-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
105; X86-AVX512F:       # %bb.0:
106; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
107; X86-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
108; X86-AVX512F-NEXT:    retl
109  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 1
110  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2
111  %val0 = load double, ptr %ptr0
112  %val1 = load double, ptr %ptr1
113  %res0 = insertelement <8 x double> undef, double %val0, i32 0
114  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
115  %res2 = insertelement <8 x double> %res1, double   0.0, i32 2
116  %res3 = insertelement <8 x double> %res2, double   0.0, i32 3
117  %res6 = insertelement <8 x double> %res3, double   0.0, i32 6
118  %res7 = insertelement <8 x double> %res6, double   0.0, i32 7
119  ret <8 x double> %res7
120}
121
; Loads scalar doubles at indices 1, 3, 5 and 8 of %ptr into lanes 0, 2, 4 and
; 7 of an <8 x double>; lane 5 is set to 0.0 and lanes 1, 3 and 6 stay undef.
; Both runs expect one unaligned 64-byte load (vmovdqu64) from byte offset 8
; followed by a vpandq with a constant-pool operand (presumably the mask that
; clears the lane required to be zero).
122define <8 x double> @merge_8f64_f64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ssp {
123; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
124; ALL:       # %bb.0:
125; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
126; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
127; ALL-NEXT:    retq
128;
129; X86-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
130; X86-AVX512F:       # %bb.0:
131; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
132; X86-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
133; X86-AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
134; X86-AVX512F-NEXT:    retl
135  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 1
136  %ptr2 = getelementptr inbounds double, ptr %ptr, i64 3
137  %ptr4 = getelementptr inbounds double, ptr %ptr, i64 5
138  %ptr7 = getelementptr inbounds double, ptr %ptr, i64 8
139  %val0 = load double, ptr %ptr0
140  %val2 = load double, ptr %ptr2
141  %val4 = load double, ptr %ptr4
142  %val7 = load double, ptr %ptr7
143  %res0 = insertelement <8 x double> undef, double %val0, i32 0
144  %res2 = insertelement <8 x double> %res0, double %val2, i32 2
145  %res4 = insertelement <8 x double> %res2, double %val4, i32 4
146  %res5 = insertelement <8 x double> %res4, double   0.0, i32 5
147  %res7 = insertelement <8 x double> %res5, double %val7, i32 7
148  ret <8 x double> %res7
149}
150
; Builds an <8 x i64> whose lower half is zero and whose upper half is the
; <4 x i64> loaded from vector index 3 of %ptr (byte offset 96 in the expected
; output). Both runs expect a zeroing vxorps followed by a vinsertf64x4 that
; places the loaded 256 bits into the upper half.
151define <8 x i64> @merge_8i64_4i64_z3(ptr %ptr) nounwind uwtable noinline ssp {
152; ALL-LABEL: merge_8i64_4i64_z3:
153; ALL:       # %bb.0:
154; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
155; ALL-NEXT:    vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0
156; ALL-NEXT:    retq
157;
158; X86-AVX512F-LABEL: merge_8i64_4i64_z3:
159; X86-AVX512F:       # %bb.0:
160; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
161; X86-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
162; X86-AVX512F-NEXT:    vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0
163; X86-AVX512F-NEXT:    retl
164  %ptr1 = getelementptr inbounds <4 x i64>, ptr %ptr, i64 3
165  %val1 = load <4 x i64>, ptr %ptr1
; Identity-mask shuffle: elements 0-3 come from zeroinitializer, 4-7 from %val1.
166  %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
167  ret <8 x i64> %res
168}
169
; Loads i64 values at indices 5, 6 and 9 of %ptr into lanes 0, 1 and 4 of an
; <8 x i64>; lanes 2, 3, 6 and 7 are set to 0 and lane 5 stays undef.
; Both runs expect two loads rather than one: a 16-byte vmovups from byte
; offset 40 into xmm0 for the low half, a zero-extending 8-byte vmovsd into
; xmm1 for the high half, and a vinsertf64x4 combining the two halves.
170define <8 x i64> @merge_8i64_i64_56zz9uzz(ptr %ptr) nounwind uwtable noinline ssp {
171; ALL-LABEL: merge_8i64_i64_56zz9uzz:
172; ALL:       # %bb.0:
173; ALL-NEXT:    vmovups 40(%rdi), %xmm0
174; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
175; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
176; ALL-NEXT:    retq
177;
178; X86-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
179; X86-AVX512F:       # %bb.0:
180; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
181; X86-AVX512F-NEXT:    vmovups 40(%eax), %xmm0
182; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
183; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
184; X86-AVX512F-NEXT:    retl
185  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 5
186  %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 6
187  %ptr4 = getelementptr inbounds i64, ptr %ptr, i64 9
188  %val0 = load i64, ptr %ptr0
189  %val1 = load i64, ptr %ptr1
190  %val4 = load i64, ptr %ptr4
191  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
192  %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
193  %res2 = insertelement <8 x i64> %res1, i64     0, i32 2
194  %res3 = insertelement <8 x i64> %res2, i64     0, i32 3
195  %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
196  %res6 = insertelement <8 x i64> %res4, i64     0, i32 6
197  %res7 = insertelement <8 x i64> %res6, i64     0, i32 7
198  ret <8 x i64> %res7
199}
200
; Loads i64 values at indices 1, 3, 5 and 8 of %ptr into lanes 0, 2, 4 and 7
; of an <8 x i64>; lane 5 is set to 0 and lanes 1, 3 and 6 stay undef.
; Both runs expect one unaligned 64-byte load (vmovdqu64) from byte offset 8
; followed by an AND with a constant-pool operand. Note that the recorded
; output differs between targets: the 64-bit runs use vpandq while the 32-bit
; run uses vpandd.
201define <8 x i64> @merge_8i64_i64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ssp {
202; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
203; ALL:       # %bb.0:
204; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
205; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
206; ALL-NEXT:    retq
207;
208; X86-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
209; X86-AVX512F:       # %bb.0:
210; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
211; X86-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
212; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
213; X86-AVX512F-NEXT:    retl
214  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1
215  %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 3
216  %ptr4 = getelementptr inbounds i64, ptr %ptr, i64 5
217  %ptr7 = getelementptr inbounds i64, ptr %ptr, i64 8
218  %val0 = load i64, ptr %ptr0
219  %val2 = load i64, ptr %ptr2
220  %val4 = load i64, ptr %ptr4
221  %val7 = load i64, ptr %ptr7
222  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
223  %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
224  %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
225  %res5 = insertelement <8 x i64> %res4, i64     0, i32 5
226  %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
227  ret <8 x i64> %res7
228}
229
; Loads floats at indices 8 and 9 of %ptr into lanes 0 and 1 of a
; <16 x float>; lanes 2, 3, 4 and 15 are set to 0.0 and the rest stay undef.
; Both runs expect a single 8-byte vmovsd load whose remaining xmm lanes are
; zero (shown as "mem[0],zero" in the recorded output).
230define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
231; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
232; ALL:       # %bb.0:
233; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
234; ALL-NEXT:    retq
235;
236; X86-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
237; X86-AVX512F:       # %bb.0:
238; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
239; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
240; X86-AVX512F-NEXT:    retl
241  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 8
242  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 9
243  %val0 = load float, ptr %ptr0
244  %val1 = load float, ptr %ptr1
245  %res0 = insertelement <16 x float> undef, float %val0, i32 0
246  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
247  %res2 = insertelement <16 x float> %res1, float   0.0, i32 2
248  %res3 = insertelement <16 x float> %res2, float   0.0, i32 3
249  %res4 = insertelement <16 x float> %res3, float   0.0, i32 4
250  %resF = insertelement <16 x float> %res4, float   0.0, i32 15
251  ret <16 x float> %resF
252}
253
; Loads floats at indices 4, 5 and 7 of %ptr into lanes 0, 1 and 3 of an
; otherwise undef <16 x float>.
; Both runs expect a single 16-byte unaligned load (vmovups into xmm0) from
; byte offset 16.
254define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
255; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
256; ALL:       # %bb.0:
257; ALL-NEXT:    vmovups 16(%rdi), %xmm0
258; ALL-NEXT:    retq
259;
260; X86-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
261; X86-AVX512F:       # %bb.0:
262; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
263; X86-AVX512F-NEXT:    vmovups 16(%eax), %xmm0
264; X86-AVX512F-NEXT:    retl
265  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 4
266  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 5
267  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 7
268  %val0 = load float, ptr %ptr0
269  %val1 = load float, ptr %ptr1
270  %val3 = load float, ptr %ptr3
; Lane 2 and lanes 4-15 are never written and therefore stay undef.
271  %res0 = insertelement <16 x float> undef, float %val0, i32 0
272  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
273  %res3 = insertelement <16 x float> %res1, float %val3, i32 3
274  ret <16 x float> %res3
275}
276
; Loads floats at indices 0, 3, 12, 14 and 15 of %ptr into the same-numbered
; lanes of an otherwise undef <16 x float>.
; Both runs expect the five scalar loads to be replaced by one unaligned
; 64-byte load (vmovups) from %ptr itself.
277define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(ptr %ptr) nounwind uwtable noinline ssp {
278; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
279; ALL:       # %bb.0:
280; ALL-NEXT:    vmovups (%rdi), %zmm0
281; ALL-NEXT:    retq
282;
283; X86-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
284; X86-AVX512F:       # %bb.0:
285; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
286; X86-AVX512F-NEXT:    vmovups (%eax), %zmm0
287; X86-AVX512F-NEXT:    retl
288  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
289  %ptrC = getelementptr inbounds float, ptr %ptr, i64 12
290  %ptrE = getelementptr inbounds float, ptr %ptr, i64 14
291  %ptrF = getelementptr inbounds float, ptr %ptr, i64 15
; Element 0 is loaded directly through %ptr, without a getelementptr.
292  %val0 = load float, ptr %ptr
293  %val3 = load float, ptr %ptr3
294  %valC = load float, ptr %ptrC
295  %valE = load float, ptr %ptrE
296  %valF = load float, ptr %ptrF
297  %res0 = insertelement <16 x float> undef, float %val0, i32 0
298  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
299  %resC = insertelement <16 x float> %res3, float %valC, i32 12
300  %resE = insertelement <16 x float> %resC, float %valE, i32 14
301  %resF = insertelement <16 x float> %resE, float %valF, i32 15
302  ret <16 x float> %resF
303}
304
; Loads floats at indices 0, 3, 12, 14 and 15 of %ptr into the same-numbered
; lanes of a <16 x float>; lanes 4, 5 and 13 are set to 0.0 and the remaining
; lanes stay undef.
; Both runs expect one unaligned 64-byte load (vmovdqu64) from %ptr followed by
; a vpandd with a constant-pool operand (presumably the mask that clears the
; lanes required to be zero).
; NOTE(review): read positionally, the function name puts a 'z' at lane 11 and
; a 'u' at lane 13, whereas the IR below zeroes lane 13 and leaves lane 11
; undef - confirm which was intended before relying on the name.
305define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
306; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
307; ALL:       # %bb.0:
308; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
309; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
310; ALL-NEXT:    retq
311;
312; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
313; X86-AVX512F:       # %bb.0:
314; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
315; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
316; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
317; X86-AVX512F-NEXT:    retl
318  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 3
319  %ptrC = getelementptr inbounds float, ptr %ptr, i64 12
320  %ptrE = getelementptr inbounds float, ptr %ptr, i64 14
321  %ptrF = getelementptr inbounds float, ptr %ptr, i64 15
322  %val0 = load float, ptr %ptr
323  %val3 = load float, ptr %ptr3
324  %valC = load float, ptr %ptrC
325  %valE = load float, ptr %ptrE
326  %valF = load float, ptr %ptrF
327  %res0 = insertelement <16 x float> undef, float %val0, i32 0
328  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
329  %res4 = insertelement <16 x float> %res3, float   0.0, i32 4
330  %res5 = insertelement <16 x float> %res4, float   0.0, i32 5
331  %resC = insertelement <16 x float> %res5, float %valC, i32 12
332  %resD = insertelement <16 x float> %resC, float   0.0, i32 13
333  %resE = insertelement <16 x float> %resD, float %valE, i32 14
334  %resF = insertelement <16 x float> %resE, float %valF, i32 15
335  ret <16 x float> %resF
336}
337
; Loads i32 values at indices 1 and 2 of %ptr into lanes 0 and 1 of a
; <16 x i32>; lanes 2, 3, 4 and 15 are set to 0 and the rest stay undef.
; Both runs expect a single 8-byte vmovsd load whose remaining xmm lanes are
; zero (shown as "mem[0],zero" in the recorded output).
338define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
339; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
340; ALL:       # %bb.0:
341; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
342; ALL-NEXT:    retq
343;
344; X86-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
345; X86-AVX512F:       # %bb.0:
346; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
347; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
348; X86-AVX512F-NEXT:    retl
349  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 1
350  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2
351  %val0 = load i32, ptr %ptr0
352  %val1 = load i32, ptr %ptr1
353  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
354  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
355  %res2 = insertelement <16 x i32> %res1, i32     0, i32 2
356  %res3 = insertelement <16 x i32> %res2, i32     0, i32 3
357  %res4 = insertelement <16 x i32> %res3, i32     0, i32 4
358  %resF = insertelement <16 x i32> %res4, i32     0, i32 15
359  ret <16 x i32> %resF
360}
361
; Loads i32 values at indices 2, 3 and 5 of %ptr into lanes 0, 1 and 3 of an
; otherwise undef <16 x i32>.
; Both runs expect a single 16-byte unaligned load (vmovups into xmm0) from
; byte offset 8.
362define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
363; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
364; ALL:       # %bb.0:
365; ALL-NEXT:    vmovups 8(%rdi), %xmm0
366; ALL-NEXT:    retq
367;
368; X86-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
369; X86-AVX512F:       # %bb.0:
370; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
371; X86-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
372; X86-AVX512F-NEXT:    retl
373  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
374  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
375  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
376  %val0 = load i32, ptr %ptr0
377  %val1 = load i32, ptr %ptr1
378  %val3 = load i32, ptr %ptr3
; Lane 2 and lanes 4-15 are never written and therefore stay undef.
379  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
380  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
381  %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
382  ret <16 x i32> %res3
383}
384
; Loads i32 values at indices 0, 3, 12, 14 and 15 of %ptr into the
; same-numbered lanes of an otherwise undef <16 x i32>.
; Both runs expect the five scalar loads to be replaced by one unaligned
; 64-byte load (vmovups) from %ptr itself.
385define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(ptr %ptr) nounwind uwtable noinline ssp {
386; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
387; ALL:       # %bb.0:
388; ALL-NEXT:    vmovups (%rdi), %zmm0
389; ALL-NEXT:    retq
390;
391; X86-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
392; X86-AVX512F:       # %bb.0:
393; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
394; X86-AVX512F-NEXT:    vmovups (%eax), %zmm0
395; X86-AVX512F-NEXT:    retl
396  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 3
397  %ptrC = getelementptr inbounds i32, ptr %ptr, i64 12
398  %ptrE = getelementptr inbounds i32, ptr %ptr, i64 14
399  %ptrF = getelementptr inbounds i32, ptr %ptr, i64 15
; Element 0 is loaded directly through %ptr, without a getelementptr.
400  %val0 = load i32, ptr %ptr
401  %val3 = load i32, ptr %ptr3
402  %valC = load i32, ptr %ptrC
403  %valE = load i32, ptr %ptrE
404  %valF = load i32, ptr %ptrF
405  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
406  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
407  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
408  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
409  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
410  ret <16 x i32> %resF
411}
412
; Loads i32 values at indices 0, 3, 12, 14 and 15 of %ptr into the
; same-numbered lanes of a <16 x i32>; lanes 4, 5 and 13 are set to 0 and the
; remaining lanes stay undef.
; Both runs expect one unaligned 64-byte load (vmovdqu64) from %ptr followed by
; a vpandd with a constant-pool operand (presumably the mask that clears the
; lanes required to be zero).
; NOTE(review): read positionally, the function name puts a 'z' at lane 11 and
; a 'u' at lane 13, whereas the IR below zeroes lane 13 and leaves lane 11
; undef - confirm which was intended before relying on the name.
413define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable noinline ssp {
414; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
415; ALL:       # %bb.0:
416; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
417; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
418; ALL-NEXT:    retq
419;
420; X86-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
421; X86-AVX512F:       # %bb.0:
422; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
423; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
424; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
425; X86-AVX512F-NEXT:    retl
426  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 3
427  %ptrC = getelementptr inbounds i32, ptr %ptr, i64 12
428  %ptrE = getelementptr inbounds i32, ptr %ptr, i64 14
429  %ptrF = getelementptr inbounds i32, ptr %ptr, i64 15
430  %val0 = load i32, ptr %ptr
431  %val3 = load i32, ptr %ptr3
432  %valC = load i32, ptr %ptrC
433  %valE = load i32, ptr %ptrE
434  %valF = load i32, ptr %ptrF
435  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
436  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
437  %res4 = insertelement <16 x i32> %res3, i32     0, i32 4
438  %res5 = insertelement <16 x i32> %res4, i32     0, i32 5
439  %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
440  %resD = insertelement <16 x i32> %resC, i32     0, i32 13
441  %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
442  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
443  ret <16 x i32> %resF
444}
445
; Loads i16 values at indices 1, 2 and 4 of %ptr into lanes 0, 1 and 3 of a
; <32 x i16>; lanes 30 and 31 are set to 0 and the rest stay undef.
; Both runs expect a single 8-byte vmovsd load whose remaining xmm lanes are
; zero (shown as "mem[0],zero" in the recorded output).
446define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(ptr %ptr) nounwind uwtable noinline ssp {
447; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
448; ALL:       # %bb.0:
449; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
450; ALL-NEXT:    retq
451;
452; X86-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
453; X86-AVX512F:       # %bb.0:
454; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
455; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
456; X86-AVX512F-NEXT:    retl
457  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 1
458  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 2
459  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 4
460  %val0 = load i16, ptr %ptr0
461  %val1 = load i16, ptr %ptr1
462  %val3 = load i16, ptr %ptr3
; The insertelement lane indices in this function are typed i16 rather than i32.
463  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
464  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
465  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
466  %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
467  %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
468  ret <32 x i16> %res31
469}
470
; Loads i16 values at indices 4, 5 and 7 of %ptr into lanes 0, 1 and 3 of an
; otherwise undef <32 x i16>.
; Both runs expect a single 8-byte vmovsd load (shown as "mem[0],zero" in the
; recorded output).
471define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
472; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
473; ALL:       # %bb.0:
474; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
475; ALL-NEXT:    retq
476;
477; X86-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
478; X86-AVX512F:       # %bb.0:
479; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
480; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
481; X86-AVX512F-NEXT:    retl
482  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 4
483  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 5
484  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 7
485  %val0 = load i16, ptr %ptr0
486  %val1 = load i16, ptr %ptr1
487  %val3 = load i16, ptr %ptr3
; Lane 2 and lanes 4-31 are never written and therefore stay undef.
488  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
489  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
490  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
491  ret <32 x i16> %res3
492}
493
; Loads i16 values at indices 2 and 3 of %ptr into lanes 0 and 1 of a
; <32 x i16>; lanes 3, 14, 15, 16 and 17 are set to 0 and the rest stay undef.
; Both runs expect a single 4-byte vmovss load whose remaining xmm lanes are
; zero (shown as "mem[0],zero,zero,zero" in the recorded output).
494define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
495; ALL-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
496; ALL:       # %bb.0:
497; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
498; ALL-NEXT:    retq
499;
500; X86-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
501; X86-AVX512F:       # %bb.0:
502; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
503; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
504; X86-AVX512F-NEXT:    retl
505  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 2
506  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 3
507  %val0 = load i16, ptr %ptr0
508  %val1 = load i16, ptr %ptr1
509  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
510  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
511  %res3 = insertelement <32 x i16> %res1, i16     0, i16 3
; Zero lanes 14-17 straddle the boundary between the first and second 16-lane
; halves of the result.
512  %resE = insertelement <32 x i16> %res3, i16     0, i16 14
513  %resF = insertelement <32 x i16> %resE, i16     0, i16 15
514  %resG = insertelement <32 x i16> %resF, i16     0, i16 16
515  %resH = insertelement <32 x i16> %resG, i16     0, i16 17
516  ret <32 x i16> %resH
517}
518
; Loads i8 values at indices 1, 2, 4 and 8 of %ptr into lanes 0, 1, 3 and 7 of
; a <64 x i8>; lanes 14, 15, 16, 17 and 63 are set to 0 and the rest stay
; undef.
; Both runs expect a single 8-byte vmovsd load whose remaining xmm lanes are
; zero (shown as "mem[0],zero" in the recorded output).
519define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
520; ALL-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
521; ALL:       # %bb.0:
522; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
523; ALL-NEXT:    retq
524;
525; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
526; X86-AVX512F:       # %bb.0:
527; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
528; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
529; X86-AVX512F-NEXT:    retl
530  %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 1
531  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 2
532  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 4
533  %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 8
534  %val0 = load i8, ptr %ptr0
535  %val1 = load i8, ptr %ptr1
536  %val3 = load i8, ptr %ptr3
537  %val7 = load i8, ptr %ptr7
; The insertelement lane indices in this function are typed i8 rather than i32.
538  %res0  = insertelement <64 x i8> undef,  i8 %val0, i8 0
539  %res1  = insertelement <64 x i8> %res0,  i8 %val1, i8 1
540  %res3  = insertelement <64 x i8> %res1,  i8 %val3, i8 3
541  %res7  = insertelement <64 x i8> %res3,  i8 %val7, i8 7
542  %res14 = insertelement <64 x i8> %res7,  i8     0, i8 14
543  %res15 = insertelement <64 x i8> %res14, i8     0, i8 15
544  %res16 = insertelement <64 x i8> %res15, i8     0, i8 16
545  %res17 = insertelement <64 x i8> %res16, i8     0, i8 17
546  %res63 = insertelement <64 x i8> %res17, i8     0, i8 63
547  ret <64 x i8> %res63
548}
549
; Loads i8 values at indices 1, 2 and 4 of %ptr into lanes 0, 1 and 3 of a
; <64 x i8>; lanes 14, 15, 16, 17 and 63 are set to 0 and the rest stay undef.
; Both runs expect a single 4-byte vmovss load whose remaining xmm lanes are
; zero (shown as "mem[0],zero,zero,zero" in the recorded output).
550define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
551; ALL-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
552; ALL:       # %bb.0:
553; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
554; ALL-NEXT:    retq
555;
556; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
557; X86-AVX512F:       # %bb.0:
558; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
559; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
560; X86-AVX512F-NEXT:    retl
561  %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 1
562  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 2
563  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 4
564  %val0 = load i8, ptr %ptr0
565  %val1 = load i8, ptr %ptr1
566  %val3 = load i8, ptr %ptr3
; The insertelement lane indices in this function are typed i8 rather than i32.
567  %res0  = insertelement <64 x i8> undef,  i8 %val0, i8 0
568  %res1  = insertelement <64 x i8> %res0,  i8 %val1, i8 1
569  %res3  = insertelement <64 x i8> %res1,  i8 %val3, i8 3
570  %res14 = insertelement <64 x i8> %res3,  i8     0, i8 14
571  %res15 = insertelement <64 x i8> %res14, i8     0, i8 15
572  %res16 = insertelement <64 x i8> %res15, i8     0, i8 16
573  %res17 = insertelement <64 x i8> %res16, i8     0, i8 17
574  %res63 = insertelement <64 x i8> %res17, i8     0, i8 63
575  ret <64 x i8> %res63
576}
577
578;
579; Consecutive loads that include any volatile load must not be combined.
580;
581
; Same element pattern as loading doubles at indices 2, 3 and 9 of %ptr into
; lanes 0, 1 and 7 of an otherwise undef <8 x double>, except that the first
; load is volatile.
; Both runs expect the loads to stay separate (vmovsd, vmovhps, and a
; vbroadcastsd from byte offset 72) and be combined with vinsertf64x4, rather
; than being replaced by one 64-byte load.
582define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(ptr %ptr) nounwind uwtable noinline ssp {
583; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
584; ALL:       # %bb.0:
585; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
586; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
587; ALL-NEXT:    vbroadcastsd 72(%rdi), %ymm1
588; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
589; ALL-NEXT:    retq
590;
591; X86-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
592; X86-AVX512F:       # %bb.0:
593; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
594; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
595; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
596; X86-AVX512F-NEXT:    vbroadcastsd 72(%eax), %ymm1
597; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
598; X86-AVX512F-NEXT:    retl
599  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 2
600  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 3
601  %ptr7 = getelementptr inbounds double, ptr %ptr, i64 9
; Only this first load is volatile; the other two are ordinary loads.
602  %val0 = load volatile double, ptr %ptr0
603  %val1 = load double, ptr %ptr1
604  %val7 = load double, ptr %ptr7
605  %res0 = insertelement <8 x double> undef, double %val0, i32 0
606  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
607  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
608  ret <8 x double> %res7
609}
610
; Loads i32 values at indices 0, 3, 12, 14 and 15 of %ptr into the
; same-numbered lanes of an otherwise undef <16 x i32>, with every load marked
; volatile.
; Both runs expect five separate scalar loads (vmovd / vpinsrd) assembled with
; vinserti128 and vinserti64x4, rather than one 64-byte load.
611define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(ptr %ptr) nounwind uwtable noinline ssp {
612; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
613; ALL:       # %bb.0:
614; ALL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
615; ALL-NEXT:    vpinsrd $3, 12(%rdi), %xmm0, %xmm0
616; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
617; ALL-NEXT:    vpinsrd $2, 56(%rdi), %xmm1, %xmm1
618; ALL-NEXT:    vpinsrd $3, 60(%rdi), %xmm1, %xmm1
619; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
620; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
621; ALL-NEXT:    retq
622;
623; X86-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
624; X86-AVX512F:       # %bb.0:
625; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
626; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
627; X86-AVX512F-NEXT:    vpinsrd $3, 12(%eax), %xmm0, %xmm0
628; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
629; X86-AVX512F-NEXT:    vpinsrd $2, 56(%eax), %xmm1, %xmm1
630; X86-AVX512F-NEXT:    vpinsrd $3, 60(%eax), %xmm1, %xmm1
631; X86-AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
632; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
633; X86-AVX512F-NEXT:    retl
634  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 3
635  %ptrC = getelementptr inbounds i32, ptr %ptr, i64 12
636  %ptrE = getelementptr inbounds i32, ptr %ptr, i64 14
637  %ptrF = getelementptr inbounds i32, ptr %ptr, i64 15
; All five loads are volatile; element 0 is read directly through %ptr.
638  %val0 = load volatile i32, ptr %ptr
639  %val3 = load volatile i32, ptr %ptr3
640  %valC = load volatile i32, ptr %ptrC
641  %valE = load volatile i32, ptr %ptrE
642  %valF = load volatile i32, ptr %ptrF
643  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
644  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
645  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
646  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
647  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
648  ret <16 x i32> %resF
649}
650