xref: /llvm-project/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll (revision a2a0089ac3a5781ba74d4d319c87c9e8b46d4eda)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
7;
8; 32-bit SSE tests to make sure we do reasonable things.
9; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1
10; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
11
; Loads the doubles at element indices 2 and 3 of %ptr and inserts them into
; lanes 0 and 1 of a <2 x double>. The SSE, AVX and X86-SSE41 checks expect a
; single (v)movups from byte offset 16; the X86-SSE1 checks expect two x87
; fldl loads instead.
12define <2 x double> @merge_2f64_f64_23(ptr %ptr) nounwind uwtable noinline ssp {
13; SSE-LABEL: merge_2f64_f64_23:
14; SSE:       # %bb.0:
15; SSE-NEXT:    movups 16(%rdi), %xmm0
16; SSE-NEXT:    retq
17;
18; AVX-LABEL: merge_2f64_f64_23:
19; AVX:       # %bb.0:
20; AVX-NEXT:    vmovups 16(%rdi), %xmm0
21; AVX-NEXT:    retq
22;
23; X86-SSE1-LABEL: merge_2f64_f64_23:
24; X86-SSE1:       # %bb.0:
25; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
26; X86-SSE1-NEXT:    fldl 16(%eax)
27; X86-SSE1-NEXT:    fldl 24(%eax)
28; X86-SSE1-NEXT:    fxch %st(1)
29; X86-SSE1-NEXT:    retl
30;
31; X86-SSE41-LABEL: merge_2f64_f64_23:
32; X86-SSE41:       # %bb.0:
33; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
34; X86-SSE41-NEXT:    movups 16(%eax), %xmm0
35; X86-SSE41-NEXT:    retl
36  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 2
37  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 3
38  %val0 = load double, ptr %ptr0
39  %val1 = load double, ptr %ptr1
40  %res0 = insertelement <2 x double> undef, double %val0, i32 0
41  %res1 = insertelement <2 x double> %res0, double %val1, i32 1
42  ret <2 x double> %res1
43}
44
; Loads the i64 values at element indices 1 and 2 of %ptr and inserts them
; into lanes 0 and 1 of a <2 x i64>. The SSE, AVX and X86-SSE41 checks expect
; a single (v)movups from byte offset 8; the X86-SSE1 checks instead copy the
; 16 bytes with four 32-bit GPR loads and four 32-bit stores.
45define <2 x i64> @merge_2i64_i64_12(ptr %ptr) nounwind uwtable noinline ssp {
46; SSE-LABEL: merge_2i64_i64_12:
47; SSE:       # %bb.0:
48; SSE-NEXT:    movups 8(%rdi), %xmm0
49; SSE-NEXT:    retq
50;
51; AVX-LABEL: merge_2i64_i64_12:
52; AVX:       # %bb.0:
53; AVX-NEXT:    vmovups 8(%rdi), %xmm0
54; AVX-NEXT:    retq
55;
56; X86-SSE1-LABEL: merge_2i64_i64_12:
57; X86-SSE1:       # %bb.0:
58; X86-SSE1-NEXT:    pushl %edi
59; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
60; X86-SSE1-NEXT:    pushl %esi
61; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
62; X86-SSE1-NEXT:    .cfi_offset %esi, -12
63; X86-SSE1-NEXT:    .cfi_offset %edi, -8
64; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
65; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
66; X86-SSE1-NEXT:    movl 8(%ecx), %edx
67; X86-SSE1-NEXT:    movl 12(%ecx), %esi
68; X86-SSE1-NEXT:    movl 16(%ecx), %edi
69; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
70; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
71; X86-SSE1-NEXT:    movl %edi, 8(%eax)
72; X86-SSE1-NEXT:    movl %esi, 4(%eax)
73; X86-SSE1-NEXT:    movl %edx, (%eax)
74; X86-SSE1-NEXT:    popl %esi
75; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
76; X86-SSE1-NEXT:    popl %edi
77; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
78; X86-SSE1-NEXT:    retl $4
79;
80; X86-SSE41-LABEL: merge_2i64_i64_12:
81; X86-SSE41:       # %bb.0:
82; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
83; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
84; X86-SSE41-NEXT:    retl
85  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1
86  %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2
87  %val0 = load i64, ptr %ptr0
88  %val1 = load i64, ptr %ptr1
89  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
90  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
91  ret <2 x i64> %res1
92}
93
; Loads the floats at element indices 2, 3, 4 and 5 of %ptr and inserts them
; into lanes 0-3 of a <4 x float>. Every check prefix (SSE, AVX, X86-SSE)
; expects a single (v)movups from byte offset 8.
94define <4 x float> @merge_4f32_f32_2345(ptr %ptr) nounwind uwtable noinline ssp {
95; SSE-LABEL: merge_4f32_f32_2345:
96; SSE:       # %bb.0:
97; SSE-NEXT:    movups 8(%rdi), %xmm0
98; SSE-NEXT:    retq
99;
100; AVX-LABEL: merge_4f32_f32_2345:
101; AVX:       # %bb.0:
102; AVX-NEXT:    vmovups 8(%rdi), %xmm0
103; AVX-NEXT:    retq
104;
105; X86-SSE-LABEL: merge_4f32_f32_2345:
106; X86-SSE:       # %bb.0:
107; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
108; X86-SSE-NEXT:    movups 8(%eax), %xmm0
109; X86-SSE-NEXT:    retl
110  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 2
111  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 3
112  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 4
113  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 5
114  %val0 = load float, ptr %ptr0
115  %val1 = load float, ptr %ptr1
116  %val2 = load float, ptr %ptr2
117  %val3 = load float, ptr %ptr3
118  %res0 = insertelement <4 x float> undef, float %val0, i32 0
119  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
120  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
121  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
122  ret <4 x float> %res3
123}
124
; Inserts the float loaded from element index 3 of %ptr into lane 0 and the
; constant 0.0 into lane 1; lanes 2 and 3 are never written and stay undef.
; Every check prefix expects a single (v)movss load whose asm comment shows
; the remaining lanes as zero.
125define <4 x float> @merge_4f32_f32_3zuu(ptr %ptr) nounwind uwtable noinline ssp {
126; SSE-LABEL: merge_4f32_f32_3zuu:
127; SSE:       # %bb.0:
128; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
129; SSE-NEXT:    retq
130;
131; AVX-LABEL: merge_4f32_f32_3zuu:
132; AVX:       # %bb.0:
133; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
134; AVX-NEXT:    retq
135;
136; X86-SSE-LABEL: merge_4f32_f32_3zuu:
137; X86-SSE:       # %bb.0:
138; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
139; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
140; X86-SSE-NEXT:    retl
141  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
142  %val0 = load float, ptr %ptr0
143  %res0 = insertelement <4 x float> undef, float %val0, i32 0
144  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
145  ret <4 x float> %res1
146}
147
; Loads the floats at element indices 3 and 4 of %ptr into lanes 0 and 1 of a
; <4 x float>; lanes 2 and 3 are never written and stay undef. The SSE, AVX
; and X86-SSE41 checks expect a single (v)movsd load; the X86-SSE1 checks
; expect xorps followed by movlps.
148define <4 x float> @merge_4f32_f32_34uu(ptr %ptr) nounwind uwtable noinline ssp {
149; SSE-LABEL: merge_4f32_f32_34uu:
150; SSE:       # %bb.0:
151; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
152; SSE-NEXT:    retq
153;
154; AVX-LABEL: merge_4f32_f32_34uu:
155; AVX:       # %bb.0:
156; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
157; AVX-NEXT:    retq
158;
159; X86-SSE1-LABEL: merge_4f32_f32_34uu:
160; X86-SSE1:       # %bb.0:
161; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
162; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
163; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
164; X86-SSE1-NEXT:    retl
165;
166; X86-SSE41-LABEL: merge_4f32_f32_34uu:
167; X86-SSE41:       # %bb.0:
168; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
169; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
170; X86-SSE41-NEXT:    retl
171  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
172  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 4
173  %val0 = load float, ptr %ptr0
174  %val1 = load float, ptr %ptr1
175  %res0 = insertelement <4 x float> undef, float %val0, i32 0
176  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
177  ret <4 x float> %res1
178}
179
; Loads the floats at element indices 3, 4 and 6 of %ptr into lanes 0, 1 and 3
; of a zeroinitializer <4 x float>, so lane 2 stays zero. The checks expect
; the data to come from byte offset 12 as one 16-byte access (a movups, or a
; memory operand folded into vblendps for AVX) and to be combined with a
; zeroed register: two shufps for SSE2 and X86-SSE1, a blendps for SSE41,
; AVX and X86-SSE41.
180define <4 x float> @merge_4f32_f32_34z6(ptr %ptr) nounwind uwtable noinline ssp {
181; SSE2-LABEL: merge_4f32_f32_34z6:
182; SSE2:       # %bb.0:
183; SSE2-NEXT:    movups 12(%rdi), %xmm0
184; SSE2-NEXT:    xorps %xmm1, %xmm1
185; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
186; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
187; SSE2-NEXT:    retq
188;
189; SSE41-LABEL: merge_4f32_f32_34z6:
190; SSE41:       # %bb.0:
191; SSE41-NEXT:    movups 12(%rdi), %xmm1
192; SSE41-NEXT:    xorps %xmm0, %xmm0
193; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
194; SSE41-NEXT:    retq
195;
196; AVX-LABEL: merge_4f32_f32_34z6:
197; AVX:       # %bb.0:
198; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
199; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
200; AVX-NEXT:    retq
201;
202; X86-SSE1-LABEL: merge_4f32_f32_34z6:
203; X86-SSE1:       # %bb.0:
204; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
205; X86-SSE1-NEXT:    movups 12(%eax), %xmm0
206; X86-SSE1-NEXT:    xorps %xmm1, %xmm1
207; X86-SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
208; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
209; X86-SSE1-NEXT:    retl
210;
211; X86-SSE41-LABEL: merge_4f32_f32_34z6:
212; X86-SSE41:       # %bb.0:
213; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
214; X86-SSE41-NEXT:    movups 12(%eax), %xmm1
215; X86-SSE41-NEXT:    xorps %xmm0, %xmm0
216; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
217; X86-SSE41-NEXT:    retl
218  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
219  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 4
220  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 6
221  %val0 = load float, ptr %ptr0
222  %val1 = load float, ptr %ptr1
223  %val3 = load float, ptr %ptr3
224  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
225  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
226  %res3 = insertelement <4 x float> %res1, float %val3, i32 3
227  ret <4 x float> %res3
228}
229
; Loads the floats at element indices 4 and 5 of %ptr into lanes 0 and 1 of a
; zeroinitializer <4 x float>, so lanes 2 and 3 stay zero. The SSE, AVX and
; X86-SSE41 checks expect a single (v)movsd load; the X86-SSE1 checks expect
; xorps followed by movlps.
230define <4 x float> @merge_4f32_f32_45zz(ptr %ptr) nounwind uwtable noinline ssp {
231; SSE-LABEL: merge_4f32_f32_45zz:
232; SSE:       # %bb.0:
233; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
234; SSE-NEXT:    retq
235;
236; AVX-LABEL: merge_4f32_f32_45zz:
237; AVX:       # %bb.0:
238; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
239; AVX-NEXT:    retq
240;
241; X86-SSE1-LABEL: merge_4f32_f32_45zz:
242; X86-SSE1:       # %bb.0:
243; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
244; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
245; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
246; X86-SSE1-NEXT:    retl
247;
248; X86-SSE41-LABEL: merge_4f32_f32_45zz:
249; X86-SSE41:       # %bb.0:
250; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
251; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
252; X86-SSE41-NEXT:    retl
253  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 4
254  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 5
255  %val0 = load float, ptr %ptr0
256  %val1 = load float, ptr %ptr1
257  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
258  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
259  ret <4 x float> %res1
260}
261
; Loads the floats at element indices 0, 1 and 2 of %ptr into lanes 0-2 of a
; <4 x float>; lane 3 is explicitly set to undef. The checks do not expect a
; single 16-byte load here: they expect a 64-bit load for lanes 0-1 (movsd,
; or xorps+movlps on X86-SSE1) plus a separate scalar load for lane 2
; (movss+movlhps, or (v)insertps from memory on SSE41/AVX/X86-SSE41).
262define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp {
263; SSE2-LABEL: merge_4f32_f32_012u:
264; SSE2:       # %bb.0:
265; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
266; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
267; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
268; SSE2-NEXT:    retq
269;
270; SSE41-LABEL: merge_4f32_f32_012u:
271; SSE41:       # %bb.0:
272; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
273; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
274; SSE41-NEXT:    retq
275;
276; AVX-LABEL: merge_4f32_f32_012u:
277; AVX:       # %bb.0:
278; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
279; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
280; AVX-NEXT:    retq
281;
282; X86-SSE1-LABEL: merge_4f32_f32_012u:
283; X86-SSE1:       # %bb.0:
284; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
285; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
286; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
287; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
288; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
289; X86-SSE1-NEXT:    retl
290;
291; X86-SSE41-LABEL: merge_4f32_f32_012u:
292; X86-SSE41:       # %bb.0:
293; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
294; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
295; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
296; X86-SSE41-NEXT:    retl
297  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
298  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 2
299  %val0 = load float, ptr %ptr
300  %val1 = load float, ptr %ptr1
301  %val2 = load float, ptr %ptr2
302  %res0 = insertelement <4 x float> undef, float %val0, i32 0
303  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
304  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
305  %res3 = insertelement <4 x float> %res2, float undef, i32 3
306  ret <4 x float> %res3
307}
308
; Loads the floats at element indices 0, 1 and 9 of %ptr into lanes 0-2 of a
; <4 x float>; lane 3 is explicitly set to undef. Index 9 is not adjacent to
; indices 0 and 1. The checks expect a 64-bit load for lanes 0-1 (movsd, or
; xorps+movlps on X86-SSE1) plus a separate scalar load for lane 2
; (movss+movlhps, or (v)insertps from memory on SSE41/AVX/X86-SSE41).
309define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp {
310; SSE2-LABEL: merge_4f32_f32_019u:
311; SSE2:       # %bb.0:
312; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
313; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
314; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
315; SSE2-NEXT:    retq
316;
317; SSE41-LABEL: merge_4f32_f32_019u:
318; SSE41:       # %bb.0:
319; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
320; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
321; SSE41-NEXT:    retq
322;
323; AVX-LABEL: merge_4f32_f32_019u:
324; AVX:       # %bb.0:
325; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
326; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
327; AVX-NEXT:    retq
328;
329; X86-SSE1-LABEL: merge_4f32_f32_019u:
330; X86-SSE1:       # %bb.0:
331; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
332; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
333; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
334; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
335; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
336; X86-SSE1-NEXT:    retl
337;
338; X86-SSE41-LABEL: merge_4f32_f32_019u:
339; X86-SSE41:       # %bb.0:
340; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
341; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
342; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
343; X86-SSE41-NEXT:    retl
344  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
345  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 9
346  %val0 = load float, ptr %ptr
347  %val1 = load float, ptr %ptr1
348  %val2 = load float, ptr %ptr2
349  %res0 = insertelement <4 x float> undef, float %val0, i32 0
350  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
351  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
352  %res3 = insertelement <4 x float> %res2, float undef, i32 3
353  ret <4 x float> %res3
354}
355
; Loads the i32 values at element indices 2, 3 and 5 of %ptr into lanes 0, 1
; and 3 of a <4 x i32>; lane 2 is never written and stays undef. The SSE, AVX
; and X86-SSE41 checks expect a single (v)movups from byte offset 8; the
; X86-SSE1 checks copy only the three defined elements through GPRs.
356define <4 x i32> @merge_4i32_i32_23u5(ptr %ptr) nounwind uwtable noinline ssp {
357; SSE-LABEL: merge_4i32_i32_23u5:
358; SSE:       # %bb.0:
359; SSE-NEXT:    movups 8(%rdi), %xmm0
360; SSE-NEXT:    retq
361;
362; AVX-LABEL: merge_4i32_i32_23u5:
363; AVX:       # %bb.0:
364; AVX-NEXT:    vmovups 8(%rdi), %xmm0
365; AVX-NEXT:    retq
366;
367; X86-SSE1-LABEL: merge_4i32_i32_23u5:
368; X86-SSE1:       # %bb.0:
369; X86-SSE1-NEXT:    pushl %esi
370; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
371; X86-SSE1-NEXT:    .cfi_offset %esi, -8
372; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
373; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
374; X86-SSE1-NEXT:    movl 8(%ecx), %edx
375; X86-SSE1-NEXT:    movl 12(%ecx), %esi
376; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
377; X86-SSE1-NEXT:    movl %esi, 4(%eax)
378; X86-SSE1-NEXT:    movl %edx, (%eax)
379; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
380; X86-SSE1-NEXT:    popl %esi
381; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
382; X86-SSE1-NEXT:    retl $4
383;
384; X86-SSE41-LABEL: merge_4i32_i32_23u5:
385; X86-SSE41:       # %bb.0:
386; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
387; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
388; X86-SSE41-NEXT:    retl
389  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
390  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
391  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
392  %val0 = load i32, ptr %ptr0
393  %val1 = load i32, ptr %ptr1
394  %val3 = load i32, ptr %ptr3
395  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
396  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
397  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
398  ret <4 x i32> %res3
399}
400
; Builds lanes 0, 1 and 3 of a <4 x i32> from element indices 2, 3 and 5 of
; %ptr, but the value loaded from index 2 is incremented and stored back to
; index 2 before the loads of indices 3 and 5. The returned vector uses the
; pre-increment value. The SSE, AVX and X86-SSE41 checks expect the 16-byte
; (v)movups from byte offset 8 followed by an incl of the dword at offset 8.
401define <4 x i32> @merge_4i32_i32_23u5_inc2(ptr %ptr) nounwind uwtable noinline ssp {
402; SSE-LABEL: merge_4i32_i32_23u5_inc2:
403; SSE:       # %bb.0:
404; SSE-NEXT:    movups 8(%rdi), %xmm0
405; SSE-NEXT:    incl 8(%rdi)
406; SSE-NEXT:    retq
407;
408; AVX-LABEL: merge_4i32_i32_23u5_inc2:
409; AVX:       # %bb.0:
410; AVX-NEXT:    vmovups 8(%rdi), %xmm0
411; AVX-NEXT:    incl 8(%rdi)
412; AVX-NEXT:    retq
413;
414; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
415; X86-SSE1:       # %bb.0:
416; X86-SSE1-NEXT:    pushl %edi
417; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
418; X86-SSE1-NEXT:    pushl %esi
419; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
420; X86-SSE1-NEXT:    .cfi_offset %esi, -12
421; X86-SSE1-NEXT:    .cfi_offset %edi, -8
422; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
423; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
424; X86-SSE1-NEXT:    movl 8(%ecx), %edx
425; X86-SSE1-NEXT:    movl 12(%ecx), %esi
426; X86-SSE1-NEXT:    leal 1(%edx), %edi
427; X86-SSE1-NEXT:    movl %edi, 8(%ecx)
428; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
429; X86-SSE1-NEXT:    movl %esi, 4(%eax)
430; X86-SSE1-NEXT:    movl %edx, (%eax)
431; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
432; X86-SSE1-NEXT:    popl %esi
433; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
434; X86-SSE1-NEXT:    popl %edi
435; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
436; X86-SSE1-NEXT:    retl $4
437;
438; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
439; X86-SSE41:       # %bb.0:
440; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
441; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
442; X86-SSE41-NEXT:    incl 8(%eax)
443; X86-SSE41-NEXT:    retl
444  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
445  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
446  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
447  %val0 = load i32, ptr %ptr0
448  %inc = add i32 %val0, 1
449  store i32 %inc, ptr %ptr0
450  %val1 = load i32, ptr %ptr1
451  %val3 = load i32, ptr %ptr3
452  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
453  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
454  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
455  ret <4 x i32> %res3
456}
457
; Builds lanes 0, 1 and 3 of a <4 x i32> from element indices 2, 3 and 5 of
; %ptr, but the value loaded from index 3 is incremented and stored back to
; index 3 before the load of index 5. The returned vector uses the
; pre-increment value. The SSE, AVX and X86-SSE41 checks expect the 16-byte
; (v)movups from byte offset 8 followed by an incl of the dword at offset 12.
458define <4 x i32> @merge_4i32_i32_23u5_inc3(ptr %ptr) nounwind uwtable noinline ssp {
459; SSE-LABEL: merge_4i32_i32_23u5_inc3:
460; SSE:       # %bb.0:
461; SSE-NEXT:    movups 8(%rdi), %xmm0
462; SSE-NEXT:    incl 12(%rdi)
463; SSE-NEXT:    retq
464;
465; AVX-LABEL: merge_4i32_i32_23u5_inc3:
466; AVX:       # %bb.0:
467; AVX-NEXT:    vmovups 8(%rdi), %xmm0
468; AVX-NEXT:    incl 12(%rdi)
469; AVX-NEXT:    retq
470;
471; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
472; X86-SSE1:       # %bb.0:
473; X86-SSE1-NEXT:    pushl %edi
474; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
475; X86-SSE1-NEXT:    pushl %esi
476; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
477; X86-SSE1-NEXT:    .cfi_offset %esi, -12
478; X86-SSE1-NEXT:    .cfi_offset %edi, -8
479; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
480; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
481; X86-SSE1-NEXT:    movl 8(%ecx), %edx
482; X86-SSE1-NEXT:    movl 12(%ecx), %esi
483; X86-SSE1-NEXT:    leal 1(%esi), %edi
484; X86-SSE1-NEXT:    movl %edi, 12(%ecx)
485; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
486; X86-SSE1-NEXT:    movl %esi, 4(%eax)
487; X86-SSE1-NEXT:    movl %edx, (%eax)
488; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
489; X86-SSE1-NEXT:    popl %esi
490; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
491; X86-SSE1-NEXT:    popl %edi
492; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
493; X86-SSE1-NEXT:    retl $4
494;
495; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
496; X86-SSE41:       # %bb.0:
497; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
498; X86-SSE41-NEXT:    movups 8(%eax), %xmm0
499; X86-SSE41-NEXT:    incl 12(%eax)
500; X86-SSE41-NEXT:    retl
501  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
502  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
503  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
504  %val0 = load i32, ptr %ptr0
505  %val1 = load i32, ptr %ptr1
506  %inc = add i32 %val1, 1
507  store i32 %inc, ptr %ptr1
508  %val3 = load i32, ptr %ptr3
509  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
510  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
511  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
512  ret <4 x i32> %res3
513}
514
; Inserts the i32 loaded from element index 3 of %ptr into lane 0 and the
; constant 0 into lane 1; lanes 2 and 3 are never written and stay undef.
; The SSE, AVX and X86-SSE41 checks expect a single (v)movss load; the
; X86-SSE1 checks expect a 32-bit GPR copy plus a movl $0 store for lane 1.
515define <4 x i32> @merge_4i32_i32_3zuu(ptr %ptr) nounwind uwtable noinline ssp {
516; SSE-LABEL: merge_4i32_i32_3zuu:
517; SSE:       # %bb.0:
518; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
519; SSE-NEXT:    retq
520;
521; AVX-LABEL: merge_4i32_i32_3zuu:
522; AVX:       # %bb.0:
523; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
524; AVX-NEXT:    retq
525;
526; X86-SSE1-LABEL: merge_4i32_i32_3zuu:
527; X86-SSE1:       # %bb.0:
528; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
529; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
530; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
531; X86-SSE1-NEXT:    movl %ecx, (%eax)
532; X86-SSE1-NEXT:    movl $0, 4(%eax)
533; X86-SSE1-NEXT:    retl $4
534;
535; X86-SSE41-LABEL: merge_4i32_i32_3zuu:
536; X86-SSE41:       # %bb.0:
537; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
538; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
539; X86-SSE41-NEXT:    retl
540  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
541  %val0 = load i32, ptr %ptr0
542  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
543  %res1 = insertelement <4 x i32> %res0, i32     0, i32 1
544  ret <4 x i32> %res1
545}
546
; Loads the i32 values at element indices 3 and 4 of %ptr into lanes 0 and 1
; of a <4 x i32>; lanes 2 and 3 are never written and stay undef. The SSE,
; AVX and X86-SSE41 checks expect a single (v)movsd load; the X86-SSE1 checks
; expect two 32-bit GPR copies.
547define <4 x i32> @merge_4i32_i32_34uu(ptr %ptr) nounwind uwtable noinline ssp {
548; SSE-LABEL: merge_4i32_i32_34uu:
549; SSE:       # %bb.0:
550; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
551; SSE-NEXT:    retq
552;
553; AVX-LABEL: merge_4i32_i32_34uu:
554; AVX:       # %bb.0:
555; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
556; AVX-NEXT:    retq
557;
558; X86-SSE1-LABEL: merge_4i32_i32_34uu:
559; X86-SSE1:       # %bb.0:
560; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
561; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
562; X86-SSE1-NEXT:    movl 12(%ecx), %edx
563; X86-SSE1-NEXT:    movl 16(%ecx), %ecx
564; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
565; X86-SSE1-NEXT:    movl %edx, (%eax)
566; X86-SSE1-NEXT:    retl $4
567;
568; X86-SSE41-LABEL: merge_4i32_i32_34uu:
569; X86-SSE41:       # %bb.0:
570; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
571; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
572; X86-SSE41-NEXT:    retl
573  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
574  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 4
575  %val0 = load i32, ptr %ptr0
576  %val1 = load i32, ptr %ptr1
577  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
578  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
579  ret <4 x i32> %res1
580}
581
; Loads the i32 values at element indices 4 and 5 of %ptr into lanes 0 and 1
; of a zeroinitializer <4 x i32>, so lanes 2 and 3 stay zero. The SSE, AVX
; and X86-SSE41 checks expect a single (v)movsd load; the X86-SSE1 checks
; expect two 32-bit GPR copies plus two movl $0 stores for the upper lanes.
582define <4 x i32> @merge_4i32_i32_45zz(ptr %ptr) nounwind uwtable noinline ssp {
583; SSE-LABEL: merge_4i32_i32_45zz:
584; SSE:       # %bb.0:
585; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
586; SSE-NEXT:    retq
587;
588; AVX-LABEL: merge_4i32_i32_45zz:
589; AVX:       # %bb.0:
590; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
591; AVX-NEXT:    retq
592;
593; X86-SSE1-LABEL: merge_4i32_i32_45zz:
594; X86-SSE1:       # %bb.0:
595; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
596; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
597; X86-SSE1-NEXT:    movl 16(%ecx), %edx
598; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
599; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
600; X86-SSE1-NEXT:    movl %edx, (%eax)
601; X86-SSE1-NEXT:    movl $0, 12(%eax)
602; X86-SSE1-NEXT:    movl $0, 8(%eax)
603; X86-SSE1-NEXT:    retl $4
604;
605; X86-SSE41-LABEL: merge_4i32_i32_45zz:
606; X86-SSE41:       # %bb.0:
607; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
608; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
609; X86-SSE41-NEXT:    retl
610  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
611  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
612  %val0 = load i32, ptr %ptr0
613  %val1 = load i32, ptr %ptr1
614  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
615  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
616  ret <4 x i32> %res1
617}
618
; Builds lanes 0 and 1 of a zeroinitializer <4 x i32> from element indices 4
; and 5 of %ptr, but the value loaded from index 4 is incremented and stored
; back to index 4 before the load of index 5. The returned vector uses the
; pre-increment value. The SSE, AVX and X86-SSE41 checks expect a single
; (v)movsd load followed by an incl of the dword at byte offset 16.
619define <4 x i32> @merge_4i32_i32_45zz_inc4(ptr %ptr) nounwind uwtable noinline ssp {
620; SSE-LABEL: merge_4i32_i32_45zz_inc4:
621; SSE:       # %bb.0:
622; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
623; SSE-NEXT:    incl 16(%rdi)
624; SSE-NEXT:    retq
625;
626; AVX-LABEL: merge_4i32_i32_45zz_inc4:
627; AVX:       # %bb.0:
628; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
629; AVX-NEXT:    incl 16(%rdi)
630; AVX-NEXT:    retq
631;
632; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
633; X86-SSE1:       # %bb.0:
634; X86-SSE1-NEXT:    pushl %edi
635; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
636; X86-SSE1-NEXT:    pushl %esi
637; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
638; X86-SSE1-NEXT:    .cfi_offset %esi, -12
639; X86-SSE1-NEXT:    .cfi_offset %edi, -8
640; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
641; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
642; X86-SSE1-NEXT:    movl 16(%ecx), %edx
643; X86-SSE1-NEXT:    movl 20(%ecx), %esi
644; X86-SSE1-NEXT:    leal 1(%edx), %edi
645; X86-SSE1-NEXT:    movl %edi, 16(%ecx)
646; X86-SSE1-NEXT:    movl %esi, 4(%eax)
647; X86-SSE1-NEXT:    movl %edx, (%eax)
648; X86-SSE1-NEXT:    movl $0, 12(%eax)
649; X86-SSE1-NEXT:    movl $0, 8(%eax)
650; X86-SSE1-NEXT:    popl %esi
651; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
652; X86-SSE1-NEXT:    popl %edi
653; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
654; X86-SSE1-NEXT:    retl $4
655;
656; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
657; X86-SSE41:       # %bb.0:
658; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
659; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
660; X86-SSE41-NEXT:    incl 16(%eax)
661; X86-SSE41-NEXT:    retl
662  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
663  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
664  %val0 = load i32, ptr %ptr0
665  %inc = add i32 %val0, 1
666  store i32 %inc, ptr %ptr0
667  %val1 = load i32, ptr %ptr1
668  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
669  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
670  ret <4 x i32> %res1
671}
672
; Builds lanes 0 and 1 of a zeroinitializer <4 x i32> from element indices 4
; and 5 of %ptr; after both loads, the value from index 5 is incremented and
; stored back to index 5. The returned vector uses the pre-increment value.
; The SSE, AVX and X86-SSE41 checks expect a single (v)movsd load followed by
; an incl of the dword at byte offset 20.
673define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline ssp {
674; SSE-LABEL: merge_4i32_i32_45zz_inc5:
675; SSE:       # %bb.0:
676; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
677; SSE-NEXT:    incl 20(%rdi)
678; SSE-NEXT:    retq
679;
680; AVX-LABEL: merge_4i32_i32_45zz_inc5:
681; AVX:       # %bb.0:
682; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
683; AVX-NEXT:    incl 20(%rdi)
684; AVX-NEXT:    retq
685;
686; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
687; X86-SSE1:       # %bb.0:
688; X86-SSE1-NEXT:    pushl %edi
689; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
690; X86-SSE1-NEXT:    pushl %esi
691; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
692; X86-SSE1-NEXT:    .cfi_offset %esi, -12
693; X86-SSE1-NEXT:    .cfi_offset %edi, -8
694; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
695; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
696; X86-SSE1-NEXT:    movl 16(%ecx), %edx
697; X86-SSE1-NEXT:    movl 20(%ecx), %esi
698; X86-SSE1-NEXT:    leal 1(%esi), %edi
699; X86-SSE1-NEXT:    movl %edi, 20(%ecx)
700; X86-SSE1-NEXT:    movl %esi, 4(%eax)
701; X86-SSE1-NEXT:    movl %edx, (%eax)
702; X86-SSE1-NEXT:    movl $0, 12(%eax)
703; X86-SSE1-NEXT:    movl $0, 8(%eax)
704; X86-SSE1-NEXT:    popl %esi
705; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
706; X86-SSE1-NEXT:    popl %edi
707; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
708; X86-SSE1-NEXT:    retl $4
709;
710; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
711; X86-SSE41:       # %bb.0:
712; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
713; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
714; X86-SSE41-NEXT:    incl 20(%eax)
715; X86-SSE41-NEXT:    retl
716  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
717  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
718  %val0 = load i32, ptr %ptr0
719  %val1 = load i32, ptr %ptr1
720  %inc = add i32 %val1, 1
721  store i32 %inc, ptr %ptr1
722  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
723  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
724  ret <4 x i32> %res1
725}
726
; Loads the i16 values at element indices 2, 3, 5, 6, 7 and 9 of %ptr into
; lanes 0, 1, 3, 4, 5 and 7 of an <8 x i16>; lanes 2 and 6 are never written
; and stay undef. The SSE, AVX and X86-SSE41 checks expect a single (v)movups
; from byte offset 4; the X86-SSE1 checks expect a mix of 32-bit and 16-bit
; GPR copies.
727define <8 x i16> @merge_8i16_i16_23u567u9(ptr %ptr) nounwind uwtable noinline ssp {
728; SSE-LABEL: merge_8i16_i16_23u567u9:
729; SSE:       # %bb.0:
730; SSE-NEXT:    movups 4(%rdi), %xmm0
731; SSE-NEXT:    retq
732;
733; AVX-LABEL: merge_8i16_i16_23u567u9:
734; AVX:       # %bb.0:
735; AVX-NEXT:    vmovups 4(%rdi), %xmm0
736; AVX-NEXT:    retq
737;
738; X86-SSE1-LABEL: merge_8i16_i16_23u567u9:
739; X86-SSE1:       # %bb.0:
740; X86-SSE1-NEXT:    pushl %edi
741; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
742; X86-SSE1-NEXT:    pushl %esi
743; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
744; X86-SSE1-NEXT:    .cfi_offset %esi, -12
745; X86-SSE1-NEXT:    .cfi_offset %edi, -8
746; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
747; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
748; X86-SSE1-NEXT:    movl 4(%ecx), %edx
749; X86-SSE1-NEXT:    movl 10(%ecx), %esi
750; X86-SSE1-NEXT:    movzwl 14(%ecx), %edi
751; X86-SSE1-NEXT:    movzwl 18(%ecx), %ecx
752; X86-SSE1-NEXT:    movw %di, 10(%eax)
753; X86-SSE1-NEXT:    movw %cx, 14(%eax)
754; X86-SSE1-NEXT:    movl %esi, 6(%eax)
755; X86-SSE1-NEXT:    movl %edx, (%eax)
756; X86-SSE1-NEXT:    popl %esi
757; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
758; X86-SSE1-NEXT:    popl %edi
759; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
760; X86-SSE1-NEXT:    retl $4
761;
762; X86-SSE41-LABEL: merge_8i16_i16_23u567u9:
763; X86-SSE41:       # %bb.0:
764; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
765; X86-SSE41-NEXT:    movups 4(%eax), %xmm0
766; X86-SSE41-NEXT:    retl
767  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 2
768  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 3
769  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 5
770  %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 6
771  %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 7
772  %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 9
773  %val0 = load i16, ptr %ptr0
774  %val1 = load i16, ptr %ptr1
775  %val3 = load i16, ptr %ptr3
776  %val4 = load i16, ptr %ptr4
777  %val5 = load i16, ptr %ptr5
778  %val7 = load i16, ptr %ptr7
779  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
780  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
781  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
782  %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
783  %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
784  %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
785  ret <8 x i16> %res7
786}
787
; Loads the i16 values at element indices 3 and 4 of %ptr into lanes 0 and 1
; of an <8 x i16>; lanes 2-7 are never written and stay undef. The SSE, AVX
; and X86-SSE41 checks expect a single (v)movss load; the X86-SSE1 checks
; expect one 32-bit GPR copy from byte offset 6.
788define <8 x i16> @merge_8i16_i16_34uuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
789; SSE-LABEL: merge_8i16_i16_34uuuuuu:
790; SSE:       # %bb.0:
791; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
792; SSE-NEXT:    retq
793;
794; AVX-LABEL: merge_8i16_i16_34uuuuuu:
795; AVX:       # %bb.0:
796; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
797; AVX-NEXT:    retq
798;
799; X86-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
800; X86-SSE1:       # %bb.0:
801; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
802; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
803; X86-SSE1-NEXT:    movl 6(%ecx), %ecx
804; X86-SSE1-NEXT:    movl %ecx, (%eax)
805; X86-SSE1-NEXT:    retl $4
806;
807; X86-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
808; X86-SSE41:       # %bb.0:
809; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
810; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
811; X86-SSE41-NEXT:    retl
812  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 3
813  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 4
814  %val0 = load i16, ptr %ptr0
815  %val1 = load i16, ptr %ptr1
816  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
817  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
818  ret <8 x i16> %res1
819}
820
; Builds an <8 x i16> from the i16 elements at indices 4, 5 and 7 of %ptr
; (lanes 0, 1 and 3), leaves lane 2 undef, and sets lanes 4-7 to constant zero.
; The function name mirrors that layout: "45u7zzzz" (u = undef, z = zero).
; The x86-64 and i686 +sse4.1 checks expect a single 64-bit (v)movsd load
; whose upper half is zero (xmm0 = mem[0],zero).
; The X86-SSE1 checks expect GPR loads from byte offsets 8 and 14 plus two
; 32-bit zero stores for the upper eight bytes of the result.
821define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ssp {
822; SSE-LABEL: merge_8i16_i16_45u7zzzz:
823; SSE:       # %bb.0:
824; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
825; SSE-NEXT:    retq
826;
827; AVX-LABEL: merge_8i16_i16_45u7zzzz:
828; AVX:       # %bb.0:
829; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
830; AVX-NEXT:    retq
831;
832; X86-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
833; X86-SSE1:       # %bb.0:
834; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
835; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
836; X86-SSE1-NEXT:    movl 8(%ecx), %edx
837; X86-SSE1-NEXT:    movzwl 14(%ecx), %ecx
838; X86-SSE1-NEXT:    movw %cx, 6(%eax)
839; X86-SSE1-NEXT:    movl %edx, (%eax)
840; X86-SSE1-NEXT:    movl $0, 12(%eax)
841; X86-SSE1-NEXT:    movl $0, 8(%eax)
842; X86-SSE1-NEXT:    retl $4
843;
844; X86-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
845; X86-SSE41:       # %bb.0:
846; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
847; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
848; X86-SSE41-NEXT:    retl
; Index 6 is deliberately not loaded, so lane 2 of the result stays undef.
849  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 4
850  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 5
851  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 7
852  %val0 = load i16, ptr %ptr0
853  %val1 = load i16, ptr %ptr1
854  %val3 = load i16, ptr %ptr3
855  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
856  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
857  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
; Explicit zeros in the upper four lanes.
858  %res4 = insertelement <8 x i16> %res3, i16     0, i32 4
859  %res5 = insertelement <8 x i16> %res4, i16     0, i32 5
860  %res6 = insertelement <8 x i16> %res5, i16     0, i32 6
861  %res7 = insertelement <8 x i16> %res6, i16     0, i32 7
862  ret <8 x i16> %res7
863}
864
; Builds a <16 x i8> from bytes 0, 1, 3-13 and 15 of %ptr, each placed in the
; lane with the same index; lanes 2 and 14 are never inserted and stay undef
; (the two "u" letters in the function name).
; The x86-64 and i686 +sse4.1 checks expect one unaligned 16-byte (v)movups
; load from the base pointer.
; The X86-SSE1 checks expect the copy to go through general-purpose registers
; in 2-, 4-, 4-, 2-, 1- and 1-byte pieces that skip byte offsets 2 and 14,
; with four registers pushed on entry and popped before returning.
865define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noinline ssp {
866; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
867; SSE:       # %bb.0:
868; SSE-NEXT:    movups (%rdi), %xmm0
869; SSE-NEXT:    retq
870;
871; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
872; AVX:       # %bb.0:
873; AVX-NEXT:    vmovups (%rdi), %xmm0
874; AVX-NEXT:    retq
875;
876; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
877; X86-SSE1:       # %bb.0:
878; X86-SSE1-NEXT:    pushl %ebp
879; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
880; X86-SSE1-NEXT:    pushl %ebx
881; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
882; X86-SSE1-NEXT:    pushl %edi
883; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
884; X86-SSE1-NEXT:    pushl %esi
885; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
886; X86-SSE1-NEXT:    .cfi_offset %esi, -20
887; X86-SSE1-NEXT:    .cfi_offset %edi, -16
888; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
889; X86-SSE1-NEXT:    .cfi_offset %ebp, -8
890; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
891; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
892; X86-SSE1-NEXT:    movzwl (%ecx), %ebp
893; X86-SSE1-NEXT:    movl 3(%ecx), %esi
894; X86-SSE1-NEXT:    movl 7(%ecx), %edi
895; X86-SSE1-NEXT:    movzwl 11(%ecx), %ebx
896; X86-SSE1-NEXT:    movzbl 13(%ecx), %edx
897; X86-SSE1-NEXT:    movzbl 15(%ecx), %ecx
898; X86-SSE1-NEXT:    movb %dl, 13(%eax)
899; X86-SSE1-NEXT:    movb %cl, 15(%eax)
900; X86-SSE1-NEXT:    movw %bx, 11(%eax)
901; X86-SSE1-NEXT:    movl %edi, 7(%eax)
902; X86-SSE1-NEXT:    movl %esi, 3(%eax)
903; X86-SSE1-NEXT:    movw %bp, (%eax)
904; X86-SSE1-NEXT:    popl %esi
905; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
906; X86-SSE1-NEXT:    popl %edi
907; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
908; X86-SSE1-NEXT:    popl %ebx
909; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
910; X86-SSE1-NEXT:    popl %ebp
911; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
912; X86-SSE1-NEXT:    retl $4
913;
914; X86-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
915; X86-SSE41:       # %bb.0:
916; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
917; X86-SSE41-NEXT:    movups (%eax), %xmm0
918; X86-SSE41-NEXT:    retl
; Addresses of every byte except offsets 0 (read through %ptr itself), 2 and 14.
919  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
920  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
921  %ptr4 = getelementptr inbounds i8, ptr %ptr, i64 4
922  %ptr5 = getelementptr inbounds i8, ptr %ptr, i64 5
923  %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 6
924  %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 7
925  %ptr8 = getelementptr inbounds i8, ptr %ptr, i64 8
926  %ptr9 = getelementptr inbounds i8, ptr %ptr, i64 9
927  %ptrA = getelementptr inbounds i8, ptr %ptr, i64 10
928  %ptrB = getelementptr inbounds i8, ptr %ptr, i64 11
929  %ptrC = getelementptr inbounds i8, ptr %ptr, i64 12
930  %ptrD = getelementptr inbounds i8, ptr %ptr, i64 13
931  %ptrF = getelementptr inbounds i8, ptr %ptr, i64 15
932  %val0 = load i8, ptr %ptr
933  %val1 = load i8, ptr %ptr1
934  %val3 = load i8, ptr %ptr3
935  %val4 = load i8, ptr %ptr4
936  %val5 = load i8, ptr %ptr5
937  %val6 = load i8, ptr %ptr6
938  %val7 = load i8, ptr %ptr7
939  %val8 = load i8, ptr %ptr8
940  %val9 = load i8, ptr %ptr9
941  %valA = load i8, ptr %ptrA
942  %valB = load i8, ptr %ptrB
943  %valC = load i8, ptr %ptrC
944  %valD = load i8, ptr %ptrD
945  %valF = load i8, ptr %ptrF
; Each loaded byte goes into the lane matching its source offset.
946  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
947  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
948  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
949  %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
950  %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
951  %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
952  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
953  %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
954  %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
955  %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
956  %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
957  %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
958  %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
959  %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
960  ret <16 x i8> %resF
961}
962
; Builds a <16 x i8> with bytes 0, 1 and 3 of %ptr in lanes 0, 1 and 3,
; constant zero in lanes 6, 7, 13, 14 and 15, and every other lane left undef.
; The function name mirrors that layout (u = undef, z = zero).
; The x86-64 and i686 +sse4.1 checks expect a single 32-bit (v)movss load
; with the remaining lanes zeroed (xmm0 = mem[0],zero,zero,zero).
; The X86-SSE1 checks expect a 16-bit and an 8-bit GPR load plus explicit
; zero stores to byte offsets 6-7 and 13-15 of the result.
963define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(ptr %ptr) nounwind uwtable noinline ssp {
964; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
965; SSE:       # %bb.0:
966; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
967; SSE-NEXT:    retq
968;
969; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
970; AVX:       # %bb.0:
971; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
972; AVX-NEXT:    retq
973;
974; X86-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
975; X86-SSE1:       # %bb.0:
976; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
977; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
978; X86-SSE1-NEXT:    movzwl (%ecx), %edx
979; X86-SSE1-NEXT:    movzbl 3(%ecx), %ecx
980; X86-SSE1-NEXT:    movb %cl, 3(%eax)
981; X86-SSE1-NEXT:    movw %dx, (%eax)
982; X86-SSE1-NEXT:    movb $0, 15(%eax)
983; X86-SSE1-NEXT:    movw $0, 13(%eax)
984; X86-SSE1-NEXT:    movw $0, 6(%eax)
985; X86-SSE1-NEXT:    retl $4
986;
987; X86-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
988; X86-SSE41:       # %bb.0:
989; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
990; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
991; X86-SSE41-NEXT:    retl
; Byte 2 is not loaded, so lane 2 stays undef inside the first 32 bits.
992  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
993  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
994  %val0 = load i8, ptr %ptr
995  %val1 = load i8, ptr %ptr1
996  %val3 = load i8, ptr %ptr3
997  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
998  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
999  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
; Explicit zero lanes; lanes 4-5 and 8-12 are left undef.
1000  %res6 = insertelement <16 x i8> %res3, i8     0, i32 6
1001  %res7 = insertelement <16 x i8> %res6, i8     0, i32 7
1002  %resD = insertelement <16 x i8> %res7, i8     0, i32 13
1003  %resE = insertelement <16 x i8> %resD, i8     0, i32 14
1004  %resF = insertelement <16 x i8> %resE, i8     0, i32 15
1005  ret <16 x i8> %resF
1006}
1007
; Builds a <16 x i8> with bytes 0-3, 6 and 7 of %ptr in the lanes of the same
; index, constant zero in lanes 13-15, and every other lane left undef.
; The function name mirrors that layout (u = undef, z = zero).
; The x86-64 and i686 +sse4.1 checks expect a single 64-bit (v)movsd load
; with the upper half zeroed (xmm0 = mem[0],zero).
; The X86-SSE1 checks expect a 32-bit and a 16-bit GPR load plus zero stores
; to byte offsets 13-15 of the result.
1008define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noinline ssp {
1009; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1010; SSE:       # %bb.0:
1011; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1012; SSE-NEXT:    retq
1013;
1014; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1015; AVX:       # %bb.0:
1016; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1017; AVX-NEXT:    retq
1018;
1019; X86-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1020; X86-SSE1:       # %bb.0:
1021; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1022; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1023; X86-SSE1-NEXT:    movl (%ecx), %edx
1024; X86-SSE1-NEXT:    movzwl 6(%ecx), %ecx
1025; X86-SSE1-NEXT:    movw %cx, 6(%eax)
1026; X86-SSE1-NEXT:    movl %edx, (%eax)
1027; X86-SSE1-NEXT:    movb $0, 15(%eax)
1028; X86-SSE1-NEXT:    movw $0, 13(%eax)
1029; X86-SSE1-NEXT:    retl $4
1030;
1031; X86-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1032; X86-SSE41:       # %bb.0:
1033; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1034; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1035; X86-SSE41-NEXT:    retl
; Bytes 4 and 5 are not loaded, leaving an undef gap inside the first 8 bytes.
1036  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
1037  %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 2
1038  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
1039  %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 6
1040  %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 7
1041  %val0 = load i8, ptr %ptr
1042  %val1 = load i8, ptr %ptr1
1043  %val2 = load i8, ptr %ptr2
1044  %val3 = load i8, ptr %ptr3
1045  %val6 = load i8, ptr %ptr6
1046  %val7 = load i8, ptr %ptr7
1047  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1048  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1049  %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
1050  %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
1051  %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
1052  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
; Explicit zeros in the last three lanes; lanes 8-12 are left undef.
1053  %resD = insertelement <16 x i8> %res7, i8     0, i32 13
1054  %resE = insertelement <16 x i8> %resD, i8     0, i32 14
1055  %resF = insertelement <16 x i8> %resE, i8     0, i32 15
1056  ret <16 x i8> %resF
1057}
1058
; Loads one i32 from %src, splats it across a <4 x i32>, shifts it right by
; <0, undef, undef, undef>, masks it with <-1, 0, 0, 0> and stores the result
; to %dst. After the mask only lane 0 can be non-zero.
; The x86-64 and i686 +sse4.1 checks expect the whole sequence to become one
; scalar (v)movss load (xmm0 = mem[0],zero,zero,zero) followed by an aligned
; (v)movaps store.
; The X86-SSE1 checks still contain an explicit andps with a constant whose
; lanes 1-3 are zero.
1059define void @merge_4i32_i32_combine(ptr %dst, ptr %src) {
1060; SSE-LABEL: merge_4i32_i32_combine:
1061; SSE:       # %bb.0:
1062; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1063; SSE-NEXT:    movaps %xmm0, (%rdi)
1064; SSE-NEXT:    retq
1065;
1066; AVX-LABEL: merge_4i32_i32_combine:
1067; AVX:       # %bb.0:
1068; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1069; AVX-NEXT:    vmovaps %xmm0, (%rdi)
1070; AVX-NEXT:    retq
1071;
1072; X86-SSE1-LABEL: merge_4i32_i32_combine:
1073; X86-SSE1:       # %bb.0:
1074; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1075; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1076; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1077; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
1078; X86-SSE1-NEXT:    andps %xmm0, %xmm1
1079; X86-SSE1-NEXT:    movaps %xmm1, (%eax)
1080; X86-SSE1-NEXT:    retl
1081;
1082; X86-SSE41-LABEL: merge_4i32_i32_combine:
1083; X86-SSE41:       # %bb.0:
1084; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1085; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1086; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1087; X86-SSE41-NEXT:    movaps %xmm0, (%eax)
1088; X86-SSE41-NEXT:    retl
1089 %1 = load i32, ptr %src
1090 %2 = insertelement <4 x i32> undef, i32 %1, i32 0
; A zeroinitializer shuffle mask broadcasts lane 0 to all four lanes.
1091 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
; Lane 0 is shifted by 0 (unchanged); the other shift amounts are undef.
1092 %4 = lshr <4 x i32> %3, <i32 0, i32 undef, i32 undef, i32 undef>
; Keep lane 0 and force lanes 1-3 to zero.
1093 %5 = and <4 x i32> %4, <i32 -1, i32 0, i32 0, i32 0>
1094 store <4 x i32> %5, ptr %dst
1095 ret void
1096}
1097
1098;
1099; Consecutive loads must not be combined if any (or all) of them are volatile.
1100;
1101
; Builds a <2 x i64> from the i64 elements at indices 1 and 2 of %ptr, where
; both loads are volatile.
; The checks expect the two loads to stay separate instead of becoming one
; 16-byte load. The x86-64 checks use two (v)movsd loads joined by
; (v)movlhps; the X86-SSE1 checks use four 32-bit GPR loads; the X86-SSE41
; checks use a movd followed by three pinsrd.
1102define <2 x i64> @merge_2i64_i64_12_volatile(ptr %ptr) nounwind uwtable noinline ssp {
1103; SSE-LABEL: merge_2i64_i64_12_volatile:
1104; SSE:       # %bb.0:
1105; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1106; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
1107; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1108; SSE-NEXT:    retq
1109;
1110; AVX-LABEL: merge_2i64_i64_12_volatile:
1111; AVX:       # %bb.0:
1112; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1113; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1114; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1115; AVX-NEXT:    retq
1116;
1117; X86-SSE1-LABEL: merge_2i64_i64_12_volatile:
1118; X86-SSE1:       # %bb.0:
1119; X86-SSE1-NEXT:    pushl %edi
1120; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
1121; X86-SSE1-NEXT:    pushl %esi
1122; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
1123; X86-SSE1-NEXT:    .cfi_offset %esi, -12
1124; X86-SSE1-NEXT:    .cfi_offset %edi, -8
1125; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1126; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1127; X86-SSE1-NEXT:    movl 8(%ecx), %edx
1128; X86-SSE1-NEXT:    movl 12(%ecx), %esi
1129; X86-SSE1-NEXT:    movl 16(%ecx), %edi
1130; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
1131; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
1132; X86-SSE1-NEXT:    movl %edi, 8(%eax)
1133; X86-SSE1-NEXT:    movl %esi, 4(%eax)
1134; X86-SSE1-NEXT:    movl %edx, (%eax)
1135; X86-SSE1-NEXT:    popl %esi
1136; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
1137; X86-SSE1-NEXT:    popl %edi
1138; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
1139; X86-SSE1-NEXT:    retl $4
1140;
1141; X86-SSE41-LABEL: merge_2i64_i64_12_volatile:
1142; X86-SSE41:       # %bb.0:
1143; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1144; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1145; X86-SSE41-NEXT:    pinsrd $1, 12(%eax), %xmm0
1146; X86-SSE41-NEXT:    pinsrd $2, 16(%eax), %xmm0
1147; X86-SSE41-NEXT:    pinsrd $3, 20(%eax), %xmm0
1148; X86-SSE41-NEXT:    retl
1149  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1
1150  %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2
; Both loads are volatile, which is what this test relies on to block merging.
1151  %val0 = load volatile i64, ptr %ptr0
1152  %val1 = load volatile i64, ptr %ptr1
1153  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
1154  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
1155  ret <2 x i64> %res1
1156}
1157
; Builds a <4 x float> from the float elements at indices 2, 3, 4 and 5 of
; %ptr, where only the first load (index 2) is volatile.
; None of the check sequences contains a single 16-byte load.
; The x86-64 +sse2 and X86-SSE1 checks expect two movss loads combined with
; unpcklps, then a movhps that fills the upper two lanes from memory.
; The +sse4.1 and AVX checks expect a (v)movss followed by three (v)insertps
; with memory operands.
1158define <4 x float> @merge_4f32_f32_2345_volatile(ptr %ptr) nounwind uwtable noinline ssp {
1159; SSE2-LABEL: merge_4f32_f32_2345_volatile:
1160; SSE2:       # %bb.0:
1161; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1162; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1163; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1164; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1165; SSE2-NEXT:    retq
1166;
1167; SSE41-LABEL: merge_4f32_f32_2345_volatile:
1168; SSE41:       # %bb.0:
1169; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1170; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1171; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1172; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1173; SSE41-NEXT:    retq
1174;
1175; AVX-LABEL: merge_4f32_f32_2345_volatile:
1176; AVX:       # %bb.0:
1177; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1178; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1179; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1180; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1181; AVX-NEXT:    retq
1182;
1183; X86-SSE1-LABEL: merge_4f32_f32_2345_volatile:
1184; X86-SSE1:       # %bb.0:
1185; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1186; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1187; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1188; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1189; X86-SSE1-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1190; X86-SSE1-NEXT:    retl
1191;
1192; X86-SSE41-LABEL: merge_4f32_f32_2345_volatile:
1193; X86-SSE41:       # %bb.0:
1194; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1195; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1196; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1197; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1198; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1199; X86-SSE41-NEXT:    retl
1200  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 2
1201  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 3
1202  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 4
1203  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 5
; Only this first load is volatile; the remaining three are ordinary loads.
1204  %val0 = load volatile float, ptr %ptr0
1205  %val1 = load float, ptr %ptr1
1206  %val2 = load float, ptr %ptr2
1207  %val3 = load float, ptr %ptr3
1208  %res0 = insertelement <4 x float> undef, float %val0, i32 0
1209  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
1210  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
1211  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
1212  ret <4 x float> %res3
1213}
1214
1215;
1216; Non-consecutive test.
1217;
1218
; Builds a <4 x float> from two unrelated pointers: lane 0 is the float at
; %ptr0, lane 1 is the constant 0.0, and lanes 2 and 3 both hold the float at
; %ptr1 (hence "X0YY" in the name).
; All check sequences expect two scalar (v)movss loads combined by one
; (v)shufps that takes lanes [0,1] from the first operand and lanes [0,0]
; from the second.
1219define <4 x float> @merge_4f32_f32_X0YY(ptr %ptr0, ptr %ptr1) nounwind uwtable noinline ssp {
1220; SSE-LABEL: merge_4f32_f32_X0YY:
1221; SSE:       # %bb.0:
1222; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1223; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1224; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1225; SSE-NEXT:    retq
1226;
1227; AVX-LABEL: merge_4f32_f32_X0YY:
1228; AVX:       # %bb.0:
1229; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1230; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1231; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
1232; AVX-NEXT:    retq
1233;
1234; X86-SSE-LABEL: merge_4f32_f32_X0YY:
1235; X86-SSE:       # %bb.0:
1236; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
1237; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1238; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1239; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1240; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1241; X86-SSE-NEXT:    retl
1242  %val0 = load float, ptr %ptr0, align 4
1243  %val1 = load float, ptr %ptr1, align 4
1244  %res0 = insertelement <4 x float> undef, float %val0, i32 0
1245  %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
; The same loaded value %val1 is used for both upper lanes.
1246  %res2 = insertelement <4 x float> %res1, float %val1, i32 2
1247  %res3 = insertelement <4 x float> %res2, float %val1, i32 3
1248  ret <4 x float> %res3
1249}
1250
1251;
1252; Extension tests.
1253;
1254
1255; PR31309
; Loads an i32 from %ptr, zero-extends it to i128 and bitcasts the result to
; <4 x i32>.
; The x86-64 and i686 +sse4.1 checks expect a single scalar (v)movss load
; with the upper lanes zeroed (xmm0 = mem[0],zero,zero,zero).
; The X86-SSE1 checks expect one 32-bit GPR copy plus three 32-bit zero
; stores at byte offsets 4, 8 and 12 of the result.
1256define <4 x i32> @load_i32_zext_i128_v4i32(ptr %ptr) {
1257; SSE-LABEL: load_i32_zext_i128_v4i32:
1258; SSE:       # %bb.0:
1259; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1260; SSE-NEXT:    retq
1261;
1262; AVX-LABEL: load_i32_zext_i128_v4i32:
1263; AVX:       # %bb.0:
1264; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1265; AVX-NEXT:    retq
1266;
1267; X86-SSE1-LABEL: load_i32_zext_i128_v4i32:
1268; X86-SSE1:       # %bb.0:
1269; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
1270; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1271; X86-SSE1-NEXT:    movl (%ecx), %ecx
1272; X86-SSE1-NEXT:    movl %ecx, (%eax)
1273; X86-SSE1-NEXT:    movl $0, 12(%eax)
1274; X86-SSE1-NEXT:    movl $0, 8(%eax)
1275; X86-SSE1-NEXT:    movl $0, 4(%eax)
1276; X86-SSE1-NEXT:    retl $4
1277;
1278; X86-SSE41-LABEL: load_i32_zext_i128_v4i32:
1279; X86-SSE41:       # %bb.0:
1280; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1281; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1282; X86-SSE41-NEXT:    retl
1283  %1 = load i32, ptr %ptr
; The zext fills the upper 96 bits with zeros before the value is viewed as a vector.
1284  %2 = zext i32 %1 to i128
1285  %3 = bitcast i128 %2 to <4 x i32>
1286  ret <4 x i32> %3
1287}
1288