xref: /llvm-project/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll (revision 2f448bf509432c1a19ec46ab8cbc7353c03c6280)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2   | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx    | FileCheck %s --check-prefixes=AVX
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2   | FileCheck %s --check-prefixes=AVX
6
7; Element 0 insertion of a scalar loaded from memory into an SSE register.
8
; Loads a float from %s.addr and inserts it into element 0 of %x.
; The checks expect a scalar movss load followed by a register merge of lane 0:
; movss on the SSE2 run, blendps on the SSE4.1 run, vblendps on the AVX runs.
9define <4 x float> @insert_f32_firstelt(<4 x float> %x, ptr %s.addr) {
10; SSE2-LABEL: insert_f32_firstelt:
11; SSE2:       # %bb.0:
12; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
13; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
14; SSE2-NEXT:    retq
15;
16; SSE41-LABEL: insert_f32_firstelt:
17; SSE41:       # %bb.0:
18; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
19; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
20; SSE41-NEXT:    retq
21;
22; AVX-LABEL: insert_f32_firstelt:
23; AVX:       # %bb.0:
24; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
25; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
26; AVX-NEXT:    retq
27  %s = load float, ptr %s.addr
28  %i0 = insertelement <4 x float> %x, float %s, i32 0
29  ret <4 x float> %i0
30}
31
; Loads a double from %s.addr and inserts it into element 0 of %x.
; Every run is expected to fold the load into a single movlps/vmovlps that
; takes the low half from memory and keeps the high half of xmm0.
32define <2 x double> @insert_f64_firstelt(<2 x double> %x, ptr %s.addr) {
33; SSE-LABEL: insert_f64_firstelt:
34; SSE:       # %bb.0:
35; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
36; SSE-NEXT:    retq
37;
38; AVX-LABEL: insert_f64_firstelt:
39; AVX:       # %bb.0:
40; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
41; AVX-NEXT:    retq
42  %s = load double, ptr %s.addr
43  %i0 = insertelement <2 x double> %x, double %s, i32 0
44  ret <2 x double> %i0
45}
46
; Loads an i8 from %s.addr and inserts it into element 0 of %x.
; On the SSE2 run the checks expect a mask-and-merge sequence: a constant mask
; with a zero in byte 0 clears that byte of %x (pand), the zero-extended loaded
; byte is moved into a vector (movzbl + movd), masked with the inverted
; constant (pandn) and ORed in. The SSE4.1 and AVX runs expect a single
; pinsrb/vpinsrb with a memory operand.
47define <16 x i8> @insert_i8_firstelt(<16 x i8> %x, ptr %s.addr) {
48; SSE2-LABEL: insert_i8_firstelt:
49; SSE2:       # %bb.0:
50; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
51; SSE2-NEXT:    pand %xmm1, %xmm0
52; SSE2-NEXT:    movzbl (%rdi), %eax
53; SSE2-NEXT:    movd %eax, %xmm2
54; SSE2-NEXT:    pandn %xmm2, %xmm1
55; SSE2-NEXT:    por %xmm1, %xmm0
56; SSE2-NEXT:    retq
57;
58; SSE41-LABEL: insert_i8_firstelt:
59; SSE41:       # %bb.0:
60; SSE41-NEXT:    pinsrb $0, (%rdi), %xmm0
61; SSE41-NEXT:    retq
62;
63; AVX-LABEL: insert_i8_firstelt:
64; AVX:       # %bb.0:
65; AVX-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
66; AVX-NEXT:    retq
67  %s = load i8, ptr %s.addr
68  %i0 = insertelement <16 x i8> %x, i8 %s, i32 0
69  ret <16 x i8> %i0
70}
71
; Loads an i16 from %s.addr and inserts it into element 0 of %x.
; Every run is expected to fold the load into one pinsrw/vpinsrw $0 with a
; memory operand.
72define <8 x i16> @insert_i16_firstelt(<8 x i16> %x, ptr %s.addr) {
73; SSE-LABEL: insert_i16_firstelt:
74; SSE:       # %bb.0:
75; SSE-NEXT:    pinsrw $0, (%rdi), %xmm0
76; SSE-NEXT:    retq
77;
78; AVX-LABEL: insert_i16_firstelt:
79; AVX:       # %bb.0:
80; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
81; AVX-NEXT:    retq
82  %s = load i16, ptr %s.addr
83  %i0 = insertelement <8 x i16> %x, i16 %s, i32 0
84  ret <8 x i16> %i0
85}
86
; Loads an i32 from %s.addr and inserts it into element 0 of %x.
; The SSE2 run expects the same movss load + movss merge sequence as the
; float variant; the SSE4.1 and AVX runs expect the load folded into
; pinsrd/vpinsrd $0.
87define <4 x i32> @insert_i32_firstelt(<4 x i32> %x, ptr %s.addr) {
88; SSE2-LABEL: insert_i32_firstelt:
89; SSE2:       # %bb.0:
90; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
91; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
92; SSE2-NEXT:    retq
93;
94; SSE41-LABEL: insert_i32_firstelt:
95; SSE41:       # %bb.0:
96; SSE41-NEXT:    pinsrd $0, (%rdi), %xmm0
97; SSE41-NEXT:    retq
98;
99; AVX-LABEL: insert_i32_firstelt:
100; AVX:       # %bb.0:
101; AVX-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
102; AVX-NEXT:    retq
103  %s = load i32, ptr %s.addr
104  %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
105  ret <4 x i32> %i0
106}
107
; Loads an i64 from %s.addr and inserts it into element 0 of %x.
; The SSE2 run expects a movlps from memory into the low half of xmm0; the
; SSE4.1 and AVX runs expect the load folded into pinsrq/vpinsrq $0.
108define <2 x i64> @insert_i64_firstelt(<2 x i64> %x, ptr %s.addr) {
109; SSE2-LABEL: insert_i64_firstelt:
110; SSE2:       # %bb.0:
111; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
112; SSE2-NEXT:    retq
113;
114; SSE41-LABEL: insert_i64_firstelt:
115; SSE41:       # %bb.0:
116; SSE41-NEXT:    pinsrq $0, (%rdi), %xmm0
117; SSE41-NEXT:    retq
118;
119; AVX-LABEL: insert_i64_firstelt:
120; AVX:       # %bb.0:
121; AVX-NEXT:    vpinsrq $0, (%rdi), %xmm0, %xmm0
122; AVX-NEXT:    retq
123  %s = load i64, ptr %s.addr
124  %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
125  ret <2 x i64> %i0
126}
127
128; Element 1 insertion of a scalar loaded from memory.
129
; Loads a float from %s.addr and inserts it into element 1 of %x.
; The SSE2 run expects the result to be assembled in xmm1 (movss load, then
; movlhps + shufps) and copied back to xmm0 with movaps. The SSE4.1 and AVX
; runs expect a single insertps/vinsertps with a memory source.
130define <4 x float> @insert_f32_secondelt(<4 x float> %x, ptr %s.addr) {
131; SSE2-LABEL: insert_f32_secondelt:
132; SSE2:       # %bb.0:
133; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
134; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
135; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
136; SSE2-NEXT:    movaps %xmm1, %xmm0
137; SSE2-NEXT:    retq
138;
139; SSE41-LABEL: insert_f32_secondelt:
140; SSE41:       # %bb.0:
141; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
142; SSE41-NEXT:    retq
143;
144; AVX-LABEL: insert_f32_secondelt:
145; AVX:       # %bb.0:
146; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
147; AVX-NEXT:    retq
148  %s = load float, ptr %s.addr
149  %i0 = insertelement <4 x float> %x, float %s, i32 1
150  ret <4 x float> %i0
151}
152
; Loads a double from %s.addr and inserts it into element 1 of %x.
; Every run is expected to fold the load into a single movhps/vmovhps that
; keeps the low half of xmm0 and takes the high half from memory.
153define <2 x double> @insert_f64_secondelt(<2 x double> %x, ptr %s.addr) {
154; SSE-LABEL: insert_f64_secondelt:
155; SSE:       # %bb.0:
156; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
157; SSE-NEXT:    retq
158;
159; AVX-LABEL: insert_f64_secondelt:
160; AVX:       # %bb.0:
161; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
162; AVX-NEXT:    retq
163  %s = load double, ptr %s.addr
164  %i0 = insertelement <2 x double> %x, double %s, i32 1
165  ret <2 x double> %i0
166}
167
; Loads an i8 from %s.addr and inserts it into element 1 of %x.
; On the SSE2 run the checks expect a mask-and-merge sequence like the
; element-0 variant, except that the constant mask has its zero in byte 1 and
; the loaded byte is shifted left by 8 bits (psllw $8) before being merged.
; The SSE4.1 and AVX runs expect a single pinsrb/vpinsrb $1 from memory.
168define <16 x i8> @insert_i8_secondelt(<16 x i8> %x, ptr %s.addr) {
169; SSE2-LABEL: insert_i8_secondelt:
170; SSE2:       # %bb.0:
171; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
172; SSE2-NEXT:    pand %xmm1, %xmm0
173; SSE2-NEXT:    movzbl (%rdi), %eax
174; SSE2-NEXT:    movd %eax, %xmm2
175; SSE2-NEXT:    psllw $8, %xmm2
176; SSE2-NEXT:    pandn %xmm2, %xmm1
177; SSE2-NEXT:    por %xmm1, %xmm0
178; SSE2-NEXT:    retq
179;
180; SSE41-LABEL: insert_i8_secondelt:
181; SSE41:       # %bb.0:
182; SSE41-NEXT:    pinsrb $1, (%rdi), %xmm0
183; SSE41-NEXT:    retq
184;
185; AVX-LABEL: insert_i8_secondelt:
186; AVX:       # %bb.0:
187; AVX-NEXT:    vpinsrb $1, (%rdi), %xmm0, %xmm0
188; AVX-NEXT:    retq
189  %s = load i8, ptr %s.addr
190  %i0 = insertelement <16 x i8> %x, i8 %s, i32 1
191  ret <16 x i8> %i0
192}
193
; Loads an i16 from %s.addr and inserts it into element 1 of %x.
; Every run is expected to fold the load into one pinsrw/vpinsrw $1 with a
; memory operand.
194define <8 x i16> @insert_i16_secondelt(<8 x i16> %x, ptr %s.addr) {
195; SSE-LABEL: insert_i16_secondelt:
196; SSE:       # %bb.0:
197; SSE-NEXT:    pinsrw $1, (%rdi), %xmm0
198; SSE-NEXT:    retq
199;
200; AVX-LABEL: insert_i16_secondelt:
201; AVX:       # %bb.0:
202; AVX-NEXT:    vpinsrw $1, (%rdi), %xmm0, %xmm0
203; AVX-NEXT:    retq
204  %s = load i16, ptr %s.addr
205  %i0 = insertelement <8 x i16> %x, i16 %s, i32 1
206  ret <8 x i16> %i0
207}
208
; Loads an i32 from %s.addr and inserts it into element 1 of %x.
; The SSE2 run expects the same movss / movlhps / shufps / movaps sequence as
; the float variant; the SSE4.1 and AVX runs expect the load folded into
; pinsrd/vpinsrd $1.
209define <4 x i32> @insert_i32_secondelt(<4 x i32> %x, ptr %s.addr) {
210; SSE2-LABEL: insert_i32_secondelt:
211; SSE2:       # %bb.0:
212; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
213; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
214; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
215; SSE2-NEXT:    movaps %xmm1, %xmm0
216; SSE2-NEXT:    retq
217;
218; SSE41-LABEL: insert_i32_secondelt:
219; SSE41:       # %bb.0:
220; SSE41-NEXT:    pinsrd $1, (%rdi), %xmm0
221; SSE41-NEXT:    retq
222;
223; AVX-LABEL: insert_i32_secondelt:
224; AVX:       # %bb.0:
225; AVX-NEXT:    vpinsrd $1, (%rdi), %xmm0, %xmm0
226; AVX-NEXT:    retq
227  %s = load i32, ptr %s.addr
228  %i0 = insertelement <4 x i32> %x, i32 %s, i32 1
229  ret <4 x i32> %i0
230}
231
; Loads an i64 from %s.addr and inserts it into element 1 of %x.
; The SSE2 run expects a separate movsd load into xmm1 followed by movlhps to
; place it in the high half; the SSE4.1 and AVX runs expect the load folded
; into pinsrq/vpinsrq $1.
232define <2 x i64> @insert_i64_secondelt(<2 x i64> %x, ptr %s.addr) {
233; SSE2-LABEL: insert_i64_secondelt:
234; SSE2:       # %bb.0:
235; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
236; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
237; SSE2-NEXT:    retq
238;
239; SSE41-LABEL: insert_i64_secondelt:
240; SSE41:       # %bb.0:
241; SSE41-NEXT:    pinsrq $1, (%rdi), %xmm0
242; SSE41-NEXT:    retq
243;
244; AVX-LABEL: insert_i64_secondelt:
245; AVX:       # %bb.0:
246; AVX-NEXT:    vpinsrq $1, (%rdi), %xmm0, %xmm0
247; AVX-NEXT:    retq
248  %s = load i64, ptr %s.addr
249  %i0 = insertelement <2 x i64> %x, i64 %s, i32 1
250  ret <2 x i64> %i0
251}
252
253; Insertion of the same loaded scalar into two elements (0 and 1).
254
; Loads one float from %s.addr and inserts it into both element 0 and
; element 1 of %x.
; The checks expect a single scalar load and one shufps/vshufps that takes
; lane 0 of the loaded value twice and lanes 2,3 from %x. The SSE runs build
; the result in xmm1 and need an extra movaps to return it in xmm0.
255define <4 x float> @insert_f32_two_elts(<4 x float> %x, ptr %s.addr) {
256; SSE-LABEL: insert_f32_two_elts:
257; SSE:       # %bb.0:
258; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
259; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
260; SSE-NEXT:    movaps %xmm1, %xmm0
261; SSE-NEXT:    retq
262;
263; AVX-LABEL: insert_f32_two_elts:
264; AVX:       # %bb.0:
265; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
266; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
267; AVX-NEXT:    retq
268  %s = load float, ptr %s.addr
269  %i0 = insertelement <4 x float> %x, float %s, i32 0
270  %i1 = insertelement <4 x float> %i0, float %s, i32 1
271  ret <4 x float> %i1
272}
273
; Loads one double from %s.addr and inserts it into both elements of %x.
; Both elements of the <2 x double> are overwritten, so the expected code no
; longer reads the incoming %x and just splats the loaded value: movsd +
; movlhps on the SSE2 run, a single movddup/vmovddup from memory on the
; SSE4.1 and AVX runs.
274define <2 x double> @insert_f64_two_elts(<2 x double> %x, ptr %s.addr) {
275; SSE2-LABEL: insert_f64_two_elts:
276; SSE2:       # %bb.0:
277; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
278; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
279; SSE2-NEXT:    retq
280;
281; SSE41-LABEL: insert_f64_two_elts:
282; SSE41:       # %bb.0:
283; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
284; SSE41-NEXT:    retq
285;
286; AVX-LABEL: insert_f64_two_elts:
287; AVX:       # %bb.0:
288; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
289; AVX-NEXT:    retq
290  %s = load double, ptr %s.addr
291  %i0 = insertelement <2 x double> %x, double %s, i32 0
292  %i1 = insertelement <2 x double> %i0, double %s, i32 1
293  ret <2 x double> %i1
294}
295
; Loads one i8 from %s.addr and inserts it into both element 0 and element 1
; of %x.
; The SSE2 run expects two back-to-back mask-and-merge rounds (one per byte)
; that share a single movzbl load; the second round shifts the value left by
; 8 bits (psllw $8) to reach byte 1. The SSE4.1 and AVX runs expect the byte
; to be loaded once into eax and inserted twice with register-operand
; pinsrb/vpinsrb.
296define <16 x i8> @insert_i8_two_elts(<16 x i8> %x, ptr %s.addr) {
297; SSE2-LABEL: insert_i8_two_elts:
298; SSE2:       # %bb.0:
299; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
300; SSE2-NEXT:    pand %xmm1, %xmm0
301; SSE2-NEXT:    movzbl (%rdi), %eax
302; SSE2-NEXT:    movd %eax, %xmm2
303; SSE2-NEXT:    pandn %xmm2, %xmm1
304; SSE2-NEXT:    por %xmm1, %xmm0
305; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
306; SSE2-NEXT:    pand %xmm1, %xmm0
307; SSE2-NEXT:    psllw $8, %xmm2
308; SSE2-NEXT:    pandn %xmm2, %xmm1
309; SSE2-NEXT:    por %xmm1, %xmm0
310; SSE2-NEXT:    retq
311;
312; SSE41-LABEL: insert_i8_two_elts:
313; SSE41:       # %bb.0:
314; SSE41-NEXT:    movzbl (%rdi), %eax
315; SSE41-NEXT:    pinsrb $0, %eax, %xmm0
316; SSE41-NEXT:    pinsrb $1, %eax, %xmm0
317; SSE41-NEXT:    retq
318;
319; AVX-LABEL: insert_i8_two_elts:
320; AVX:       # %bb.0:
321; AVX-NEXT:    movzbl (%rdi), %eax
322; AVX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
323; AVX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
324; AVX-NEXT:    retq
325  %s = load i8, ptr %s.addr
326  %i0 = insertelement <16 x i8> %x, i8 %s, i32 0
327  %i1 = insertelement <16 x i8> %i0, i8 %s, i32 1
328  ret <16 x i8> %i1
329}
330
; Loads one i16 from %s.addr and inserts it into both element 0 and element 1
; of %x.
; Every run is expected to load the value once into eax (movzwl) and insert
; it twice with register-operand pinsrw/vpinsrw, rather than folding the load
; into either insert.
331define <8 x i16> @insert_i16_two_elts(<8 x i16> %x, ptr %s.addr) {
332; SSE-LABEL: insert_i16_two_elts:
333; SSE:       # %bb.0:
334; SSE-NEXT:    movzwl (%rdi), %eax
335; SSE-NEXT:    pinsrw $0, %eax, %xmm0
336; SSE-NEXT:    pinsrw $1, %eax, %xmm0
337; SSE-NEXT:    retq
338;
339; AVX-LABEL: insert_i16_two_elts:
340; AVX:       # %bb.0:
341; AVX-NEXT:    movzwl (%rdi), %eax
342; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
343; AVX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
344; AVX-NEXT:    retq
345  %s = load i16, ptr %s.addr
346  %i0 = insertelement <8 x i16> %x, i16 %s, i32 0
347  %i1 = insertelement <8 x i16> %i0, i16 %s, i32 1
348  ret <8 x i16> %i1
349}
350
; Loads one i32 from %s.addr and inserts it into both element 0 and element 1
; of %x.
; The SSE2 run expects one movl load, two movd copies into vector registers,
; then punpcklqdq + shufps to assemble the result in xmm1 and a movaps back
; to xmm0. The SSE4.1 and AVX runs expect one movl load followed by two
; register-operand pinsrd/vpinsrd.
351define <4 x i32> @insert_i32_two_elts(<4 x i32> %x, ptr %s.addr) {
352; SSE2-LABEL: insert_i32_two_elts:
353; SSE2:       # %bb.0:
354; SSE2-NEXT:    movl (%rdi), %eax
355; SSE2-NEXT:    movd %eax, %xmm2
356; SSE2-NEXT:    movd %eax, %xmm1
357; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
358; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
359; SSE2-NEXT:    movaps %xmm1, %xmm0
360; SSE2-NEXT:    retq
361;
362; SSE41-LABEL: insert_i32_two_elts:
363; SSE41:       # %bb.0:
364; SSE41-NEXT:    movl (%rdi), %eax
365; SSE41-NEXT:    pinsrd $0, %eax, %xmm0
366; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
367; SSE41-NEXT:    retq
368;
369; AVX-LABEL: insert_i32_two_elts:
370; AVX:       # %bb.0:
371; AVX-NEXT:    movl (%rdi), %eax
372; AVX-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
373; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
374; AVX-NEXT:    retq
375  %s = load i32, ptr %s.addr
376  %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
377  %i1 = insertelement <4 x i32> %i0, i32 %s, i32 1
378  ret <4 x i32> %i1
379}
380
; Loads one i64 from %s.addr and inserts it into both elements of %x.
; Both elements of the <2 x i64> are overwritten, so the expected code no
; longer reads the incoming %x and just splats the loaded value: movq load +
; pshufd on the SSE runs, a single vmovddup from memory on the AVX runs.
381define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, ptr %s.addr) {
382; SSE-LABEL: insert_i64_two_elts:
383; SSE:       # %bb.0:
384; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
385; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
386; SSE-NEXT:    retq
387;
388; AVX-LABEL: insert_i64_two_elts:
389; AVX:       # %bb.0:
390; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
391; AVX-NEXT:    retq
392  %s = load i64, ptr %s.addr
393  %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
394  %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
395  ret <2 x i64> %i1
396}
397
398; Special cases: the loaded scalar is inserted into two different vectors, or has an extra non-insert use.
399
; Loads one i32 from %s.addr and inserts it into element 0 of %x and into
; element 1 of %y, then stores the two results to %x.out.addr and %y.out.addr.
; The point of interest is that one loaded scalar feeds inserts into two
; distinct vectors: every run is expected to perform a single movl load into
; eax and reuse it for both inserts.
400define void @insert_i32_two_elts_into_different_vectors(<4 x i32> %x, <4 x i32> %y, ptr %s.addr, ptr %x.out.addr, ptr %y.out.addr) {
401; SSE2-LABEL: insert_i32_two_elts_into_different_vectors:
402; SSE2:       # %bb.0:
403; SSE2-NEXT:    movl (%rdi), %eax
404; SSE2-NEXT:    movd %eax, %xmm2
405; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
406; SSE2-NEXT:    movd %eax, %xmm2
407; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
408; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
409; SSE2-NEXT:    movaps %xmm0, (%rsi)
410; SSE2-NEXT:    movaps %xmm2, (%rdx)
411; SSE2-NEXT:    retq
412;
413; SSE41-LABEL: insert_i32_two_elts_into_different_vectors:
414; SSE41:       # %bb.0:
415; SSE41-NEXT:    movl (%rdi), %eax
416; SSE41-NEXT:    pinsrd $0, %eax, %xmm0
417; SSE41-NEXT:    pinsrd $1, %eax, %xmm1
418; SSE41-NEXT:    movdqa %xmm0, (%rsi)
419; SSE41-NEXT:    movdqa %xmm1, (%rdx)
420; SSE41-NEXT:    retq
421;
422; AVX-LABEL: insert_i32_two_elts_into_different_vectors:
423; AVX:       # %bb.0:
424; AVX-NEXT:    movl (%rdi), %eax
425; AVX-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
426; AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
427; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
428; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
429; AVX-NEXT:    retq
430  %s = load i32, ptr %s.addr
431  %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
432  %i1 = insertelement <4 x i32> %y, i32 %s, i32 1
433  store <4 x i32> %i0, ptr %x.out.addr
434  store <4 x i32> %i1, ptr %y.out.addr
435  ret void
436}
437
; Loads one float from %s.addr, stores that scalar to %s.out, and also inserts
; it into element 0 and element 1 of %x.
; The store gives the loaded scalar an extra use besides the two inserts.
; Every run is expected to load once, store the scalar with movss/vmovss, and
; then use the same shufps/vshufps [0,0],[2,3] shuffle as insert_f32_two_elts.
438define <4 x float> @insert_f32_two_elts_extrause_of_scalar(<4 x float> %x, ptr %s.addr, ptr %s.out) {
439; SSE-LABEL: insert_f32_two_elts_extrause_of_scalar:
440; SSE:       # %bb.0:
441; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
442; SSE-NEXT:    movss %xmm1, (%rsi)
443; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
444; SSE-NEXT:    movaps %xmm1, %xmm0
445; SSE-NEXT:    retq
446;
447; AVX-LABEL: insert_f32_two_elts_extrause_of_scalar:
448; AVX:       # %bb.0:
449; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
450; AVX-NEXT:    vmovss %xmm1, (%rsi)
451; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
452; AVX-NEXT:    retq
453  %s = load float, ptr %s.addr
454  store float %s, ptr %s.out
455  %i0 = insertelement <4 x float> %x, float %s, i32 0
456  %i1 = insertelement <4 x float> %i0, float %s, i32 1
457  ret <4 x float> %i1
458}
459