; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X86,X86-SSE
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,X64-SSE
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512
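;
; Stores of i16/i32 scalars and of 128-, 256- and 512-bit vectors are tested
; below, both unaligned (align 1) and naturally aligned, to check the
; FastISel-selected store instructions for SSE2, AVX and AVX-512 targets.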

define i32 @test_store_32(ptr nocapture %addr, i32 %value) nounwind {
; X86-LABEL: test_store_32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %eax, (%ecx)
; X86-NEXT:    retl
;
; X64-LABEL: test_store_32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movl %esi, (%rdi)
; X64-NEXT:    retq
entry:
  store i32 %value, ptr %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(ptr nocapture %addr, i16 %value) nounwind {
; X86-LABEL: test_store_16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movw %ax, (%ecx)
; X86-NEXT:    retl
;
; X64-LABEL: test_store_16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    movw %ax, (%rdi)
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
entry:
  store i16 %value, ptr %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) nounwind {
; X86-SSE-LABEL: test_store_4xi32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    paddd %xmm1, %xmm0
; X86-SSE-NEXT:    movdqu %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_4xi32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    paddd %xmm1, %xmm0
; X64-SSE-NEXT:    movdqu %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_4xi32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    vmovdqu %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_4xi32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, ptr %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) nounwind {
; X86-SSE-LABEL: test_store_4xi32_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    paddd %xmm1, %xmm0
; X86-SSE-NEXT:    movdqa %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_4xi32_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    paddd %xmm1, %xmm0
; X64-SSE-NEXT:    movdqa %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_4xi32_aligned:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_4xi32_aligned:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, ptr %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(ptr nocapture %addr, <4 x float> %value) nounwind {
; X86-SSE-LABEL: test_store_4xf32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movups %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_4xf32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_4xf32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_4xf32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  store <4 x float> %value, ptr %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(ptr nocapture %addr, <4 x float> %value) nounwind {
; X86-SSE-LABEL: test_store_4xf32_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_4xf32_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_4xf32_aligned:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_4xf32_aligned:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  store <4 x float> %value, ptr %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) nounwind {
; X86-SSE-LABEL: test_store_2xf64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    addpd %xmm1, %xmm0
; X86-SSE-NEXT:    movupd %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_2xf64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addpd %xmm1, %xmm0
; X64-SSE-NEXT:    movupd %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_2xf64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    vmovupd %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_2xf64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovupd %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, ptr %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) nounwind {
; X86-SSE-LABEL: test_store_2xf64_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    addpd %xmm1, %xmm0
; X86-SSE-NEXT:    movapd %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_2xf64_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addpd %xmm1, %xmm0
; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_2xf64_aligned:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    vmovapd %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_2xf64_aligned:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovapd %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, ptr %addr, align 16
  ret <2 x double> %foo
}

define <8 x i32> @test_store_8xi32(ptr nocapture %addr, <8 x i32> %value) nounwind {
; X86-SSE-LABEL: test_store_8xi32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movups %xmm0, (%eax)
; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_8xi32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups %xmm0, (%rdi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_8xi32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_8xi32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups %ymm0, (%rdi)
; X64-AVX-NEXT:    retq
  store <8 x i32> %value, ptr %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(ptr nocapture %addr, <8 x i32> %value) nounwind {
; X86-SSE-LABEL: test_store_8xi32_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_8xi32_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_8xi32_aligned:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_8xi32_aligned:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
; X64-AVX-NEXT:    retq
  store <8 x i32> %value, ptr %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(ptr nocapture %addr, <8 x float> %value) nounwind {
; X86-SSE-LABEL: test_store_8xf32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movups %xmm0, (%eax)
; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_8xf32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups %xmm0, (%rdi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_8xf32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_8xf32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups %ymm0, (%rdi)
; X64-AVX-NEXT:    retq
  store <8 x float> %value, ptr %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(ptr nocapture %addr, <8 x float> %value) nounwind {
; X86-SSE-LABEL: test_store_8xf32_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_8xf32_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_8xf32_aligned:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_8xf32_aligned:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
; X64-AVX-NEXT:    retq
  store <8 x float> %value, ptr %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) nounwind {
; X86-SSE-LABEL: test_store_4xf64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    addpd %xmm2, %xmm0
; X86-SSE-NEXT:    movupd %xmm0, (%eax)
; X86-SSE-NEXT:    addpd %xmm3, %xmm1
; X86-SSE-NEXT:    movupd %xmm1, 16(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_4xf64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addpd %xmm2, %xmm0
; X64-SSE-NEXT:    movupd %xmm0, (%rdi)
; X64-SSE-NEXT:    addpd %xmm3, %xmm1
; X64-SSE-NEXT:    movupd %xmm1, 16(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_4xf64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovupd %ymm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_4xf64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovupd %ymm0, (%rdi)
; X64-AVX-NEXT:    retq
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, ptr %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) nounwind {
; X86-SSE-LABEL: test_store_4xf64_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    addpd %xmm2, %xmm0
; X86-SSE-NEXT:    movapd %xmm0, (%eax)
; X86-SSE-NEXT:    addpd %xmm3, %xmm1
; X86-SSE-NEXT:    movapd %xmm1, 16(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_4xf64_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addpd %xmm2, %xmm0
; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
; X64-SSE-NEXT:    addpd %xmm3, %xmm1
; X64-SSE-NEXT:    movapd %xmm1, 16(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: test_store_4xf64_aligned:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovapd %ymm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: test_store_4xf64_aligned:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovapd %ymm0, (%rdi)
; X64-AVX-NEXT:    retq
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, ptr %addr, align 32
  ret <4 x double> %foo
}

define <16 x i32> @test_store_16xi32(ptr nocapture %addr, <16 x i32> %value) nounwind {
; X86-SSE-LABEL: test_store_16xi32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movups %xmm0, (%eax)
; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE-NEXT:    movups %xmm2, 32(%eax)
; X86-SSE-NEXT:    movups %xmm3, 48(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_16xi32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups %xmm0, (%rdi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
; X64-SSE-NEXT:    movups %xmm2, 32(%rdi)
; X64-SSE-NEXT:    movups %xmm3, 48(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX1-LABEL: test_store_16xi32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX1-NEXT:    vmovups %ymm1, 32(%eax)
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: test_store_16xi32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovups %ymm0, (%rdi)
; X64-AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
; X64-AVX1-NEXT:    retq
;
; X86-AVX512-LABEL: test_store_16xi32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovups %zmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX512-LABEL: test_store_16xi32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovups %zmm0, (%rdi)
; X64-AVX512-NEXT:    retq
  store <16 x i32> %value, ptr %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(ptr nocapture %addr, <16 x i32> %value) nounwind {
; X86-SSE-LABEL: test_store_16xi32_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
; X86-SSE-NEXT:    movaps %xmm2, 32(%eax)
; X86-SSE-NEXT:    movaps %xmm3, 48(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_16xi32_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
; X64-SSE-NEXT:    movaps %xmm2, 32(%rdi)
; X64-SSE-NEXT:    movaps %xmm3, 48(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX1-LABEL: test_store_16xi32_aligned:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
; X86-AVX1-NEXT:    vmovaps %ymm1, 32(%eax)
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: test_store_16xi32_aligned:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps %ymm0, (%rdi)
; X64-AVX1-NEXT:    vmovaps %ymm1, 32(%rdi)
; X64-AVX1-NEXT:    retq
;
; X86-AVX512-LABEL: test_store_16xi32_aligned:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovaps %zmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX512-LABEL: test_store_16xi32_aligned:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps %zmm0, (%rdi)
; X64-AVX512-NEXT:    retq
  store <16 x i32> %value, ptr %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(ptr nocapture %addr, <16 x float> %value) nounwind {
; X86-SSE-LABEL: test_store_16xf32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movups %xmm0, (%eax)
; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE-NEXT:    movups %xmm2, 32(%eax)
; X86-SSE-NEXT:    movups %xmm3, 48(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_16xf32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups %xmm0, (%rdi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
; X64-SSE-NEXT:    movups %xmm2, 32(%rdi)
; X64-SSE-NEXT:    movups %xmm3, 48(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX1-LABEL: test_store_16xf32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX1-NEXT:    vmovups %ymm1, 32(%eax)
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: test_store_16xf32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovups %ymm0, (%rdi)
; X64-AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
; X64-AVX1-NEXT:    retq
;
; X86-AVX512-LABEL: test_store_16xf32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovups %zmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX512-LABEL: test_store_16xf32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovups %zmm0, (%rdi)
; X64-AVX512-NEXT:    retq
  store <16 x float> %value, ptr %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(ptr nocapture %addr, <16 x float> %value) nounwind {
; X86-SSE-LABEL: test_store_16xf32_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
; X86-SSE-NEXT:    movaps %xmm2, 32(%eax)
; X86-SSE-NEXT:    movaps %xmm3, 48(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_16xf32_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
; X64-SSE-NEXT:    movaps %xmm2, 32(%rdi)
; X64-SSE-NEXT:    movaps %xmm3, 48(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX1-LABEL: test_store_16xf32_aligned:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
; X86-AVX1-NEXT:    vmovaps %ymm1, 32(%eax)
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: test_store_16xf32_aligned:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps %ymm0, (%rdi)
; X64-AVX1-NEXT:    vmovaps %ymm1, 32(%rdi)
; X64-AVX1-NEXT:    retq
;
; X86-AVX512-LABEL: test_store_16xf32_aligned:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vmovaps %zmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX512-LABEL: test_store_16xf32_aligned:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps %zmm0, (%rdi)
; X64-AVX512-NEXT:    retq
  store <16 x float> %value, ptr %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) nounwind {
; X86-SSE-LABEL: test_store_8xf64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    addpd %xmm4, %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    movupd %xmm0, (%eax)
; X86-SSE-NEXT:    addpd %xmm6, %xmm1
; X86-SSE-NEXT:    movupd %xmm1, 16(%eax)
; X86-SSE-NEXT:    addpd %xmm5, %xmm2
; X86-SSE-NEXT:    movupd %xmm2, 32(%eax)
; X86-SSE-NEXT:    movupd %xmm3, 48(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_8xf64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addpd %xmm4, %xmm0
; X64-SSE-NEXT:    movupd %xmm0, (%rdi)
; X64-SSE-NEXT:    addpd %xmm5, %xmm1
; X64-SSE-NEXT:    movupd %xmm1, 16(%rdi)
; X64-SSE-NEXT:    addpd %xmm6, %xmm2
; X64-SSE-NEXT:    movupd %xmm2, 32(%rdi)
; X64-SSE-NEXT:    addpd %xmm7, %xmm3
; X64-SSE-NEXT:    movupd %xmm3, 48(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX1-LABEL: test_store_8xf64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %ebp
; X86-AVX1-NEXT:    movl %esp, %ebp
; X86-AVX1-NEXT:    andl $-32, %esp
; X86-AVX1-NEXT:    subl $32, %esp
; X86-AVX1-NEXT:    vmovapd 40(%ebp), %ymm3
; X86-AVX1-NEXT:    movl 8(%ebp), %eax
; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovupd %ymm0, (%eax)
; X86-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X86-AVX1-NEXT:    vmovupd %ymm1, 32(%eax)
; X86-AVX1-NEXT:    movl %ebp, %esp
; X86-AVX1-NEXT:    popl %ebp
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: test_store_8xf64:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovupd %ymm0, (%rdi)
; X64-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT:    vmovupd %ymm1, 32(%rdi)
; X64-AVX1-NEXT:    retq
;
; X86-AVX512-LABEL: test_store_8xf64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; X86-AVX512-NEXT:    vmovupd %zmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX512-LABEL: test_store_8xf64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT:    vmovupd %zmm0, (%rdi)
; X64-AVX512-NEXT:    retq
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, ptr %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) nounwind {
; X86-SSE-LABEL: test_store_8xf64_aligned:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subl $12, %esp
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; X86-SSE-NEXT:    addpd %xmm4, %xmm3
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    movapd %xmm0, (%eax)
; X86-SSE-NEXT:    addpd %xmm6, %xmm1
; X86-SSE-NEXT:    movapd %xmm1, 16(%eax)
; X86-SSE-NEXT:    addpd %xmm5, %xmm2
; X86-SSE-NEXT:    movapd %xmm2, 32(%eax)
; X86-SSE-NEXT:    movapd %xmm3, 48(%eax)
; X86-SSE-NEXT:    addl $12, %esp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: test_store_8xf64_aligned:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addpd %xmm4, %xmm0
; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
; X64-SSE-NEXT:    addpd %xmm5, %xmm1
; X64-SSE-NEXT:    movapd %xmm1, 16(%rdi)
; X64-SSE-NEXT:    addpd %xmm6, %xmm2
; X64-SSE-NEXT:    movapd %xmm2, 32(%rdi)
; X64-SSE-NEXT:    addpd %xmm7, %xmm3
; X64-SSE-NEXT:    movapd %xmm3, 48(%rdi)
; X64-SSE-NEXT:    retq
;
; X86-AVX1-LABEL: test_store_8xf64_aligned:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %ebp
; X86-AVX1-NEXT:    movl %esp, %ebp
; X86-AVX1-NEXT:    andl $-32, %esp
; X86-AVX1-NEXT:    subl $32, %esp
; X86-AVX1-NEXT:    vmovapd 40(%ebp), %ymm3
; X86-AVX1-NEXT:    movl 8(%ebp), %eax
; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovapd %ymm0, (%eax)
; X86-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X86-AVX1-NEXT:    vmovapd %ymm1, 32(%eax)
; X86-AVX1-NEXT:    movl %ebp, %esp
; X86-AVX1-NEXT:    popl %ebp
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: test_store_8xf64_aligned:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovapd %ymm0, (%rdi)
; X64-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT:    vmovapd %ymm1, 32(%rdi)
; X64-AVX1-NEXT:    retq
;
; X86-AVX512-LABEL: test_store_8xf64_aligned:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; X86-AVX512-NEXT:    vmovapd %zmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX512-LABEL: test_store_8xf64_aligned:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT:    vmovapd %zmm0, (%rdi)
; X64-AVX512-NEXT:    retq
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, ptr %addr, align 64
  ret <8 x double> %foo
}