; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7

define void @load_single_128bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: load_single_128bit_elt_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    movaps %xmm1, 16(%rdx)
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_single_128bit_elt_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX-NEXT:    vmovaps %xmm0, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_single_128bit_elt_vector:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX2-NEXT:    vmovaps %xmm0, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_single_128bit_elt_vector:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX512F-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512F-NEXT:    retq
; Load one <1 x i128> element, concatenate a zero element after it
; (shufflevector with zeroinitializer), and store the 256-bit result.
  %i0 = load <16 x i8>, ptr %in, align 64
  %i1 = bitcast <16 x i8> %i0 to <1 x i128>
  %i2 = shufflevector <1 x i128> %i1, <1 x i128> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %i3 = bitcast <2 x i128> %i2 to <32 x i8>
  store <32 x i8> %i3, ptr %out, align 64
  ret void
}
define void @store_single_128bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: store_single_128bit_elt_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_single_128bit_elt_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_single_128bit_elt_vector:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: store_single_128bit_elt_vector:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512F-NEXT:    retq
; Extract the first <1 x i128> element of a 256-bit vector and store it.
  %i0 = load <32 x i8>, ptr %in, align 64
  %i1 = bitcast <32 x i8> %i0 to <2 x i128>
  %i2 = shufflevector <2 x i128> %i1, <2 x i128> poison, <1 x i32> <i32 0>
  %i3 = bitcast <1 x i128> %i2 to <16 x i8>
  store <16 x i8> %i3, ptr %out, align 64
  ret void
}
define void @load_single_256bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: load_single_256bit_elt_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movaps %xmm2, 48(%rdx)
; SSE-NEXT:    movaps %xmm2, 32(%rdx)
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    movaps %xmm1, 16(%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_single_256bit_elt_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX-NEXT:    vmovaps %ymm0, (%rdx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_single_256bit_elt_vector:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_single_256bit_elt_vector:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX512F-NEXT:    vmovaps %ymm0, (%rdx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
; Load one <1 x i256> element, concatenate a zero element after it
; (shufflevector with zeroinitializer), and store the 512-bit result.
  %i0 = load <32 x i8>, ptr %in, align 64
  %i1 = bitcast <32 x i8> %i0 to <1 x i256>
  %i2 = shufflevector <1 x i256> %i1, <1 x i256> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %i3 = bitcast <2 x i256> %i2 to <64 x i8>
  store <64 x i8> %i3, ptr %out, align 64
  ret void
}
define void @store_single_256bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: store_single_256bit_elt_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    movaps %xmm1, 16(%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_single_256bit_elt_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rdx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_single_256bit_elt_vector:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: store_single_256bit_elt_vector:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; AVX512F-NEXT:    vmovaps %ymm0, (%rdx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
; Extract the first <1 x i256> element of a 512-bit vector and store it.
  %i0 = load <64 x i8>, ptr %in, align 64
  %i1 = bitcast <64 x i8> %i0 to <2 x i256>
  %i2 = shufflevector <2 x i256> %i1, <2 x i256> poison, <1 x i32> <i32 0>
  %i3 = bitcast <1 x i256> %i2 to <32 x i8>
  store <32 x i8> %i3, ptr %out, align 64
  ret void
}
define void @load_single_512bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: load_single_512bit_elt_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    xorps %xmm4, %xmm4
; SSE-NEXT:    movaps %xmm4, 112(%rdx)
; SSE-NEXT:    movaps %xmm4, 96(%rdx)
; SSE-NEXT:    movaps %xmm4, 80(%rdx)
; SSE-NEXT:    movaps %xmm4, 64(%rdx)
; SSE-NEXT:    movaps %xmm3, 48(%rdx)
; SSE-NEXT:    movaps %xmm2, 32(%rdx)
; SSE-NEXT:    movaps %xmm1, 16(%rdx)
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: load_single_512bit_elt_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vmovaps %ymm2, 96(%rdx)
; AVX-NEXT:    vmovaps %ymm2, 64(%rdx)
; AVX-NEXT:    vmovaps %ymm0, (%rdx)
; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: load_single_512bit_elt_vector:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vmovaps %ymm2, 96(%rdx)
; AVX2-NEXT:    vmovaps %ymm2, 64(%rdx)
; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_single_512bit_elt_vector:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovaps %zmm1, 64(%rdx)
; AVX512F-NEXT:    vmovaps %zmm0, (%rdx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
; Load one <1 x i512> element, concatenate a zero element after it
; (shufflevector with zeroinitializer), and store the 1024-bit result.
  %i0 = load <64 x i8>, ptr %in, align 128
  %i1 = bitcast <64 x i8> %i0 to <1 x i512>
  %i2 = shufflevector <1 x i512> %i1, <1 x i512> zeroinitializer, <2 x i32> <i32 0, i32 1>
  %i3 = bitcast <2 x i512> %i2 to <128 x i8>
  store <128 x i8> %i3, ptr %out, align 128
  ret void
}
define void @store_single_512bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwind {
; SSE-LABEL: store_single_512bit_elt_vector:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, 48(%rdx)
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    movaps %xmm1, 16(%rdx)
; SSE-NEXT:    movaps %xmm2, 32(%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_single_512bit_elt_vector:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX-NEXT:    vmovaps %ymm0, (%rdx)
; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_single_512bit_elt_vector:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %ymm0
; AVX2-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
; AVX2-NEXT:    vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: store_single_512bit_elt_vector:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
; AVX512F-NEXT:    vmovaps %zmm0, (%rdx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
; Extract the first <1 x i512> element of a 1024-bit vector and store it.
  %i0 = load <128 x i8>, ptr %in, align 128
  %i1 = bitcast <128 x i8> %i0 to <2 x i512>
  %i2 = shufflevector <2 x i512> %i1, <2 x i512> poison, <1 x i32> <i32 0>
  %i3 = bitcast <1 x i512> %i2 to <64 x i8>
  store <64 x i8> %i3, ptr %out, align 128
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; AVX2-FAST: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; SSE2: {{.*}}
; SSE42: {{.*}}