; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse -earlycse-debug-hash | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse<memssa>' | FileCheck %s

define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
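; The ld2 reads back exactly the two vectors the preceding st2 stored to %a,
; so EarlyCSE can forward the stored values and delete the load.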
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.0.extract, ptr %a)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, ptr %a)
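; The first st2 is dead: the second st2 overwrites the same memory at %a
; before anything reads it, so EarlyCSE removes the earlier store.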
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %2, ptr %a)
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second, redundant @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
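; The two ld2 calls load from the same pointer with no intervening write, so
; the second call can simply reuse the results of the first.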
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(ptr %a, ptr %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
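; The intervening store to %b may alias %a, so EarlyCSE cannot assume the
; memory written by the st2 is still intact and must keep the ld2.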
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  store i32 0, ptr %b, align 4
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due to
; the mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0
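; The st2 writes only two vectors, so its stored values cannot satisfy an ld3
; that reads three; the load has to stay.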
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; the mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0
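; The later st2 does not cover everything the st3 wrote, so the st3 is not a
; dead store and both calls must remain.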
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %3, <4 x i32> %2, <4 x i32> %2, ptr %a)
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %2, ptr %a)
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}