xref: /llvm-project/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll (revision e16f2f5d2491fde19afb63d5cec83625d391be30)
1; REQUIRES: asserts
2; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CHECK-A57
3; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m3 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
4
; Test ldr clustering: the i32 loads of a[1] and a[2] are expected to be
; clustered by the machine scheduler as a pair of LDRWui.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldr_int:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(2)
; CHECK: SU(1):   %{{[0-9]+}}:gpr32 = LDRWui
; CHECK: SU(2):   %{{[0-9]+}}:gpr32 = LDRWui
define i32 @ldr_int(ptr %a) nounwind {
  %addr.lo = getelementptr inbounds i32, ptr %a, i32 1
  %val.lo = load i32, ptr %addr.lo, align 2
  %addr.hi = getelementptr inbounds i32, ptr %a, i32 2
  %val.hi = load i32, ptr %addr.hi, align 2
  %sum = add i32 %val.lo, %val.hi
  ret i32 %sum
}
19
; Test ldpsw clustering: two adjacent i32 loads, both sign-extended to i64,
; are expected to be clustered as a pair of LDRSWui.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_sext_int:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(2)
; CHECK: SU(1):   %{{[0-9]+}}:gpr64 = LDRSWui
; CHECK: SU(2):   %{{[0-9]+}}:gpr64 = LDRSWui
define i64 @ldp_sext_int(ptr %p) nounwind {
  %lo = load i32, ptr %p, align 4
  %hi.addr = getelementptr inbounds i32, ptr %p, i64 1
  %hi = load i32, ptr %hi.addr, align 4
  %lo.sext = sext i32 %lo to i64
  %hi.sext = sext i32 %hi to i64
  %sum = add nsw i64 %hi.sext, %lo.sext
  ret i64 %sum
}
35
; Test ldur clustering: the i32 loads of a[-1] and a[-2] (negative offsets)
; are expected to be clustered as a pair of LDURWi.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldur_int:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(2)
; CHECK: SU(1):   %{{[0-9]+}}:gpr32 = LDURWi
; CHECK: SU(2):   %{{[0-9]+}}:gpr32 = LDURWi
define i32 @ldur_int(ptr %a) nounwind {
  %addr.m1 = getelementptr inbounds i32, ptr %a, i32 -1
  %val.m1 = load i32, ptr %addr.m1, align 2
  %addr.m2 = getelementptr inbounds i32, ptr %a, i32 -2
  %val.m2 = load i32, ptr %addr.m2, align 2
  %sum = add i32 %val.m1, %val.m2
  ret i32 %sum
}
50
; Test sext + zext clustering: the first i32 of the pair is sign-extended and
; the second is zero-extended; the two loads (LDRSWui then LDRWui) are still
; expected to be clustered. The extra i64 load from %q precedes them.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_half_sext_zext_int:%bb.0
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3):   %{{[0-9]+}}:gpr64 = LDRSWui
; CHECK: SU(4):   undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
define i64 @ldp_half_sext_zext_int(ptr %q, ptr %p) nounwind {
  %wide = load i64, ptr %q, align 4
  %lo = load i32, ptr %p, align 4
  %hi.addr = getelementptr inbounds i32, ptr %p, i64 1
  %hi = load i32, ptr %hi.addr, align 4
  %lo.sext = sext i32 %lo to i64
  %hi.zext = zext i32 %hi to i64
  %pair.sum = add nsw i64 %hi.zext, %lo.sext
  %total = add nsw i64 %pair.sum, %wide
  ret i64 %total
}
68
; Test zext + sext clustering: the first i32 of the pair is zero-extended and
; the second is sign-extended; the two loads (LDRWui then LDRSWui) are still
; expected to be clustered. The extra i64 load from %q precedes them.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_half_zext_sext_int:%bb.0
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3):   undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
; CHECK: SU(4):   %{{[0-9]+}}:gpr64 = LDRSWui
define i64 @ldp_half_zext_sext_int(ptr %q, ptr %p) nounwind {
  %wide = load i64, ptr %q, align 4
  %lo = load i32, ptr %p, align 4
  %hi.addr = getelementptr inbounds i32, ptr %p, i64 1
  %hi = load i32, ptr %hi.addr, align 4
  %lo.zext = zext i32 %lo to i64
  %hi.sext = sext i32 %hi to i64
  %pair.sum = add nsw i64 %hi.sext, %lo.zext
  %total = add nsw i64 %pair.sum, %wide
  ret i64 %total
}
86
; Verify we don't cluster volatile loads: same shape as ldr_int, but both
; loads are volatile, so no cluster message may appear before the SUs.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldr_int_volatile:%bb.0
; CHECK-NOT: Cluster ld/st
; CHECK: SU(1):   %{{[0-9]+}}:gpr32 = LDRWui
; CHECK: SU(2):   %{{[0-9]+}}:gpr32 = LDRWui
define i32 @ldr_int_volatile(ptr %a) nounwind {
  %addr.lo = getelementptr inbounds i32, ptr %a, i32 1
  %val.lo = load volatile i32, ptr %addr.lo, align 2
  %addr.hi = getelementptr inbounds i32, ptr %a, i32 2
  %val.hi = load volatile i32, ptr %addr.hi, align 2
  %sum = add i32 %val.lo, %val.hi
  ret i32 %sum
}
101
; Test ldq clustering: two <2 x i64> loads 16 bytes apart (offset 0 and
; i64 index 2) are expected to be clustered as LDRQui even though an
; unrelated vector add sits between them in the IR.
; NOTE(review): this comment previously said "no clustering for Exynos", but
; the directives below use the prefix that both RUN lines share, so the
; exynos-m3 run is held to the same expectation - confirm which is intended.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldq_cluster:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(3)
; CHECK: SU(1):   %{{[0-9]+}}:fpr128 = LDRQui
; CHECK: SU(3):   %{{[0-9]+}}:fpr128 = LDRQui
define <2 x i64> @ldq_cluster(ptr %p) {
  ; Use the opaque `ptr` spelling like every other load in this file (the
  ; old typed-pointer form `< 2 x i64>*` was a leftover).
  %tmp1 = load <2 x i64>, ptr %p, align 8
  %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2
  %tmp2 = add nsw <2 x i64> %tmp1, %tmp1
  %tmp3 = load <2 x i64>, ptr %add.ptr2, align 8
  %res  = mul nsw <2 x i64> %tmp2, %tmp3
  ret <2 x i64> %res
}
116
; Test LDURSi / LDRSui clustering: a float load at byte offset -4 (unscaled
; form) and a float load at offset 0 (scaled form) from the same base are
; expected to be clustered.
;
; Virtual register numbers are matched with patterns rather than hard-coded;
; BASE pins both loads to the same base register.
; CHECK: ********** MI Scheduling **********
; CHECK: LDURSi_LDRSui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3):   %{{[0-9]+}}:fpr32 = LDURSi %[[BASE:[0-9]+]]:gpr64
; CHECK: SU(4):   %{{[0-9]+}}:fpr32 = LDRSui %[[BASE]]:gpr64
;
; %wa and %wb are stored through, so they must not carry `readonly`
; (writing through a readonly pointer argument is undefined behavior).
define void @LDURSi_LDRSui(ptr nocapture readonly %arg, ptr nocapture %wa, ptr nocapture %wb) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -4
  %r52 = load float, ptr %r51, align 4
  %r53 = load float, ptr %arg, align 4
  store float %r52, ptr %wa
  store float %r53, ptr %wb
  ret void
}
132
; Test LDURQi / LDRQui clustering
;
; Each loop iteration loads a <2 x double> at byte offset -16 and at offset 0
; from two pointers; for each pointer the -16 load (LDURQi) is expected to be
; clustered with the matching offset-0 load (LDRQui).
;
; CHECK: ********** MI Scheduling **********
; CHECK: LDURQi_LDRQui:%bb.1 vector_body
;
; CHECK: Cluster ld/st SU(0) - SU(4)
; CHECK: Cluster ld/st SU(1) - SU(5)
;
; CHECK: SU(0): %{{[0-9]+}}:fpr128 = LDURQi
; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDURQi
; CHECK: SU(4): %{{[0-9]+}}:fpr128 = LDRQui
; CHECK: SU(5): %{{[0-9]+}}:fpr128 = LDRQui
;
define void @LDURQi_LDRQui(ptr nocapture readonly %arg) {
entry:
  br label %vector_body
vector_body:
  %dst = phi ptr [ null, %entry ], [ %dst.next, %vector_body ]
  %src = phi ptr [ %arg, %entry ], [ %src.next, %vector_body ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %vector_body ]
  ; First half: operate on the 16 bytes just below each pointer.
  %dst.prev = getelementptr i8, ptr %dst, i64 -16
  %dst.prev.val = load <2 x double>, ptr %dst.prev, align 8
  %src.prev = getelementptr i8, ptr %src, i64 -16
  %src.prev.val = load <2 x double>, ptr %src.prev, align 8
  %scaled.prev = fmul fast <2 x double> %src.prev.val, <double 3.0, double 4.0>
  %diff.prev = fsub fast <2 x double> %dst.prev.val, %scaled.prev
  store <2 x double> %diff.prev, ptr %dst.prev, align 1
  ; Second half: same computation at offset 0.
  %dst.val = load <2 x double>, ptr %dst, align 8
  %src.val = load <2 x double>, ptr %src, align 8
  %scaled = fmul fast <2 x double> %src.val, <double 3.0, double 4.0>
  %diff = fsub fast <2 x double> %dst.val, %scaled
  store <2 x double> %diff, ptr %dst, align 1
  ; Advance the counter and both pointers (32 bytes per iteration).
  %iv.next = add i32 %iv, 4
  %src.next = getelementptr i8, ptr %src, i64 32
  %dst.next = getelementptr i8, ptr %dst, i64 32
  %done = icmp eq i32 %iv.next, 0
  br i1 %done, label %exit, label %vector_body
exit:
  ret void
}
173
; Test LDURDi / LDRDui clustering
;
; Each loop iteration loads a <2 x float> at byte offset -8 and at offset 0
; from two pointers; for each pointer the -8 load (LDURDi) is expected to be
; clustered with the matching offset-0 load (LDRDui).
;
; CHECK: ********** MI Scheduling **********
; CHECK: LDURDi_LDRDui:%bb.1 vector_body
;
; CHECK: Cluster ld/st SU(0) - SU(4)
; CHECK: Cluster ld/st SU(1) - SU(5)
;
; CHECK: SU(0): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(1): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(4): %{{[0-9]+}}:fpr64 = LDRDui
; CHECK: SU(5): %{{[0-9]+}}:fpr64 = LDRDui
;
define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
entry:
  br label %vector_body
vector_body:
  %dst = phi ptr [ null, %entry ], [ %dst.next, %vector_body ]
  %src = phi ptr [ %arg, %entry ], [ %src.next, %vector_body ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %vector_body ]
  ; First half: operate on the 8 bytes just below each pointer.
  %dst.prev = getelementptr i8, ptr %dst, i64 -8
  %dst.prev.val = load <2 x float>, ptr %dst.prev, align 8
  %src.prev = getelementptr i8, ptr %src, i64 -8
  %src.prev.val = load <2 x float>, ptr %src.prev, align 8
  %scaled.prev = fmul fast <2 x float> %src.prev.val, <float 3.0, float 4.0>
  %diff.prev = fsub fast <2 x float> %dst.prev.val, %scaled.prev
  store <2 x float> %diff.prev, ptr %dst.prev, align 1
  ; Second half: same computation at offset 0.
  %dst.val = load <2 x float>, ptr %dst, align 8
  %src.val = load <2 x float>, ptr %src, align 8
  %scaled = fmul fast <2 x float> %src.val,  <float 3.0, float 4.0>
  %diff = fsub fast <2 x float> %dst.val, %scaled
  store <2 x float> %diff, ptr %dst, align 1
  ; Advance the counter and both pointers (32 bytes per iteration).
  %iv.next = add i32 %iv, 4
  %src.next = getelementptr i8, ptr %src, i64 32
  %dst.next = getelementptr i8, ptr %dst, i64 32
  %done = icmp eq i32 %iv.next, 0
  br i1 %done, label %exit, label %vector_body
exit:
  ret void
}
214
; Test LDURXi / LDRXui clustering: an i64 load at byte offset -8 (unscaled
; form) and an i64 load at offset 0 (scaled form) from the same base are
; expected to be clustered.
; CHECK: ********** MI Scheduling **********
; CHECK: LDURXi_LDRXui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3):  %{{[0-9]+}}:gpr64 = LDURXi
; CHECK: SU(4):  %{{[0-9]+}}:gpr64 = LDRXui
;
; %wa and %wb are stored through, so they must not carry `readonly`
; (writing through a readonly pointer argument is undefined behavior).
define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture %wa, ptr nocapture %wb) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -8
  %r52 = load i64, ptr %r51, align 8
  %r53 = load i64, ptr %arg, align 8
  store i64 %r52, ptr %wa
  store i64 %r53, ptr %wb
  ret void
}
230
; Test STURWi / STRWui clustering: an i32 store at byte offset -4 (unscaled
; form) and an i32 store at offset 0 (scaled form) to the same base are
; expected to be clustered.
; CHECK: ********** MI Scheduling **********
; CHECK: STURWi_STRWui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3):   STURWi %{{[0-9]+}}:gpr32
; CHECK: SU(4):   STRWui %{{[0-9]+}}:gpr32
;
; %arg is stored through, so it must not carry `readonly` (writing through a
; readonly pointer argument is undefined behavior).
define void @STURWi_STRWui(ptr nocapture %arg, i32 %b, i32 %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -4
  store i32 %b, ptr %r51
  store i32 %c, ptr %arg
  ret void
}
244
; Test STURXi / STRXui clustering: an i64 store at byte offset -8 (unscaled
; form) and an i64 store at offset 0 (scaled form) to the same base are
; expected to be clustered.
; CHECK: ********** MI Scheduling **********
; CHECK: STURXi_STRXui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3):   STURXi %{{[0-9]+}}:gpr64
; CHECK: SU(4):   STRXui %{{[0-9]+}}:gpr64
;
; %arg is stored through, so it must not carry `readonly` (writing through a
; readonly pointer argument is undefined behavior).
define void @STURXi_STRXui(ptr nocapture %arg, i64 %b, i64 %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -8
  store i64 %b, ptr %r51
  store i64 %c, ptr %arg
  ret void
}
258
; Test STURSi / STRSui clustering: a float store at byte offset -4 (unscaled
; form) and a float store at offset 0 (scaled form) to the same base are
; expected to be clustered. The directives use the A57-only prefix, so only
; the cortex-a57 RUN line verifies this function.
; CHECK-A57: ********** MI Scheduling **********
; CHECK-A57: STURSi_STRSui:%bb.0 entry
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
; CHECK-A57: SU(3):   STURSi %{{[0-9]+}}:fpr32
; CHECK-A57: SU(4):   STRSui %{{[0-9]+}}:fpr32
;
; %arg is stored through, so it must not carry `readonly` (writing through a
; readonly pointer argument is undefined behavior).
define void @STURSi_STRSui(ptr nocapture %arg, float %b, float %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -4
  store float %b, ptr %r51
  store float %c, ptr %arg
  ret void
}
272
; Test STURDi / STRDui clustering: a <2 x float> store at byte offset -8
; (unscaled form) and one at offset 0 (scaled form) to the same base are
; expected to be clustered. The directives use the A57-only prefix, so only
; the cortex-a57 RUN line verifies this function.
; CHECK-A57: ********** MI Scheduling **********
; CHECK-A57: STURDi_STRDui:%bb.0 entry
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
; CHECK-A57: SU(3):   STURDi %{{[0-9]+}}:fpr64
; CHECK-A57: SU(4):   STRDui %{{[0-9]+}}:fpr64
;
; %arg is stored through, so it must not carry `readonly` (writing through a
; readonly pointer argument is undefined behavior).
define void @STURDi_STRDui(ptr nocapture %arg, <2 x float> %b, <2 x float> %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -8
  store <2 x float> %b, ptr %r51
  store <2 x float> %c, ptr %arg
  ret void
}
286
; Test STURQi / STRQui clustering: a <2 x double> store at byte offset -16
; (unscaled form) and one at offset 0 (scaled form) to the same base are
; expected to be clustered. The directives use the A57-only prefix, so only
; the cortex-a57 RUN line verifies this function.
; CHECK-A57: ********** MI Scheduling **********
; CHECK-A57: STURQi_STRQui:%bb.0 entry
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
; CHECK-A57: SU(3):   STURQi %{{[0-9]+}}:fpr128
; CHECK-A57: SU(4):   STRQui %{{[0-9]+}}:fpr128
;
; %arg is stored through, so it must not carry `readonly` (writing through a
; readonly pointer argument is undefined behavior).
define void @STURQi_STRQui(ptr nocapture %arg, <2 x double> %b, <2 x double> %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -16
  store <2 x double> %b, ptr %r51
  store <2 x double> %c, ptr %arg
  ret void
}
300