; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -o - < %s | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv8-unknown-linux-gnueabihf"

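; Three consecutive 16-byte loads: the first two are expected to become
; post-indexed vld1 with implicit 16-byte writeback, so no explicit address
; arithmetic is needed.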
define <4 x float> @test(ptr %A) {
; CHECK-LABEL: test:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 4
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 8
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

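; Loads 24 bytes apart: the stride should be materialized once in a register
; and reused for post-indexed addressing.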
define <4 x float> @test_stride(ptr %A) {
; CHECK-LABEL: test_stride:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r1, #24
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 6
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 12
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

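; Mixed strides of 24 and 16 bytes: the register-stride form and the fixed
; 16-byte writeback form should be combined.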
define <4 x float> @test_stride_mixed(ptr %A) {
; CHECK-LABEL: test_stride_mixed:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r1, #24
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 6
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 10
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

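; The loads here are 24 and then 32 bytes apart, so a single stride register
; cannot serve both steps.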
; Refrain from using multiple stride registers
define <4 x float> @test_stride_noop(ptr %A) {
; CHECK-LABEL: test_stride_noop:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r1, #24
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0], r1
; CHECK-NEXT:    mov r1, #32
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0], r1
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr %A, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 6
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 14
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

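; The common base is 32 bytes past %A; that offset should be folded into a
; single add before the post-indexed load sequence.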
define <4 x float> @test_positive_initial_offset(ptr %A) {
; CHECK-LABEL: test_positive_initial_offset:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r0, r0, #32
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X.ptr.elt = getelementptr inbounds float, ptr %A, i32 8
  %X = load <4 x float>, ptr %X.ptr.elt, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 12
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 16
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

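; Same pattern, but starting 64 bytes below %A, so the base is set up with a sub.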
define <4 x float> @test_negative_initial_offset(ptr %A) {
; CHECK-LABEL: test_negative_initial_offset:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    sub r0, r0, #64
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X.ptr.elt = getelementptr inbounds float, ptr %A, i32 -16
  %X = load <4 x float>, ptr %X.ptr.elt, align 4
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 -12
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 -8
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

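; Loads from a global array: the base is materialized with movw/movt, the
; 32-byte initial offset is folded into an add, and the loads are post-indexed.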
@global_float_array = external global [128 x float], align 4
define <4 x float> @test_global() {
; CHECK-LABEL: test_global:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movw r0, :lower16:global_float_array
; CHECK-NEXT:    movt r0, :upper16:global_float_array
; CHECK-NEXT:    add r0, r0, #32
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    bx lr
  %X = load <4 x float>, ptr getelementptr inbounds ([128 x float], ptr @global_float_array, i32 0, i32 8), align 4
  %Y = load <4 x float>, ptr getelementptr inbounds ([128 x float], ptr @global_float_array, i32 0, i32 12), align 4
  %Z = load <4 x float>, ptr getelementptr inbounds ([128 x float], ptr @global_float_array, i32 0, i32 16), align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

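; Loads from a 128-byte-aligned alloca after an opaque call; the expected code
; keeps the :128 address-alignment hint on the post-indexed loads.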
define <4 x float> @test_stack() {
; Use huge alignment to test that ADD would not be converted to OR
; CHECK-LABEL: test_stack:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r10, r11, lr}
; CHECK-NEXT:    push {r4, r10, r11, lr}
; CHECK-NEXT:    .setfp r11, sp, #8
; CHECK-NEXT:    add r11, sp, #8
; CHECK-NEXT:    .pad #240
; CHECK-NEXT:    sub sp, sp, #240
; CHECK-NEXT:    bfc sp, #0, #7
; CHECK-NEXT:    mov r4, sp
; CHECK-NEXT:    mov r0, r4
; CHECK-NEXT:    bl external_function
; CHECK-NEXT:    vld1.32 {d16, d17}, [r4:128]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r4:128]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vld1.64 {d18, d19}, [r4:128]
; CHECK-NEXT:    vadd.f32 q0, q8, q9
; CHECK-NEXT:    sub sp, r11, #8
; CHECK-NEXT:    pop {r4, r10, r11, pc}
  %array = alloca [32 x float], align 128
  call void @external_function(ptr %array)
  %X = load <4 x float>, ptr %array, align 4
  %Y.ptr.elt = getelementptr inbounds [32 x float], ptr %array, i32 0, i32 4
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds [32 x float], ptr %array, i32 0, i32 8
  %Z = load <4 x float>, ptr %Z.ptr.elt, align 4
  %tmp.sum = fadd <4 x float> %X, %Y
  %sum = fadd <4 x float> %tmp.sum, %Z
  ret <4 x float> %sum
}

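; The same folding applies to <2 x double>: the 64-byte initial offset becomes
; a single add and the loads use vld1.64 with writeback.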
define <2 x double> @test_double(ptr %A) {
; CHECK-LABEL: test_double:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r0, r0, #64
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f64 d20, d17, d19
; CHECK-NEXT:    vadd.f64 d16, d16, d18
; CHECK-NEXT:    vld1.64 {d22, d23}, [r0]
; CHECK-NEXT:    vadd.f64 d1, d20, d23
; CHECK-NEXT:    vadd.f64 d0, d16, d22
; CHECK-NEXT:    bx lr
  %X.ptr.elt = getelementptr inbounds double, ptr %A, i32 8
  %X = load <2 x double>, ptr %X.ptr.elt, align 8
  %Y.ptr.elt = getelementptr inbounds double, ptr %A, i32 10
  %Y = load <2 x double>, ptr %Y.ptr.elt, align 8
  %Z.ptr.elt = getelementptr inbounds double, ptr %A, i32 12
  %Z = load <2 x double>, ptr %Z.ptr.elt, align 8
  %tmp.sum = fadd <2 x double> %X, %Y
  %sum = fadd <2 x double> %tmp.sum, %Z
  ret <2 x double> %sum
}

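; Writeback should also form across a mix of a vld1 intrinsic, a plain load and
; a vst1 intrinsic accessing consecutive addresses.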
define void @test_various_instructions(ptr %A) {
; CHECK-LABEL: test_various_instructions:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]!
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
; CHECK-NEXT:    bx lr
  %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr %A, i32 1)
  %Y.ptr.elt = getelementptr inbounds float, ptr %A, i32 4
  %Y = load <4 x float>, ptr %Y.ptr.elt, align 4
  %Z.ptr.elt = getelementptr inbounds float, ptr %A, i32 8
  %Z = fadd <4 x float> %X, %Y
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %Z.ptr.elt, <4 x float> %Z, i32 4)
  ret void
}

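; LSR-produced GEPs off a common induction variable: within each iteration the
; four loads and four stores should chain through post-indexed addressing.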
define void @test_lsr_geps(ptr %a, ptr %b, i32 %n) {
; CHECK-LABEL: test_lsr_geps:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB10_1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, #0
; CHECK-NEXT:  .LBB10_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add r3, r0, r12
; CHECK-NEXT:    subs r2, r2, #1
; CHECK-NEXT:    vld1.32 {d16, d17}, [r3]!
; CHECK-NEXT:    vld1.32 {d18, d19}, [r3]!
; CHECK-NEXT:    vld1.32 {d20, d21}, [r3]!
; CHECK-NEXT:    vld1.32 {d22, d23}, [r3]
; CHECK-NEXT:    add r3, r1, r12
; CHECK-NEXT:    add r12, r12, #64
; CHECK-NEXT:    vst1.32 {d16, d17}, [r3]!
; CHECK-NEXT:    vst1.32 {d18, d19}, [r3]!
; CHECK-NEXT:    vst1.32 {d20, d21}, [r3]!
; CHECK-NEXT:    vst1.32 {d22, d23}, [r3]
; CHECK-NEXT:    bne .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %cmp61 = icmp sgt i32 %n, 0
  br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ]
  %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
  %uglygep19 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %0 = load <4 x float>, ptr %uglygep19, align 4
  %uglygep16 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %scevgep18 = getelementptr <4 x float>, ptr %uglygep16, i32 1
  %1 = load <4 x float>, ptr %scevgep18, align 4
  %uglygep13 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %scevgep15 = getelementptr <4 x float>, ptr %uglygep13, i32 2
  %2 = load <4 x float>, ptr %scevgep15, align 4
  %uglygep10 = getelementptr i8, ptr %a, i32 %lsr.iv1
  %scevgep12 = getelementptr <4 x float>, ptr %uglygep10, i32 3
  %3 = load <4 x float>, ptr %scevgep12, align 4
  %uglygep8 = getelementptr i8, ptr %b, i32 %lsr.iv1
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr %uglygep8, <4 x float> %0, i32 4)
  %uglygep6 = getelementptr i8, ptr %b, i32 %lsr.iv1
  %scevgep7 = getelementptr i8, ptr %uglygep6, i32 16
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %scevgep7, <4 x float> %1, i32 4)
  %uglygep4 = getelementptr i8, ptr %b, i32 %lsr.iv1
  %scevgep5 = getelementptr i8, ptr %uglygep4, i32 32
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %scevgep5, <4 x float> %2, i32 4)
  %uglygep = getelementptr i8, ptr %b, i32 %lsr.iv1
  %scevgep = getelementptr i8, ptr %uglygep, i32 48
  tail call void @llvm.arm.neon.vst1.p0.v4f32(ptr nonnull %scevgep, <4 x float> %3, i32 4)
  %lsr.iv.next = add i32 %lsr.iv, -1
  %lsr.iv.next2 = add nuw i32 %lsr.iv1, 64
  %exitcond.not = icmp eq i32 %lsr.iv.next, 0
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

declare void @external_function(ptr)
declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.p0.v4f32(ptr, <4 x float>, i32) nounwind argmemonly