xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vmaxnma-tailpred.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; minf32: tail-predicated loop that stores minnum(fabs(s1[i]), s2[i]) to d[i].
; Only the %s1 operand passes through fabs; the expected assembly below keeps a
; separate vabs.f32 followed by a plain vminnm.f32.
define float @minf32(ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, ptr noalias nocapture %d, i32 %n) {
; CHECK-LABEL: minf32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vabs.f32 q0, q0
; CHECK-NEXT:    vminnm.f32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Skip the loop entirely when %n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; %n.vec = %n rounded up to a multiple of 4 (the <4 x float> step).
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The same lane mask guards both loads and the store.
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds float, ptr %s1, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> poison)
  %1 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.masked.load)
  %2 = getelementptr inbounds float, ptr %s2, i32 %index
  %wide.masked.load10 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x float> poison)
  %3 = call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %1, <4 x float> %wide.masked.load10)
  %4 = getelementptr inbounds float, ptr %d, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret float undef
}

; maxaf32: tail-predicated loop that stores maxnum(fabs(s1[i]), fabs(s2[i])) to
; d[i]. Both operands pass through fabs; the expected assembly below contains a
; single vmaxnma.f32 and no separate vabs.
define float @maxaf32(ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, ptr noalias nocapture %d, i32 %n) {
; CHECK-LABEL: maxaf32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmaxnma.f32 q1, q0
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Skip the loop entirely when %n <= 0.
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; %n.vec = %n rounded up to a multiple of 4 (the <4 x float> step).
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The same lane mask guards both loads and the store.
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds float, ptr %s1, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> poison)
  %1 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.masked.load)
  %2 = getelementptr inbounds float, ptr %s2, i32 %index
  %wide.masked.load10 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x float> poison)
  %3 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.masked.load10)
  %4 = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %1, <4 x float> %3)
  %5 = getelementptr inbounds float, ptr %d, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret float undef
}


; maxf16: tail-predicated loop that stores maxnum(fabs(s1[i]), s2[i]) to d[i]
; on <8 x half> vectors. Only the %s1 operand passes through fabs; the expected
; assembly below keeps a separate vabs.f16 followed by a plain vmaxnm.f16.
define half @maxf16(ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, ptr noalias nocapture %d, i32 %n) {
; CHECK-LABEL: maxf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vabs.f16 q0, q0
; CHECK-NEXT:    vmaxnm.f16 q0, q0, q1
; CHECK-NEXT:    vstrh.16 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Skip the loop entirely when %n <= 0.
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; %n.vec = %n rounded up to a multiple of 8 (the <8 x half> step).
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The same lane mask guards both loads and the store.
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds half, ptr %s1, i32 %index
  %wide.masked.load = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %0, i32 2, <8 x i1> %active.lane.mask, <8 x half> poison)
  %1 = call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %wide.masked.load)
  %2 = getelementptr inbounds half, ptr %s2, i32 %index
  %wide.masked.load12 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %2, i32 2, <8 x i1> %active.lane.mask, <8 x half> poison)
  %3 = call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %1, <8 x half> %wide.masked.load12)
  %4 = getelementptr inbounds half, ptr %d, i32 %index
  call void @llvm.masked.store.v8f16.p0(<8 x half> %3, ptr %4, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret half undef
}

; minaf16: tail-predicated loop that stores minnum(fabs(s1[i]), fabs(s2[i])) to
; d[i] on <8 x half> vectors. Both operands pass through fabs; the expected
; assembly below contains a single vminnma.f16 and no separate vabs.
define half @minaf16(ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, ptr noalias nocapture %d, i32 %n) {
; CHECK-LABEL: minaf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vminnma.f16 q1, q0
; CHECK-NEXT:    vstrh.16 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Skip the loop entirely when %n <= 0.
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; %n.vec = %n rounded up to a multiple of 8 (the <8 x half> step).
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The same lane mask guards both loads and the store.
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds half, ptr %s1, i32 %index
  %wide.masked.load = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %0, i32 2, <8 x i1> %active.lane.mask, <8 x half> poison)
  %1 = call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %wide.masked.load)
  %2 = getelementptr inbounds half, ptr %s2, i32 %index
  %wide.masked.load12 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %2, i32 2, <8 x i1> %active.lane.mask, <8 x half> poison)
  %3 = call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %wide.masked.load12)
  %4 = call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %1, <8 x half> %3)
  %5 = getelementptr inbounds half, ptr %d, i32 %index
  call void @llvm.masked.store.v8f16.p0(<8 x half> %4, ptr %5, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret half undef
}

; Intrinsics called by the test functions: <4 x float> variants first, then the
; <8 x half> variants.
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>)
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)
