; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -enable-arm-maskedgatscat=false %s -o - | FileCheck %s

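; The vctp here depends only on the loop counter, so the low-overhead-loops
; pass should be able to rematerialize it and fold the loop into a
; tail-predicated dlstp/letp loop with no explicit vctp in the body.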
define void @remat_vctp(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i16 zeroext %arg5) {
; CHECK-LABEL: remat_vctp:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    ldrd r5, r12, [sp, #64]
; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
; CHECK-NEXT:    vmov.i32 q1, #0x3f
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    dlstp.32 lr, r12
; CHECK-NEXT:  .LBB0_1: @ %bb6
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
; CHECK-NEXT:    vabs.s32 q4, q3
; CHECK-NEXT:    vcls.s32 q2, q4
; CHECK-NEXT:    vshl.u32 q4, q4, q2
; CHECK-NEXT:    vadd.i32 q2, q2, r4
; CHECK-NEXT:    vshr.u32 q5, q4, #24
; CHECK-NEXT:    vand q5, q5, q1
; CHECK-NEXT:    vldrw.u32 q6, [r5, q5, uxtw #2]
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q4
; CHECK-NEXT:    vqsub.s32 q5, q0, q5
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqshl.s32 q5, q5, #1
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqsub.s32 q4, q0, q4
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqshl.s32 q4, q4, #1
; CHECK-NEXT:    vpt.s32 lt, q3, zr
; CHECK-NEXT:    vnegt.s32 q4, q4
; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
; CHECK-NEXT:    vqrdmulh.s32 q3, q3, q4
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    vstrw.32 q2, [r3], #16
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %bb44
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
bb:
  %i = zext i16 %arg5 to i32
  br label %bb6

bb6:                                              ; preds = %bb6, %bb
  %i7 = phi ptr [ %arg3, %bb ], [ %i38, %bb6 ]
  %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ]
  %i9 = phi ptr [ %arg2, %bb ], [ %i41, %bb6 ]
  %i10 = phi ptr [ %arg1, %bb ], [ %i40, %bb6 ]
  %i11 = phi ptr [ %arg, %bb ], [ %i39, %bb6 ]
  %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8)
  %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i11, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer)
  %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i10, i32 4, <4 x i1> %i12, <4 x i32> zeroinitializer)
  %i17 = icmp slt <4 x i32> %i16, zeroinitializer
  %i18 = sub <4 x i32> zeroinitializer, %i16
  %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16
  %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19)
  %i21 = shl <4 x i32> %i19, %i20
  %i22 = add <4 x i32> %i20, <i32 1, i32 1, i32 1, i32 1>
  %i23 = lshr <4 x i32> %i21, <i32 24, i32 24, i32 24, i32 24>
  %i24 = and <4 x i32> %i23, <i32 63, i32 63, i32 63, i32 63>
  %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %arg4, <4 x i32> %i24, i32 32, i32 2, i32 0)
  %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21)
  %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i26)
  %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27)
  %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0)
  %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21)
  %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i30)
  %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31)
  %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0)
  %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33)
  %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i35, ptr %i9, i32 4, <4 x i1> %i12)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i22, ptr %i7, i32 4, <4 x i1> %i12)
  %i38 = getelementptr inbounds i32, ptr %i7, i32 4
  %i39 = getelementptr inbounds i32, ptr %i11, i32 4
  %i40 = getelementptr inbounds i32, ptr %i10, i32 4
  %i41 = getelementptr inbounds i32, ptr %i9, i32 4
  %i42 = add nsw i32 %i8, -4
  %i43 = icmp sgt i32 %i8, 4
  br i1 %i43, label %bb6, label %bb44

bb44:                                             ; preds = %bb6
  ret void
}

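; Here the trip-count vctp is combined with a second, VPT-predicated vctp, so
; it cannot simply be rematerialized; the loop is expected to stay a plain
; low-overhead loop (le) with an explicit, predicated vctp each iteration.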
define void @dont_remat_predicated_vctp(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i16 zeroext %arg5, i32 %conv.mask) {
; CHECK-LABEL: dont_remat_predicated_vctp:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldrd r6, r12, [sp, #72]
; CHECK-NEXT:    movs r4, #4
; CHECK-NEXT:    cmp.w r12, #4
; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
; CHECK-NEXT:    csel r5, r12, r4, lt
; CHECK-NEXT:    vmov.i32 q1, #0x3f
; CHECK-NEXT:    sub.w r5, r12, r5
; CHECK-NEXT:    add.w lr, r5, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    add.w lr, r5, lr, lsr #2
; CHECK-NEXT:  .LBB1_1: @ %bb6
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    sub.w r12, r12, #4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vctpt.32 r4
; CHECK-NEXT:    vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q3, [r1], #16
; CHECK-NEXT:    vabs.s32 q4, q3
; CHECK-NEXT:    vcls.s32 q2, q4
; CHECK-NEXT:    vshl.u32 q4, q4, q2
; CHECK-NEXT:    vadd.i32 q2, q2, r5
; CHECK-NEXT:    vshr.u32 q5, q4, #24
; CHECK-NEXT:    vand q5, q5, q1
; CHECK-NEXT:    vldrw.u32 q6, [r6, q5, uxtw #2]
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q4
; CHECK-NEXT:    vqsub.s32 q5, q0, q5
; CHECK-NEXT:    vqrdmulh.s32 q5, q6, q5
; CHECK-NEXT:    vqshl.s32 q5, q5, #1
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqsub.s32 q4, q0, q4
; CHECK-NEXT:    vqrdmulh.s32 q4, q5, q4
; CHECK-NEXT:    vqshl.s32 q4, q4, #1
; CHECK-NEXT:    vpt.s32 lt, q3, zr
; CHECK-NEXT:    vnegt.s32 q4, q4
; CHECK-NEXT:    vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q3, [r0], #16
; CHECK-NEXT:    vqrdmulh.s32 q3, q3, q4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vstrwt.32 q3, [r2], #16
; CHECK-NEXT:    vstrwt.32 q2, [r3], #16
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %bb44
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
bb:
  %i = zext i16 %arg5 to i32
  br label %bb6

bb6:                                              ; preds = %bb6, %bb
  %i7 = phi ptr [ %arg3, %bb ], [ %i38, %bb6 ]
  %i8 = phi i32 [ %i, %bb ], [ %i42, %bb6 ]
  %i9 = phi ptr [ %arg2, %bb ], [ %i41, %bb6 ]
  %i10 = phi ptr [ %arg1, %bb ], [ %i40, %bb6 ]
  %i11 = phi ptr [ %arg, %bb ], [ %i39, %bb6 ]
  %i12 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 4)
  %mask = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i8)
  %pred = and <4 x i1> %i12, %mask
  %i14 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i11, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer)
  %i16 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %i10, i32 4, <4 x i1> %pred, <4 x i32> zeroinitializer)
  %i17 = icmp slt <4 x i32> %i16, zeroinitializer
  %i18 = sub <4 x i32> zeroinitializer, %i16
  %i19 = select <4 x i1> %i17, <4 x i32> %i18, <4 x i32> %i16
  %i20 = tail call <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32> %i19)
  %i21 = shl <4 x i32> %i19, %i20
  %i22 = add <4 x i32> %i20, <i32 1, i32 1, i32 1, i32 1>
  %i23 = lshr <4 x i32> %i21, <i32 24, i32 24, i32 24, i32 24>
  %i24 = and <4 x i32> %i23, <i32 63, i32 63, i32 63, i32 63>
  %i25 = tail call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %arg4, <4 x i32> %i24, i32 32, i32 2, i32 0)
  %i26 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i21)
  %i27 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i26)
  %i28 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i25, <4 x i32> %i27)
  %i29 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i28, i32 1, i32 0)
  %i30 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i21)
  %i31 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> %i30)
  %i32 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i29, <4 x i32> %i31)
  %i33 = tail call <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32> %i32, i32 1, i32 0)
  %i34 = tail call <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32> %i33, <4 x i1> %i17, <4 x i32> %i33)
  %i35 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %i14, <4 x i32> %i34)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i35, ptr %i9, i32 4, <4 x i1> %pred)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %i22, ptr %i7, i32 4, <4 x i1> %pred)
  %i38 = getelementptr inbounds i32, ptr %i7, i32 4
  %i39 = getelementptr inbounds i32, ptr %i11, i32 4
  %i40 = getelementptr inbounds i32, ptr %i10, i32 4
  %i41 = getelementptr inbounds i32, ptr %i9, i32 4
  %i42 = add nsw i32 %i8, -4
  %i43 = icmp sgt i32 %i8, 4
  br i1 %i43, label %bb6, label %bb44

bb44:                                             ; preds = %bb6
  ret void
}

declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare <4 x i32> @llvm.arm.mve.vcls.v4i32(<4 x i32>)
declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr, <4 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqshl.imm.v4i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.arm.mve.neg.predicated.v4i32.v4i1(<4 x i32>, <4 x i1>, <4 x i32>)