; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve %s -S -loop-reduce -o - | FileCheck %s
;
; Origin: /llvm-project/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll (revision 055fb7795aa219a3d274d280ec9129784f169f56)
;
; Runs the loop-reduce pass through opt for an MVE-enabled Thumb target and
; compares the printed IR against the generated assertion lines.
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m-arm-none-eabi"

; Predicated gather-and-accumulate loop whose predicate is produced by
; @llvm.arm.mve.vctp8 and narrowed from <16 x i1> to <4 x i1> by the external
; helper @v16i1_to_v4i1.
;
;   %0  pointer that is converted to an integer and used as the gather base
;   %1  i32; the loop counter starts at %1 - 1 and steps by -4
;
; Returns fabs(sitofp(@vecAddAcrossF32Mve(accumulated vector))).
;
; Compared with the input IR, the expected output differs only in the loop
; latch: the signed compare is emitted before the counter decrement, and the
; decrement no longer carries the nsw flag.
define float @vctp8(ptr %0, i32 %1) {
; CHECK-LABEL: @vctp8(
; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP0:%.*]] to i32
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
; CHECK-NEXT:    br label [[TMP11:%.*]]
; CHECK:       11:
; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP15:%.*]] = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP12]])
; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> [[TMP15]])
; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
; CHECK:       22:
; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 @vecAddAcrossF32Mve(<4 x float> [[TMP19]])
; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
; CHECK-NEXT:    ret float [[TMP25]]
;
  ; Entry block: build the initial vector of gather bases. Lane 0 receives the
  ; integer value of %0 minus 32, is splatted to all lanes, and is then added
  ; to the vector result of vidup(0, 8).
  ; NOTE(review): the -32 bias presumably offsets the +32 immediate of the
  ; write-back gather below; confirm against the intrinsic's definition.
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  %4 = extractvalue { <4 x i32>, i32 } %3, 0
  %5 = add nsw i32 %1, -1
  %6 = ptrtoint ptr %0 to i32
  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
  %10 = add <4 x i32> %4, %9
  br label %11

11:                                               ; preds = %11, %2
  ; %12 = counter fed to vctp8, %13 = running sum, %14 = gather bases.
  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
  %15 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %12)
  %mask = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> %15)
  ; Element 1 of the gather result feeds the %14 phi; element 0 is the loaded data.
  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
  ; Latch: step the counter by -4 and loop while the pre-decrement value is > 4.
  %20 = add nsw i32 %12, -4
  %21 = icmp sgt i32 %12, 4
  br i1 %21, label %11, label %22

22:                                               ; preds = %11
  %23 = tail call i32 @vecAddAcrossF32Mve(<4 x float> %19)
  %24 = sitofp i32 %23 to float
  %25 = tail call float @llvm.fabs.f32(float %24)
  ret float %25
}

; Predicated gather-and-accumulate loop whose predicate is produced by
; @llvm.arm.mve.vctp16 and narrowed from <8 x i1> to <4 x i1> by the external
; helper @v8i1_to_v4i1.
;
;   %0  pointer that is converted to an integer and used as the gather base
;   %1  i32; the loop counter starts at %1 - 1 and steps by -4
;
; Returns fabs(sitofp(@vecAddAcrossF32Mve(accumulated vector))).
;
; Compared with the input IR, the expected output differs only in the loop
; latch: the signed compare is emitted before the counter decrement, and the
; decrement no longer carries the nsw flag.
define float @vctp16(ptr %0, i32 %1) {
; CHECK-LABEL: @vctp16(
; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP0:%.*]] to i32
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
; CHECK-NEXT:    br label [[TMP11:%.*]]
; CHECK:       11:
; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP15:%.*]] = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP12]])
; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> [[TMP15]])
; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
; CHECK:       22:
; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 @vecAddAcrossF32Mve(<4 x float> [[TMP19]])
; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
; CHECK-NEXT:    ret float [[TMP25]]
;
  ; Entry block: build the initial vector of gather bases. Lane 0 receives the
  ; integer value of %0 minus 32, is splatted to all lanes, and is then added
  ; to the vector result of vidup(0, 8).
  ; NOTE(review): the -32 bias presumably offsets the +32 immediate of the
  ; write-back gather below; confirm against the intrinsic's definition.
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  %4 = extractvalue { <4 x i32>, i32 } %3, 0
  %5 = add nsw i32 %1, -1
  %6 = ptrtoint ptr %0 to i32
  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
  %10 = add <4 x i32> %4, %9
  br label %11

11:                                               ; preds = %11, %2
  ; %12 = counter fed to vctp16, %13 = running sum, %14 = gather bases.
  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
  %15 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %12)
  %mask = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> %15)
  ; Element 1 of the gather result feeds the %14 phi; element 0 is the loaded data.
  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
  ; Latch: step the counter by -4 and loop while the pre-decrement value is > 4.
  %20 = add nsw i32 %12, -4
  %21 = icmp sgt i32 %12, 4
  br i1 %21, label %11, label %22

22:                                               ; preds = %11
  %23 = tail call i32 @vecAddAcrossF32Mve(<4 x float> %19)
  %24 = sitofp i32 %23 to float
  %25 = tail call float @llvm.fabs.f32(float %24)
  ret float %25
}

; Predicated gather-and-accumulate loop whose <4 x i1> predicate comes straight
; from @llvm.arm.mve.vctp32, with no conversion helper in between.
;
;   %0  pointer that is converted to an integer and used as the gather base
;   %1  i32; the loop counter starts at %1 - 1 and steps by -4
;
; Returns fabs(sitofp(@vecAddAcrossF32Mve(accumulated vector))).
;
; Compared with the input IR, the expected output differs only in the loop
; latch: the signed compare is emitted before the counter decrement, and the
; decrement no longer carries the nsw flag.
define float @vctpi32(ptr %0, i32 %1) {
; CHECK-LABEL: @vctpi32(
; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP0:%.*]] to i32
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
; CHECK-NEXT:    br label [[TMP11:%.*]]
; CHECK:       11:
; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP12]])
; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]])
; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]])
; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
; CHECK:       22:
; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 @vecAddAcrossF32Mve(<4 x float> [[TMP19]])
; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
; CHECK-NEXT:    ret float [[TMP25]]
;
  ; Entry block: build the initial vector of gather bases. Lane 0 receives the
  ; integer value of %0 minus 32, is splatted to all lanes, and is then added
  ; to the vector result of vidup(0, 8).
  ; NOTE(review): the -32 bias presumably offsets the +32 immediate of the
  ; write-back gather below; confirm against the intrinsic's definition.
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  %4 = extractvalue { <4 x i32>, i32 } %3, 0
  %5 = add nsw i32 %1, -1
  %6 = ptrtoint ptr %0 to i32
  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
  %10 = add <4 x i32> %4, %9
  br label %11

11:                                               ; preds = %11, %2
  ; %12 = counter fed to vctp32, %13 = running sum, %14 = gather bases.
  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
  %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
  ; Element 1 of the gather result feeds the %14 phi; element 0 is the loaded data.
  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
  ; Latch: step the counter by -4 and loop while the pre-decrement value is > 4.
  %20 = add nsw i32 %12, -4
  %21 = icmp sgt i32 %12, 4
  br i1 %21, label %11, label %22

22:                                               ; preds = %11
  %23 = tail call i32 @vecAddAcrossF32Mve(<4 x float> %19)
  %24 = sitofp i32 %23 to float
  %25 = tail call float @llvm.fabs.f32(float %24)
  ret float %25
}


; Predicated gather-and-accumulate loop whose predicate comes from
; @llvm.arm.mve.vctp64.
;
;   %0  pointer that is converted to an integer and used as the gather base
;   %1  i32; the loop counter starts at %1 - 1 and steps by -4
;
; Returns fabs(sitofp(@vecAddAcrossF32Mve(accumulated vector))).
;
; The input IR calls vctp64 with a <4 x i1> result. The expected output
; instead shows a (non-tail) call returning <2 x i1>, which is converted to
; i32 by @llvm.arm.mve.pred.v2i.v2i1 and back to <4 x i1> by
; @llvm.arm.mve.pred.i2v.v4i1 before being used as the predicate.
; NOTE(review): this looks like the IR auto-upgrade of an older vctp64
; signature rather than an effect of loop-reduce itself; confirm.
;
; As in the other functions, the expected latch has the signed compare before
; the counter decrement, and the decrement no longer carries the nsw flag.
define float @vctpi64(ptr %0, i32 %1) {
; CHECK-LABEL: @vctpi64(
; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP0:%.*]] to i32
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
; CHECK-NEXT:    br label [[TMP11:%.*]]
; CHECK:       11:
; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP23:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP21:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
; CHECK-NEXT:    [[TMP15:%.*]] = call <2 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]])
; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v2i1(<2 x i1> [[TMP15]])
; CHECK-NEXT:    [[TMP17:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP16]])
; CHECK-NEXT:    [[TMP18:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP17]])
; CHECK-NEXT:    [[TMP19]] = extractvalue { <4 x float>, <4 x i32> } [[TMP18]], 1
; CHECK-NEXT:    [[TMP20:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP18]], 0
; CHECK-NEXT:    [[TMP21]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP20]], <4 x i1> [[TMP17]], <4 x float> [[TMP13]])
; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP12]], 4
; CHECK-NEXT:    [[TMP23]] = add i32 [[TMP12]], -4
; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP11]], label [[TMP24:%.*]]
; CHECK:       24:
; CHECK-NEXT:    [[TMP25:%.*]] = tail call i32 @vecAddAcrossF32Mve(<4 x float> [[TMP21]])
; CHECK-NEXT:    [[TMP26:%.*]] = sitofp i32 [[TMP25]] to float
; CHECK-NEXT:    [[TMP27:%.*]] = tail call float @llvm.fabs.f32(float [[TMP26]])
; CHECK-NEXT:    ret float [[TMP27]]
;
  ; Entry block: build the initial vector of gather bases. Lane 0 receives the
  ; integer value of %0 minus 32, is splatted to all lanes, and is then added
  ; to the vector result of vidup(0, 8).
  ; NOTE(review): the -32 bias presumably offsets the +32 immediate of the
  ; write-back gather below; confirm against the intrinsic's definition.
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  %4 = extractvalue { <4 x i32>, i32 } %3, 0
  %5 = add nsw i32 %1, -1
  %6 = ptrtoint ptr %0 to i32
  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
  %10 = add <4 x i32> %4, %9
  br label %11

11:                                               ; preds = %11, %2
  ; %12 = counter fed to vctp64, %13 = running sum, %14 = gather bases.
  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
  ; Written with a <4 x i1> result on purpose; see the function header.
  %15 = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 %12)
  ; Element 1 of the gather result feeds the %14 phi; element 0 is the loaded data.
  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
  ; Latch: step the counter by -4 and loop while the pre-decrement value is > 4.
  %20 = add nsw i32 %12, -4
  %21 = icmp sgt i32 %12, 4
  br i1 %21, label %11, label %22

22:                                               ; preds = %11
  %23 = tail call i32 @vecAddAcrossF32Mve(<4 x float> %19)
  %24 = sitofp i32 %23 to float
  %25 = tail call float @llvm.fabs.f32(float %24)
  ret float %25
}

; External declarations referenced by the functions in this file.

; MVE intrinsics.
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
declare <16 x i1> @llvm.arm.mve.vctp8(i32)
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
; Declared with a <4 x i1> result here, whereas the expected output of
; @vctpi64 calls vctp64 with a <2 x i1> result.
declare <4 x i1> @llvm.arm.mve.vctp64(i32)
declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)

; Non-intrinsic external functions. @vecAddAcrossF32Mve is declared variadic;
; every call site in this file passes a single <4 x float> argument.
declare i32 @vecAddAcrossF32Mve(...)
declare <4 x i1> @v8i1_to_v4i1(<8 x i1>)
declare <4 x i1> @v16i1_to_v4i1(<16 x i1>)

declare float @llvm.fabs.f32(float)
