xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll (revision 343a810725f27bfe92fbd04a42d42aa9caaee7a6)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,NO-ZVFHMIN-ZVFBFMIN
3; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN-ZVFBFMIN
4
5
; Both load groups use a constant stride: the forward group reads indices
; 0,4,8,...,28 (stride +4 floats = +16 bytes) and the backward group reads
; indices 30,26,...,2 (stride -16 bytes). The results are subtracted
; pairwise and stored contiguously to %s. The SLP vectorizer should
; replace all 16 scalar loads with two
; @llvm.experimental.vp.strided.load calls (strides 16 and -16) and the
; eight stores with one contiguous <8 x float> store.
6define void @test(ptr %p, ptr noalias %s) {
7; CHECK-LABEL: @test(
8; CHECK-NEXT:  entry:
9; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
10; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
11; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
12; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> splat (i1 true), i32 8)
13; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> splat (i1 true), i32 8)
14; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <8 x float> [[TMP1]], [[TMP0]]
15; CHECK-NEXT:    store <8 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4
16; CHECK-NEXT:    ret void
17;
18entry:
19  %arrayidx = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 0
20  %i = load float, ptr %arrayidx, align 4
21  %arrayidx1 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 30
22  %i1 = load float, ptr %arrayidx1, align 4
23  %add = fsub fast float %i1, %i
24  %arrayidx2 = getelementptr inbounds float, ptr %s, i64 0
25  store float %add, ptr %arrayidx2, align 4
26  %arrayidx4 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 4
27  %i2 = load float, ptr %arrayidx4, align 4
28  %arrayidx6 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 26
29  %i3 = load float, ptr %arrayidx6, align 4
30  %add7 = fsub fast float %i3, %i2
31  %arrayidx9 = getelementptr inbounds float, ptr %s, i64 1
32  store float %add7, ptr %arrayidx9, align 4
33  %arrayidx11 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 8
34  %i4 = load float, ptr %arrayidx11, align 4
35  %arrayidx13 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 22
36  %i5 = load float, ptr %arrayidx13, align 4
37  %add14 = fsub fast float %i5, %i4
38  %arrayidx16 = getelementptr inbounds float, ptr %s, i64 2
39  store float %add14, ptr %arrayidx16, align 4
40  %arrayidx18 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 12
41  %i6 = load float, ptr %arrayidx18, align 4
42  %arrayidx20 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 18
43  %i7 = load float, ptr %arrayidx20, align 4
44  %add21 = fsub fast float %i7, %i6
45  %arrayidx23 = getelementptr inbounds float, ptr %s, i64 3
46  store float %add21, ptr %arrayidx23, align 4
47  %arrayidx25 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 16
48  %i8 = load float, ptr %arrayidx25, align 4
49  %arrayidx27 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 14
50  %i9 = load float, ptr %arrayidx27, align 4
51  %add28 = fsub fast float %i9, %i8
52  %arrayidx30 = getelementptr inbounds float, ptr %s, i64 4
53  store float %add28, ptr %arrayidx30, align 4
54  %arrayidx32 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 20
55  %i10 = load float, ptr %arrayidx32, align 4
56  %arrayidx34 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 10
57  %i11 = load float, ptr %arrayidx34, align 4
58  %add35 = fsub fast float %i11, %i10
59  %arrayidx37 = getelementptr inbounds float, ptr %s, i64 5
60  store float %add35, ptr %arrayidx37, align 4
61  %arrayidx39 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 24
62  %i12 = load float, ptr %arrayidx39, align 4
63  %arrayidx41 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 6
64  %i13 = load float, ptr %arrayidx41, align 4
65  %add42 = fsub fast float %i13, %i12
66  %arrayidx44 = getelementptr inbounds float, ptr %s, i64 6
67  store float %add42, ptr %arrayidx44, align 4
68  %arrayidx46 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 28
69  %i14 = load float, ptr %arrayidx46, align 4
70  %arrayidx48 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 2
71  %i15 = load float, ptr %arrayidx48, align 4
72  %add49 = fsub fast float %i15, %i14
73  %arrayidx51 = getelementptr inbounds float, ptr %s, i64 7
74  store float %add49, ptr %arrayidx51, align 4
75  ret void
76}
77
; Like @test, but the forward load group uses a runtime stride: indices
; 0, %str, 2*%str, ..., 7*%str, where %str = zext i32 %stride. The
; backward group keeps the constant indices 30,26,...,2. Expect one
; strided load whose byte stride is computed at runtime (%str * 4) and
; one with the constant stride -16.
78define void @test1(ptr %p, ptr noalias %s, i32 %stride) {
79; CHECK-LABEL: @test1(
80; CHECK-NEXT:  entry:
81; CHECK-NEXT:    [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
82; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
83; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
84; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
85; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[STR]], 4
86; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 [[TMP0]], <8 x i1> splat (i1 true), i32 8)
87; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> splat (i1 true), i32 8)
88; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP1]]
89; CHECK-NEXT:    store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
90; CHECK-NEXT:    ret void
91;
92entry:
93  %str = zext i32 %stride to i64
94  %arrayidx = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 0
95  %i = load float, ptr %arrayidx, align 4
96  %arrayidx1 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 30
97  %i1 = load float, ptr %arrayidx1, align 4
98  %add = fsub fast float %i1, %i
99  %arrayidx2 = getelementptr inbounds float, ptr %s, i64 0
100  store float %add, ptr %arrayidx2, align 4
101  %arrayidx4 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %str
102  %i2 = load float, ptr %arrayidx4, align 4
103  %arrayidx6 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 26
104  %i3 = load float, ptr %arrayidx6, align 4
105  %add7 = fsub fast float %i3, %i2
106  %arrayidx9 = getelementptr inbounds float, ptr %s, i64 1
107  store float %add7, ptr %arrayidx9, align 4
108  %st1 = mul i64 %str, 2
109  %arrayidx11 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st1
110  %i4 = load float, ptr %arrayidx11, align 4
111  %arrayidx13 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 22
112  %i5 = load float, ptr %arrayidx13, align 4
113  %add14 = fsub fast float %i5, %i4
114  %arrayidx16 = getelementptr inbounds float, ptr %s, i64 2
115  store float %add14, ptr %arrayidx16, align 4
116  %st2 = mul i64 %str, 3
117  %arrayidx18 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st2
118  %i6 = load float, ptr %arrayidx18, align 4
119  %arrayidx20 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 18
120  %i7 = load float, ptr %arrayidx20, align 4
121  %add21 = fsub fast float %i7, %i6
122  %arrayidx23 = getelementptr inbounds float, ptr %s, i64 3
123  store float %add21, ptr %arrayidx23, align 4
124  %st3 = mul i64 %str, 4
125  %arrayidx25 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st3
126  %i8 = load float, ptr %arrayidx25, align 4
127  %arrayidx27 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 14
128  %i9 = load float, ptr %arrayidx27, align 4
129  %add28 = fsub fast float %i9, %i8
130  %arrayidx30 = getelementptr inbounds float, ptr %s, i64 4
131  store float %add28, ptr %arrayidx30, align 4
132  %st4 = mul i64 %str, 5
133  %arrayidx32 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st4
134  %i10 = load float, ptr %arrayidx32, align 4
135  %arrayidx34 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 10
136  %i11 = load float, ptr %arrayidx34, align 4
137  %add35 = fsub fast float %i11, %i10
138  %arrayidx37 = getelementptr inbounds float, ptr %s, i64 5
139  store float %add35, ptr %arrayidx37, align 4
140  %st5 = mul i64 %str, 6
141  %arrayidx39 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st5
142  %i12 = load float, ptr %arrayidx39, align 4
143  %arrayidx41 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 6
144  %i13 = load float, ptr %arrayidx41, align 4
145  %add42 = fsub fast float %i13, %i12
146  %arrayidx44 = getelementptr inbounds float, ptr %s, i64 6
147  store float %add42, ptr %arrayidx44, align 4
148  %st6 = mul i64 %str, 7
149  %arrayidx46 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st6
150  %i14 = load float, ptr %arrayidx46, align 4
151  %arrayidx48 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 2
152  %i15 = load float, ptr %arrayidx48, align 4
153  %add49 = fsub fast float %i15, %i14
154  %arrayidx51 = getelementptr inbounds float, ptr %s, i64 7
155  store float %add49, ptr %arrayidx51, align 4
156  ret void
157}
158
; Mirror of @test1 with the runtime stride on the *descending* group:
; the constant-stride group reads indices 2,6,10,...,30 (stride +16
; bytes) while the runtime group reads 7*%str, 6*%str, ..., %str, 0 in
; decreasing order. Expect a constant stride of 16 for the first load
; and a negated runtime stride (%str * -4) for the second.
159define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
160; CHECK-LABEL: @test2(
161; CHECK-NEXT:  entry:
162; CHECK-NEXT:    [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
163; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
164; CHECK-NEXT:    [[ST6:%.*]] = mul i64 [[STR]], 7
165; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
166; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
167; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> splat (i1 true), i32 8)
168; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[STR]], -4
169; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 [[TMP1]], <8 x i1> splat (i1 true), i32 8)
170; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
171; CHECK-NEXT:    store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
172; CHECK-NEXT:    ret void
173;
174entry:
175  %str = zext i32 %stride to i64
176  %arrayidx = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 2
177  %i = load float, ptr %arrayidx, align 4
178  %st6 = mul i64 %str, 7
179  %arrayidx1 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st6
180  %i1 = load float, ptr %arrayidx1, align 4
181  %add = fsub fast float %i1, %i
182  %arrayidx2 = getelementptr inbounds float, ptr %s, i64 0
183  store float %add, ptr %arrayidx2, align 4
184  %arrayidx4 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 6
185  %i2 = load float, ptr %arrayidx4, align 4
186  %st5 = mul i64 %str, 6
187  %arrayidx6 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st5
188  %i3 = load float, ptr %arrayidx6, align 4
189  %add7 = fsub fast float %i3, %i2
190  %arrayidx9 = getelementptr inbounds float, ptr %s, i64 1
191  store float %add7, ptr %arrayidx9, align 4
192  %arrayidx11 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 10
193  %i4 = load float, ptr %arrayidx11, align 4
194  %st4 = mul i64 %str, 5
195  %arrayidx13 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st4
196  %i5 = load float, ptr %arrayidx13, align 4
197  %add14 = fsub fast float %i5, %i4
198  %arrayidx16 = getelementptr inbounds float, ptr %s, i64 2
199  store float %add14, ptr %arrayidx16, align 4
200  %arrayidx18 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 14
201  %i6 = load float, ptr %arrayidx18, align 4
202  %st3 = mul i64 %str, 4
203  %arrayidx20 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st3
204  %i7 = load float, ptr %arrayidx20, align 4
205  %add21 = fsub fast float %i7, %i6
206  %arrayidx23 = getelementptr inbounds float, ptr %s, i64 3
207  store float %add21, ptr %arrayidx23, align 4
208  %arrayidx25 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 18
209  %st2 = mul i64 %str, 3
210  %i8 = load float, ptr %arrayidx25, align 4
211  %arrayidx27 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st2
212  %i9 = load float, ptr %arrayidx27, align 4
213  %add28 = fsub fast float %i9, %i8
214  %arrayidx30 = getelementptr inbounds float, ptr %s, i64 4
215  store float %add28, ptr %arrayidx30, align 4
216  %arrayidx32 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 22
217  %i10 = load float, ptr %arrayidx32, align 4
218  %st1 = mul i64 %str, 2
219  %arrayidx34 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %st1
220  %i11 = load float, ptr %arrayidx34, align 4
221  %add35 = fsub fast float %i11, %i10
222  %arrayidx37 = getelementptr inbounds float, ptr %s, i64 5
223  store float %add35, ptr %arrayidx37, align 4
224  %arrayidx39 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 26
225  %i12 = load float, ptr %arrayidx39, align 4
226  %arrayidx41 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 %str
227  %i13 = load float, ptr %arrayidx41, align 4
228  %add42 = fsub fast float %i13, %i12
229  %arrayidx44 = getelementptr inbounds float, ptr %s, i64 6
230  store float %add42, ptr %arrayidx44, align 4
231  %arrayidx46 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 30
232  %i14 = load float, ptr %arrayidx46, align 4
233  %arrayidx48 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 0
234  %i15 = load float, ptr %arrayidx48, align 4
235  %add49 = fsub fast float %i15, %i14
236  %arrayidx51 = getelementptr inbounds float, ptr %s, i64 7
237  store float %add49, ptr %arrayidx51, align 4
238  ret void
239}
240
; Two constant-stride groups with *different* element strides: forward
; indices 0,4,8,...,28 (stride +16 bytes) and backward indices
; 30,29,28,...,23 (stride -1 float = -4 bytes; index 24 is read by both
; groups). Expect strided loads with strides 16 and -4.
241define void @test3(ptr %p, ptr noalias %s) {
242; CHECK-LABEL: @test3(
243; CHECK-NEXT:  entry:
244; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
245; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
246; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
247; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> splat (i1 true), i32 8)
248; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -4, <8 x i1> splat (i1 true), i32 8)
249; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
250; CHECK-NEXT:    store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
251; CHECK-NEXT:    ret void
252;
253entry:
254  %arrayidx = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 0
255  %i = load float, ptr %arrayidx, align 4
256  %arrayidx1 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 30
257  %i1 = load float, ptr %arrayidx1, align 4
258  %add = fsub fast float %i1, %i
259  %arrayidx2 = getelementptr inbounds float, ptr %s, i64 0
260  store float %add, ptr %arrayidx2, align 4
261  %arrayidx4 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 4
262  %i2 = load float, ptr %arrayidx4, align 4
263  %arrayidx6 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 29
264  %i3 = load float, ptr %arrayidx6, align 4
265  %add7 = fsub fast float %i3, %i2
266  %arrayidx9 = getelementptr inbounds float, ptr %s, i64 1
267  store float %add7, ptr %arrayidx9, align 4
268  %arrayidx11 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 8
269  %i4 = load float, ptr %arrayidx11, align 4
270  %arrayidx13 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 28
271  %i5 = load float, ptr %arrayidx13, align 4
272  %add14 = fsub fast float %i5, %i4
273  %arrayidx16 = getelementptr inbounds float, ptr %s, i64 2
274  store float %add14, ptr %arrayidx16, align 4
275  %arrayidx18 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 12
276  %i6 = load float, ptr %arrayidx18, align 4
277  %arrayidx20 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 27
278  %i7 = load float, ptr %arrayidx20, align 4
279  %add21 = fsub fast float %i7, %i6
280  %arrayidx23 = getelementptr inbounds float, ptr %s, i64 3
281  store float %add21, ptr %arrayidx23, align 4
282  %arrayidx25 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 16
283  %i8 = load float, ptr %arrayidx25, align 4
284  %arrayidx27 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 26
285  %i9 = load float, ptr %arrayidx27, align 4
286  %add28 = fsub fast float %i9, %i8
287  %arrayidx30 = getelementptr inbounds float, ptr %s, i64 4
288  store float %add28, ptr %arrayidx30, align 4
289  %arrayidx32 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 20
290  %i10 = load float, ptr %arrayidx32, align 4
291  %arrayidx34 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 25
292  %i11 = load float, ptr %arrayidx34, align 4
293  %add35 = fsub fast float %i11, %i10
294  %arrayidx37 = getelementptr inbounds float, ptr %s, i64 5
295  store float %add35, ptr %arrayidx37, align 4
296  %arrayidx39 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 24
297  %i12 = load float, ptr %arrayidx39, align 4
298  %arrayidx41 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 24
299  %i13 = load float, ptr %arrayidx41, align 4
300  %add42 = fsub fast float %i13, %i12
301  %arrayidx44 = getelementptr inbounds float, ptr %s, i64 6
302  store float %add42, ptr %arrayidx44, align 4
303  %arrayidx46 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 28
304  %i14 = load float, ptr %arrayidx46, align 4
305  %arrayidx48 = getelementptr inbounds [48 x float], ptr %p, i64 0, i64 23
306  %i15 = load float, ptr %arrayidx48, align 4
307  %add49 = fsub fast float %i15, %i14
308  %arrayidx51 = getelementptr inbounds float, ptr %s, i64 7
309  store float %add49, ptr %arrayidx51, align 4
310  ret void
311}
312
313
; Same access pattern as @test but on bfloat elements (2 bytes each), so
; the byte strides are 8 / -8. The two RUN lines diverge here: without
; +zvfbfmin (NO-ZVFHMIN-ZVFBFMIN) the code stays scalar; with +zvfbfmin
; (ZVFHMIN-ZVFBFMIN) it is vectorized into two
; @llvm.experimental.vp.strided.load.v8bf16 calls.
314define void @test_bf16(ptr %p, ptr noalias %s) {
315; NO-ZVFHMIN-ZVFBFMIN-LABEL: @test_bf16(
316; NO-ZVFHMIN-ZVFBFMIN-NEXT:  entry:
317; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0
318; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4
319; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30
320; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I1:%.*]] = load bfloat, ptr [[ARRAYIDX1]], align 4
321; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD:%.*]] = fsub fast bfloat [[I1]], [[I]]
322; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0
323; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD]], ptr [[ARRAYIDX2]], align 4
324; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 4
325; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I2:%.*]] = load bfloat, ptr [[ARRAYIDX4]], align 4
326; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 26
327; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I3:%.*]] = load bfloat, ptr [[ARRAYIDX6]], align 4
328; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD7:%.*]] = fsub fast bfloat [[I3]], [[I2]]
329; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 1
330; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD7]], ptr [[ARRAYIDX9]], align 4
331; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 8
332; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I4:%.*]] = load bfloat, ptr [[ARRAYIDX11]], align 4
333; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 22
334; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I5:%.*]] = load bfloat, ptr [[ARRAYIDX13]], align 4
335; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD14:%.*]] = fsub fast bfloat [[I5]], [[I4]]
336; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 2
337; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD14]], ptr [[ARRAYIDX16]], align 4
338; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 12
339; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I6:%.*]] = load bfloat, ptr [[ARRAYIDX18]], align 4
340; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 18
341; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I7:%.*]] = load bfloat, ptr [[ARRAYIDX20]], align 4
342; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD21:%.*]] = fsub fast bfloat [[I7]], [[I6]]
343; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 3
344; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD21]], ptr [[ARRAYIDX23]], align 4
345; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 16
346; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I8:%.*]] = load bfloat, ptr [[ARRAYIDX25]], align 4
347; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 14
348; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I9:%.*]] = load bfloat, ptr [[ARRAYIDX27]], align 4
349; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD28:%.*]] = fsub fast bfloat [[I9]], [[I8]]
350; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 4
351; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD28]], ptr [[ARRAYIDX30]], align 4
352; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 20
353; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I10:%.*]] = load bfloat, ptr [[ARRAYIDX32]], align 4
354; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 10
355; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I11:%.*]] = load bfloat, ptr [[ARRAYIDX34]], align 4
356; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD35:%.*]] = fsub fast bfloat [[I11]], [[I10]]
357; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 5
358; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD35]], ptr [[ARRAYIDX37]], align 4
359; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 24
360; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I12:%.*]] = load bfloat, ptr [[ARRAYIDX39]], align 4
361; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 6
362; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I13:%.*]] = load bfloat, ptr [[ARRAYIDX41]], align 4
363; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD42:%.*]] = fsub fast bfloat [[I13]], [[I12]]
364; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 6
365; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD42]], ptr [[ARRAYIDX44]], align 4
366; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 28
367; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I14:%.*]] = load bfloat, ptr [[ARRAYIDX46]], align 4
368; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 2
369; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I15:%.*]] = load bfloat, ptr [[ARRAYIDX48]], align 4
370; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD49:%.*]] = fsub fast bfloat [[I15]], [[I14]]
371; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 7
372; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD49]], ptr [[ARRAYIDX51]], align 4
373; NO-ZVFHMIN-ZVFBFMIN-NEXT:    ret void
374;
375; ZVFHMIN-ZVFBFMIN-LABEL: @test_bf16(
376; ZVFHMIN-ZVFBFMIN-NEXT:  entry:
377; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0
378; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30
379; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0
380; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP15:%.*]] = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i64(ptr align 4 [[ARRAYIDX]], i64 8, <8 x i1> splat (i1 true), i32 8)
381; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP7:%.*]] = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -8, <8 x i1> splat (i1 true), i32 8)
382; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP16:%.*]] = fsub fast <8 x bfloat> [[TMP7]], [[TMP15]]
383; ZVFHMIN-ZVFBFMIN-NEXT:    store <8 x bfloat> [[TMP16]], ptr [[ARRAYIDX2]], align 4
384; ZVFHMIN-ZVFBFMIN-NEXT:    ret void
385;
386entry:
387  %arrayidx = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 0
388  %i = load bfloat, ptr %arrayidx, align 4
389  %arrayidx1 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 30
390  %i1 = load bfloat, ptr %arrayidx1, align 4
391  %add = fsub fast bfloat %i1, %i
392  %arrayidx2 = getelementptr inbounds bfloat, ptr %s, i64 0
393  store bfloat %add, ptr %arrayidx2, align 4
394  %arrayidx4 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 4
395  %i2 = load bfloat, ptr %arrayidx4, align 4
396  %arrayidx6 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 26
397  %i3 = load bfloat, ptr %arrayidx6, align 4
398  %add7 = fsub fast bfloat %i3, %i2
399  %arrayidx9 = getelementptr inbounds bfloat, ptr %s, i64 1
400  store bfloat %add7, ptr %arrayidx9, align 4
401  %arrayidx11 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 8
402  %i4 = load bfloat, ptr %arrayidx11, align 4
403  %arrayidx13 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 22
404  %i5 = load bfloat, ptr %arrayidx13, align 4
405  %add14 = fsub fast bfloat %i5, %i4
406  %arrayidx16 = getelementptr inbounds bfloat, ptr %s, i64 2
407  store bfloat %add14, ptr %arrayidx16, align 4
408  %arrayidx18 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 12
409  %i6 = load bfloat, ptr %arrayidx18, align 4
410  %arrayidx20 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 18
411  %i7 = load bfloat, ptr %arrayidx20, align 4
412  %add21 = fsub fast bfloat %i7, %i6
413  %arrayidx23 = getelementptr inbounds bfloat, ptr %s, i64 3
414  store bfloat %add21, ptr %arrayidx23, align 4
415  %arrayidx25 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 16
416  %i8 = load bfloat, ptr %arrayidx25, align 4
417  %arrayidx27 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 14
418  %i9 = load bfloat, ptr %arrayidx27, align 4
419  %add28 = fsub fast bfloat %i9, %i8
420  %arrayidx30 = getelementptr inbounds bfloat, ptr %s, i64 4
421  store bfloat %add28, ptr %arrayidx30, align 4
422  %arrayidx32 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 20
423  %i10 = load bfloat, ptr %arrayidx32, align 4
424  %arrayidx34 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 10
425  %i11 = load bfloat, ptr %arrayidx34, align 4
426  %add35 = fsub fast bfloat %i11, %i10
427  %arrayidx37 = getelementptr inbounds bfloat, ptr %s, i64 5
428  store bfloat %add35, ptr %arrayidx37, align 4
429  %arrayidx39 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 24
430  %i12 = load bfloat, ptr %arrayidx39, align 4
431  %arrayidx41 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 6
432  %i13 = load bfloat, ptr %arrayidx41, align 4
433  %add42 = fsub fast bfloat %i13, %i12
434  %arrayidx44 = getelementptr inbounds bfloat, ptr %s, i64 6
435  store bfloat %add42, ptr %arrayidx44, align 4
436  %arrayidx46 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 28
437  %i14 = load bfloat, ptr %arrayidx46, align 4
438  %arrayidx48 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 2
439  %i15 = load bfloat, ptr %arrayidx48, align 4
440  %add49 = fsub fast bfloat %i15, %i14
441  %arrayidx51 = getelementptr inbounds bfloat, ptr %s, i64 7
442  store bfloat %add49, ptr %arrayidx51, align 4
443  ret void
444}
445
446define void @test_f16(ptr %p, ptr noalias %s) {
447; NO-ZVFHMIN-ZVFBFMIN-LABEL: @test_f16(
448; NO-ZVFHMIN-ZVFBFMIN-NEXT:  entry:
449; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0
450; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I:%.*]] = load half, ptr [[ARRAYIDX]], align 4
451; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30
452; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I1:%.*]] = load half, ptr [[ARRAYIDX1]], align 4
453; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD:%.*]] = fsub fast half [[I1]], [[I]]
454; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0
455; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD]], ptr [[ARRAYIDX2]], align 4
456; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 4
457; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I2:%.*]] = load half, ptr [[ARRAYIDX4]], align 4
458; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 26
459; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I3:%.*]] = load half, ptr [[ARRAYIDX6]], align 4
460; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD7:%.*]] = fsub fast half [[I3]], [[I2]]
461; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds half, ptr [[S]], i64 1
462; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD7]], ptr [[ARRAYIDX9]], align 4
463; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 8
464; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I4:%.*]] = load half, ptr [[ARRAYIDX11]], align 4
465; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 22
466; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I5:%.*]] = load half, ptr [[ARRAYIDX13]], align 4
467; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD14:%.*]] = fsub fast half [[I5]], [[I4]]
468; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds half, ptr [[S]], i64 2
469; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD14]], ptr [[ARRAYIDX16]], align 4
470; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 12
471; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I6:%.*]] = load half, ptr [[ARRAYIDX18]], align 4
472; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 18
473; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I7:%.*]] = load half, ptr [[ARRAYIDX20]], align 4
474; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD21:%.*]] = fsub fast half [[I7]], [[I6]]
475; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds half, ptr [[S]], i64 3
476; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD21]], ptr [[ARRAYIDX23]], align 4
477; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 16
478; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I8:%.*]] = load half, ptr [[ARRAYIDX25]], align 4
479; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 14
480; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I9:%.*]] = load half, ptr [[ARRAYIDX27]], align 4
481; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD28:%.*]] = fsub fast half [[I9]], [[I8]]
482; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds half, ptr [[S]], i64 4
483; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD28]], ptr [[ARRAYIDX30]], align 4
484; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 20
485; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I10:%.*]] = load half, ptr [[ARRAYIDX32]], align 4
486; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 10
487; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I11:%.*]] = load half, ptr [[ARRAYIDX34]], align 4
488; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD35:%.*]] = fsub fast half [[I11]], [[I10]]
489; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds half, ptr [[S]], i64 5
490; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD35]], ptr [[ARRAYIDX37]], align 4
491; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 24
492; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I12:%.*]] = load half, ptr [[ARRAYIDX39]], align 4
493; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 6
494; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I13:%.*]] = load half, ptr [[ARRAYIDX41]], align 4
495; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD42:%.*]] = fsub fast half [[I13]], [[I12]]
496; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds half, ptr [[S]], i64 6
497; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD42]], ptr [[ARRAYIDX44]], align 4
498; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 28
499; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I14:%.*]] = load half, ptr [[ARRAYIDX46]], align 4
500; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 2
501; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I15:%.*]] = load half, ptr [[ARRAYIDX48]], align 4
502; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD49:%.*]] = fsub fast half [[I15]], [[I14]]
503; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds half, ptr [[S]], i64 7
504; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD49]], ptr [[ARRAYIDX51]], align 4
505; NO-ZVFHMIN-ZVFBFMIN-NEXT:    ret void
506;
507; ZVFHMIN-ZVFBFMIN-LABEL: @test_f16(
508; ZVFHMIN-ZVFBFMIN-NEXT:  entry:
509; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0
510; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30
511; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0
512; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP15:%.*]] = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i64(ptr align 4 [[ARRAYIDX]], i64 8, <8 x i1> splat (i1 true), i32 8)
513; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -8, <8 x i1> splat (i1 true), i32 8)
514; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP16:%.*]] = fsub fast <8 x half> [[TMP7]], [[TMP15]]
515; ZVFHMIN-ZVFBFMIN-NEXT:    store <8 x half> [[TMP16]], ptr [[ARRAYIDX2]], align 4
516; ZVFHMIN-ZVFBFMIN-NEXT:    ret void
517;
518entry:
519  %arrayidx = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 0
520  %i = load half, ptr %arrayidx, align 4
521  %arrayidx1 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 30
522  %i1 = load half, ptr %arrayidx1, align 4
523  %add = fsub fast half %i1, %i
524  %arrayidx2 = getelementptr inbounds half, ptr %s, i64 0
525  store half %add, ptr %arrayidx2, align 4
526  %arrayidx4 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 4
527  %i2 = load half, ptr %arrayidx4, align 4
528  %arrayidx6 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 26
529  %i3 = load half, ptr %arrayidx6, align 4
530  %add7 = fsub fast half %i3, %i2
531  %arrayidx9 = getelementptr inbounds half, ptr %s, i64 1
532  store half %add7, ptr %arrayidx9, align 4
533  %arrayidx11 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 8
534  %i4 = load half, ptr %arrayidx11, align 4
535  %arrayidx13 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 22
536  %i5 = load half, ptr %arrayidx13, align 4
537  %add14 = fsub fast half %i5, %i4
538  %arrayidx16 = getelementptr inbounds half, ptr %s, i64 2
539  store half %add14, ptr %arrayidx16, align 4
540  %arrayidx18 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 12
541  %i6 = load half, ptr %arrayidx18, align 4
542  %arrayidx20 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 18
543  %i7 = load half, ptr %arrayidx20, align 4
544  %add21 = fsub fast half %i7, %i6
545  %arrayidx23 = getelementptr inbounds half, ptr %s, i64 3
546  store half %add21, ptr %arrayidx23, align 4
547  %arrayidx25 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 16
548  %i8 = load half, ptr %arrayidx25, align 4
549  %arrayidx27 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 14
550  %i9 = load half, ptr %arrayidx27, align 4
551  %add28 = fsub fast half %i9, %i8
552  %arrayidx30 = getelementptr inbounds half, ptr %s, i64 4
553  store half %add28, ptr %arrayidx30, align 4
554  %arrayidx32 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 20
555  %i10 = load half, ptr %arrayidx32, align 4
556  %arrayidx34 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 10
557  %i11 = load half, ptr %arrayidx34, align 4
558  %add35 = fsub fast half %i11, %i10
559  %arrayidx37 = getelementptr inbounds half, ptr %s, i64 5
560  store half %add35, ptr %arrayidx37, align 4
561  %arrayidx39 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 24
562  %i12 = load half, ptr %arrayidx39, align 4
563  %arrayidx41 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 6
564  %i13 = load half, ptr %arrayidx41, align 4
565  %add42 = fsub fast half %i13, %i12
566  %arrayidx44 = getelementptr inbounds half, ptr %s, i64 6
567  store half %add42, ptr %arrayidx44, align 4
568  %arrayidx46 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 28
569  %i14 = load half, ptr %arrayidx46, align 4
570  %arrayidx48 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 2
571  %i15 = load half, ptr %arrayidx48, align 4
572  %add49 = fsub fast half %i15, %i14
573  %arrayidx51 = getelementptr inbounds half, ptr %s, i64 7
574  store half %add49, ptr %arrayidx51, align 4
575  ret void
576}
577