xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll (revision ab7110bcd6b137803935508de8c9f6af377f9454)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5
6target triple = "aarch64-unknown-linux-gnu"
7
8;
9; FADDA
10;
11
12; No single instruction NEON support. Use SVE.
; The call has no fast-math flags; the CHECK lines expect one SVE FADDA under
; a vl4 predicate over the <4 x half> register argument.
13define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(1,0) #0 {
14; CHECK-LABEL: fadda_v4f16:
15; CHECK:       // %bb.0:
16; CHECK-NEXT:    ptrue p0.h, vl4
17; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
18; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
19; CHECK-NEXT:    fadda h0, p0, h0, z1.h
20; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
21; CHECK-NEXT:    ret
22  %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
23  ret half %res
24}
25
26; No single instruction NEON support. Use SVE.
; The call has no fast-math flags; the CHECK lines expect one SVE FADDA under
; a vl8 predicate over the <8 x half> register argument.
27define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(1,0) #0 {
28; CHECK-LABEL: fadda_v8f16:
29; CHECK:       // %bb.0:
30; CHECK-NEXT:    ptrue p0.h, vl8
31; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
32; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
33; CHECK-NEXT:    fadda h0, p0, h0, z1.h
34; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
35; CHECK-NEXT:    ret
36  %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
37  ret half %res
38}
39
; fadd reduction without fast-math flags over a <16 x half> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1h + one FADDA (vl16).
40define half @fadda_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
41; CHECK-LABEL: fadda_v16f16:
42; CHECK:       // %bb.0:
43; CHECK-NEXT:    ptrue p0.h, vl16
44; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
45; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
46; CHECK-NEXT:    fadda h0, p0, h0, z1.h
47; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
48; CHECK-NEXT:    ret
49  %op = load <16 x half>, ptr %a
50  %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
51  ret half %res
52}
53
; fadd reduction without fast-math flags over a <32 x half> loaded from %a.
; There is no vscale_range attribute and the expected code differs per prefix.
; VBITS_GE_256 expects two chained vl16 FADDAs (first the low half, then the
; half at element offset 16); VBITS_GE_512 expects a single vl32 FADDA.
54define half @fadda_v32f16(half %start, ptr %a) #0 {
55; VBITS_GE_256-LABEL: fadda_v32f16:
56; VBITS_GE_256:       // %bb.0:
57; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
58; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
59; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
60; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
61; VBITS_GE_256-NEXT:    fadda h0, p0, h0, z1.h
62; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
63; VBITS_GE_256-NEXT:    fadda h0, p0, h0, z1.h
64; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
65; VBITS_GE_256-NEXT:    ret
66;
67; VBITS_GE_512-LABEL: fadda_v32f16:
68; VBITS_GE_512:       // %bb.0:
69; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
70; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 def $z0
71; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
72; VBITS_GE_512-NEXT:    fadda h0, p0, h0, z1.h
73; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
74; VBITS_GE_512-NEXT:    ret
75  %op = load <32 x half>, ptr %a
76  %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
77  ret half %res
78}
79
; fadd reduction without fast-math flags over a <64 x half> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1h + one FADDA (vl64).
80define half @fadda_v64f16(half %start, ptr %a) vscale_range(8,0) #0 {
81; CHECK-LABEL: fadda_v64f16:
82; CHECK:       // %bb.0:
83; CHECK-NEXT:    ptrue p0.h, vl64
84; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
85; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
86; CHECK-NEXT:    fadda h0, p0, h0, z1.h
87; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
88; CHECK-NEXT:    ret
89  %op = load <64 x half>, ptr %a
90  %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
91  ret half %res
92}
93
; fadd reduction without fast-math flags over a <128 x half> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1h + one FADDA (vl128).
94define half @fadda_v128f16(half %start, ptr %a) vscale_range(16,0) #0 {
95; CHECK-LABEL: fadda_v128f16:
96; CHECK:       // %bb.0:
97; CHECK-NEXT:    ptrue p0.h, vl128
98; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
99; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
100; CHECK-NEXT:    fadda h0, p0, h0, z1.h
101; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
102; CHECK-NEXT:    ret
103  %op = load <128 x half>, ptr %a
104  %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
105  ret half %res
106}
107
108; No single instruction NEON support. Use SVE.
; The call has no fast-math flags; the CHECK lines expect one SVE FADDA under
; a vl2 predicate over the <2 x float> register argument.
109define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(1,0) #0 {
110; CHECK-LABEL: fadda_v2f32:
111; CHECK:       // %bb.0:
112; CHECK-NEXT:    ptrue p0.s, vl2
113; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
114; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
115; CHECK-NEXT:    fadda s0, p0, s0, z1.s
116; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
117; CHECK-NEXT:    ret
118  %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
119  ret float %res
120}
121
122; No single instruction NEON support. Use SVE.
; The call has no fast-math flags; the CHECK lines expect one SVE FADDA under
; a vl4 predicate over the <4 x float> register argument.
123define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(1,0) #0 {
124; CHECK-LABEL: fadda_v4f32:
125; CHECK:       // %bb.0:
126; CHECK-NEXT:    ptrue p0.s, vl4
127; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
128; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
129; CHECK-NEXT:    fadda s0, p0, s0, z1.s
130; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
131; CHECK-NEXT:    ret
132  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
133  ret float %res
134}
135
; fadd reduction without fast-math flags over an <8 x float> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1w + one FADDA (vl8).
136define float @fadda_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
137; CHECK-LABEL: fadda_v8f32:
138; CHECK:       // %bb.0:
139; CHECK-NEXT:    ptrue p0.s, vl8
140; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
141; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
142; CHECK-NEXT:    fadda s0, p0, s0, z1.s
143; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
144; CHECK-NEXT:    ret
145  %op = load <8 x float>, ptr %a
146  %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
147  ret float %res
148}
149
; fadd reduction without fast-math flags over a <16 x float> loaded from %a.
; There is no vscale_range attribute and the expected code differs per prefix.
; VBITS_GE_256 expects two chained vl8 FADDAs (first the low half, then the
; half at element offset 8); VBITS_GE_512 expects a single vl16 FADDA.
150define float @fadda_v16f32(float %start, ptr %a) #0 {
151; VBITS_GE_256-LABEL: fadda_v16f32:
152; VBITS_GE_256:       // %bb.0:
153; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
154; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
155; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
156; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
157; VBITS_GE_256-NEXT:    fadda s0, p0, s0, z1.s
158; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
159; VBITS_GE_256-NEXT:    fadda s0, p0, s0, z1.s
160; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
161; VBITS_GE_256-NEXT:    ret
162;
163; VBITS_GE_512-LABEL: fadda_v16f32:
164; VBITS_GE_512:       // %bb.0:
165; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
166; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 def $z0
167; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
168; VBITS_GE_512-NEXT:    fadda s0, p0, s0, z1.s
169; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
170; VBITS_GE_512-NEXT:    ret
171  %op = load <16 x float>, ptr %a
172  %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
173  ret float %res
174}
175
; fadd reduction without fast-math flags over a <32 x float> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1w + one FADDA (vl32).
176define float @fadda_v32f32(float %start, ptr %a) vscale_range(8,0) #0 {
177; CHECK-LABEL: fadda_v32f32:
178; CHECK:       // %bb.0:
179; CHECK-NEXT:    ptrue p0.s, vl32
180; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
181; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
182; CHECK-NEXT:    fadda s0, p0, s0, z1.s
183; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
184; CHECK-NEXT:    ret
185  %op = load <32 x float>, ptr %a
186  %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
187  ret float %res
188}
189
; fadd reduction without fast-math flags over a <64 x float> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1w + one FADDA (vl64).
190define float @fadda_v64f32(float %start, ptr %a) vscale_range(16,0) #0 {
191; CHECK-LABEL: fadda_v64f32:
192; CHECK:       // %bb.0:
193; CHECK-NEXT:    ptrue p0.s, vl64
194; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
195; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
196; CHECK-NEXT:    fadda s0, p0, s0, z1.s
197; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
198; CHECK-NEXT:    ret
199  %op = load <64 x float>, ptr %a
200  %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
201  ret float %res
202}
203
204; Single element vectors need no reduction instruction. Don't use SVE.
; The CHECK lines expect only a scalar fadd of d0 and d1 (no ptrue, no FADDA).
205define double @fadda_v1f64(double %start, <1 x double> %a) vscale_range(1,0) #0 {
206; CHECK-LABEL: fadda_v1f64:
207; CHECK:       // %bb.0:
208; CHECK-NEXT:    fadd d0, d0, d1
209; CHECK-NEXT:    ret
210  %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
211  ret double %res
212}
213
214; No single instruction NEON support. Use SVE.
; The call has no fast-math flags; the CHECK lines expect one SVE FADDA under
; a vl2 predicate over the <2 x double> register argument.
215define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(1,0) #0 {
216; CHECK-LABEL: fadda_v2f64:
217; CHECK:       // %bb.0:
218; CHECK-NEXT:    ptrue p0.d, vl2
219; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
220; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
221; CHECK-NEXT:    fadda d0, p0, d0, z1.d
222; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
223; CHECK-NEXT:    ret
224  %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
225  ret double %res
226}
227
; fadd reduction without fast-math flags over a <4 x double> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1d + one FADDA (vl4).
228define double @fadda_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
229; CHECK-LABEL: fadda_v4f64:
230; CHECK:       // %bb.0:
231; CHECK-NEXT:    ptrue p0.d, vl4
232; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
233; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
234; CHECK-NEXT:    fadda d0, p0, d0, z1.d
235; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
236; CHECK-NEXT:    ret
237  %op = load <4 x double>, ptr %a
238  %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
239  ret double %res
240}
241
; fadd reduction without fast-math flags over an <8 x double> loaded from %a.
; There is no vscale_range attribute and the expected code differs per prefix.
; VBITS_GE_256 expects two chained vl4 FADDAs (first the low half, then the
; half at element offset 4); VBITS_GE_512 expects a single vl8 FADDA.
242define double @fadda_v8f64(double %start, ptr %a) #0 {
243; VBITS_GE_256-LABEL: fadda_v8f64:
244; VBITS_GE_256:       // %bb.0:
245; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
246; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
247; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
248; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
249; VBITS_GE_256-NEXT:    fadda d0, p0, d0, z1.d
250; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
251; VBITS_GE_256-NEXT:    fadda d0, p0, d0, z1.d
252; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
253; VBITS_GE_256-NEXT:    ret
254;
255; VBITS_GE_512-LABEL: fadda_v8f64:
256; VBITS_GE_512:       // %bb.0:
257; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
258; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
259; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
260; VBITS_GE_512-NEXT:    fadda d0, p0, d0, z1.d
261; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
262; VBITS_GE_512-NEXT:    ret
263  %op = load <8 x double>, ptr %a
264  %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
265  ret double %res
266}
267
; fadd reduction without fast-math flags over a <16 x double> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1d + one FADDA (vl16).
268define double @fadda_v16f64(double %start, ptr %a) vscale_range(8,0) #0 {
269; CHECK-LABEL: fadda_v16f64:
270; CHECK:       // %bb.0:
271; CHECK-NEXT:    ptrue p0.d, vl16
272; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
273; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
274; CHECK-NEXT:    fadda d0, p0, d0, z1.d
275; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
276; CHECK-NEXT:    ret
277  %op = load <16 x double>, ptr %a
278  %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
279  ret double %res
280}
281
; fadd reduction without fast-math flags over a <32 x double> loaded from %a.
; All RUN lines share the CHECK prefix here and expect ld1d + one FADDA (vl32).
282define double @fadda_v32f64(double %start, ptr %a) vscale_range(16,0) #0 {
283; CHECK-LABEL: fadda_v32f64:
284; CHECK:       // %bb.0:
285; CHECK-NEXT:    ptrue p0.d, vl32
286; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
287; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
288; CHECK-NEXT:    fadda d0, p0, d0, z1.d
289; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
290; CHECK-NEXT:    ret
291  %op = load <32 x double>, ptr %a
292  %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
293  ret double %res
294}
295
296;
297; FADDV
298;
299
300; No single instruction NEON support for 4 element vectors.
; The call is marked `fast`; the CHECK lines expect SVE FADDV under a vl4
; predicate, then a scalar fadd of h0 and the reduced value.
301define half @faddv_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
302; CHECK-LABEL: faddv_v4f16:
303; CHECK:       // %bb.0:
304; CHECK-NEXT:    ptrue p0.h, vl4
305; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
306; CHECK-NEXT:    faddv h1, p0, z1.h
307; CHECK-NEXT:    fadd h0, h0, h1
308; CHECK-NEXT:    ret
309  %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
310  ret half %res
311}
312
313; No single instruction NEON support for 8 element vectors.
; The call is marked `fast`; the CHECK lines expect SVE FADDV under a vl8
; predicate, then a scalar fadd of h0 and the reduced value.
314define half @faddv_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
315; CHECK-LABEL: faddv_v8f16:
316; CHECK:       // %bb.0:
317; CHECK-NEXT:    ptrue p0.h, vl8
318; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
319; CHECK-NEXT:    faddv h1, p0, z1.h
320; CHECK-NEXT:    fadd h0, h0, h1
321; CHECK-NEXT:    ret
322  %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
323  ret half %res
324}
325
; `fast` fadd reduction over a <16 x half> loaded from %a. All RUN lines
; expect ld1h + FADDV (vl16), then a scalar fadd of h0 and the reduced value.
326define half @faddv_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
327; CHECK-LABEL: faddv_v16f16:
328; CHECK:       // %bb.0:
329; CHECK-NEXT:    ptrue p0.h, vl16
330; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
331; CHECK-NEXT:    faddv h1, p0, z1.h
332; CHECK-NEXT:    fadd h0, h0, h1
333; CHECK-NEXT:    ret
334  %op = load <16 x half>, ptr %a
335  %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
336  ret half %res
337}
338
; `fast` fadd reduction over a <32 x half> loaded from %a, with no vscale_range
; attribute; the expected code differs per prefix. VBITS_GE_256 expects the two
; vl16 halves to be combined with a predicated vector fadd before one FADDV;
; VBITS_GE_512 expects a single vl32 FADDV. Both end with a scalar fadd.
339define half @faddv_v32f16(half %start, ptr %a) #0 {
340; VBITS_GE_256-LABEL: faddv_v32f16:
341; VBITS_GE_256:       // %bb.0:
342; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
343; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
344; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
345; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
346; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
347; VBITS_GE_256-NEXT:    faddv h1, p0, z1.h
348; VBITS_GE_256-NEXT:    fadd h0, h0, h1
349; VBITS_GE_256-NEXT:    ret
350;
351; VBITS_GE_512-LABEL: faddv_v32f16:
352; VBITS_GE_512:       // %bb.0:
353; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
354; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
355; VBITS_GE_512-NEXT:    faddv h1, p0, z1.h
356; VBITS_GE_512-NEXT:    fadd h0, h0, h1
357; VBITS_GE_512-NEXT:    ret
358  %op = load <32 x half>, ptr %a
359  %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
360  ret half %res
361}
362
; `fast` fadd reduction over a <64 x half> loaded from %a. All RUN lines
; expect ld1h + FADDV (vl64), then a scalar fadd of h0 and the reduced value.
363define half @faddv_v64f16(half %start, ptr %a) vscale_range(8,0) #0 {
364; CHECK-LABEL: faddv_v64f16:
365; CHECK:       // %bb.0:
366; CHECK-NEXT:    ptrue p0.h, vl64
367; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
368; CHECK-NEXT:    faddv h1, p0, z1.h
369; CHECK-NEXT:    fadd h0, h0, h1
370; CHECK-NEXT:    ret
371  %op = load <64 x half>, ptr %a
372  %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
373  ret half %res
374}
375
; `fast` fadd reduction over a <128 x half> loaded from %a. All RUN lines
; expect ld1h + FADDV (vl128), then a scalar fadd of h0 and the reduced value.
376define half @faddv_v128f16(half %start, ptr %a) vscale_range(16,0) #0 {
377; CHECK-LABEL: faddv_v128f16:
378; CHECK:       // %bb.0:
379; CHECK-NEXT:    ptrue p0.h, vl128
380; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
381; CHECK-NEXT:    faddv h1, p0, z1.h
382; CHECK-NEXT:    fadd h0, h0, h1
383; CHECK-NEXT:    ret
384  %op = load <128 x half>, ptr %a
385  %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
386  ret half %res
387}
388
389; Don't use SVE for 2 element vectors.
; The CHECK lines expect NEON faddp on v1.2s followed by a scalar fadd.
390define float @faddv_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
391; CHECK-LABEL: faddv_v2f32:
392; CHECK:       // %bb.0:
393; CHECK-NEXT:    faddp s1, v1.2s
394; CHECK-NEXT:    fadd s0, s0, s1
395; CHECK-NEXT:    ret
396  %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
397  ret float %res
398}
399
400; No single instruction NEON support for 4 element vectors.
; The call is marked `fast`; the CHECK lines expect SVE FADDV under a vl4
; predicate, then a scalar fadd of s0 and the reduced value.
401define float @faddv_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
402; CHECK-LABEL: faddv_v4f32:
403; CHECK:       // %bb.0:
404; CHECK-NEXT:    ptrue p0.s, vl4
405; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
406; CHECK-NEXT:    faddv s1, p0, z1.s
407; CHECK-NEXT:    fadd s0, s0, s1
408; CHECK-NEXT:    ret
409  %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
410  ret float %res
411}
412
; `fast` fadd reduction over an <8 x float> loaded from %a. All RUN lines
; expect ld1w + FADDV (vl8), then a scalar fadd of s0 and the reduced value.
413define float @faddv_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
414; CHECK-LABEL: faddv_v8f32:
415; CHECK:       // %bb.0:
416; CHECK-NEXT:    ptrue p0.s, vl8
417; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
418; CHECK-NEXT:    faddv s1, p0, z1.s
419; CHECK-NEXT:    fadd s0, s0, s1
420; CHECK-NEXT:    ret
421  %op = load <8 x float>, ptr %a
422  %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
423  ret float %res
424}
425
; `fast` fadd reduction over a <16 x float> loaded from %a, with no vscale_range
; attribute; the expected code differs per prefix. VBITS_GE_256 expects the two
; vl8 halves to be combined with a predicated vector fadd before one FADDV;
; VBITS_GE_512 expects a single vl16 FADDV. Both end with a scalar fadd.
426define float @faddv_v16f32(float %start, ptr %a) #0 {
427; VBITS_GE_256-LABEL: faddv_v16f32:
428; VBITS_GE_256:       // %bb.0:
429; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
430; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
431; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
432; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
433; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
434; VBITS_GE_256-NEXT:    faddv s1, p0, z1.s
435; VBITS_GE_256-NEXT:    fadd s0, s0, s1
436; VBITS_GE_256-NEXT:    ret
437;
438; VBITS_GE_512-LABEL: faddv_v16f32:
439; VBITS_GE_512:       // %bb.0:
440; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
441; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
442; VBITS_GE_512-NEXT:    faddv s1, p0, z1.s
443; VBITS_GE_512-NEXT:    fadd s0, s0, s1
444; VBITS_GE_512-NEXT:    ret
445  %op = load <16 x float>, ptr %a
446  %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
447  ret float %res
448}
449
; `fast` fadd reduction over a <32 x float> loaded from %a. All RUN lines
; expect ld1w + FADDV (vl32), then a scalar fadd of s0 and the reduced value.
450define float @faddv_v32f32(float %start, ptr %a) vscale_range(8,0) #0 {
451; CHECK-LABEL: faddv_v32f32:
452; CHECK:       // %bb.0:
453; CHECK-NEXT:    ptrue p0.s, vl32
454; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
455; CHECK-NEXT:    faddv s1, p0, z1.s
456; CHECK-NEXT:    fadd s0, s0, s1
457; CHECK-NEXT:    ret
458  %op = load <32 x float>, ptr %a
459  %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
460  ret float %res
461}
462
; `fast` fadd reduction over a <64 x float> loaded from %a. All RUN lines
; expect ld1w + FADDV (vl64), then a scalar fadd of s0 and the reduced value.
463define float @faddv_v64f32(float %start, ptr %a) vscale_range(16,0) #0 {
464; CHECK-LABEL: faddv_v64f32:
465; CHECK:       // %bb.0:
466; CHECK-NEXT:    ptrue p0.s, vl64
467; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
468; CHECK-NEXT:    faddv s1, p0, z1.s
469; CHECK-NEXT:    fadd s0, s0, s1
470; CHECK-NEXT:    ret
471  %op = load <64 x float>, ptr %a
472  %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
473  ret float %res
474}
475
476; Don't use SVE for 1 element vectors.
; The CHECK lines expect only a scalar fadd of d0 and d1.
477define double @faddv_v1f64(double %start, <1 x double> %a) vscale_range(2,0) #0 {
478; CHECK-LABEL: faddv_v1f64:
479; CHECK:       // %bb.0:
480; CHECK-NEXT:    fadd d0, d0, d1
481; CHECK-NEXT:    ret
482  %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
483  ret double %res
484}
485
486; Don't use SVE for 2 element vectors.
; The CHECK lines expect NEON faddp on v1.2d followed by a scalar fadd.
487define double @faddv_v2f64(double %start, <2 x double> %a) vscale_range(2,0) #0 {
488; CHECK-LABEL: faddv_v2f64:
489; CHECK:       // %bb.0:
490; CHECK-NEXT:    faddp d1, v1.2d
491; CHECK-NEXT:    fadd d0, d0, d1
492; CHECK-NEXT:    ret
493  %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
494  ret double %res
495}
496
; `fast` fadd reduction over a <4 x double> loaded from %a. All RUN lines
; expect ld1d + FADDV (vl4), then a scalar fadd of d0 and the reduced value.
497define double @faddv_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
498; CHECK-LABEL: faddv_v4f64:
499; CHECK:       // %bb.0:
500; CHECK-NEXT:    ptrue p0.d, vl4
501; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
502; CHECK-NEXT:    faddv d1, p0, z1.d
503; CHECK-NEXT:    fadd d0, d0, d1
504; CHECK-NEXT:    ret
505  %op = load <4 x double>, ptr %a
506  %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
507  ret double %res
508}
509
; `fast` fadd reduction over an <8 x double> loaded from %a, with no
; vscale_range attribute; the expected code differs per prefix. VBITS_GE_256
; expects the two vl4 halves to be combined with a predicated vector fadd
; before one FADDV; VBITS_GE_512 expects a single vl8 FADDV. Both end with a
; scalar fadd.
510define double @faddv_v8f64(double %start, ptr %a) #0 {
511; VBITS_GE_256-LABEL: faddv_v8f64:
512; VBITS_GE_256:       // %bb.0:
513; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
514; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
515; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
516; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
517; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
518; VBITS_GE_256-NEXT:    faddv d1, p0, z1.d
519; VBITS_GE_256-NEXT:    fadd d0, d0, d1
520; VBITS_GE_256-NEXT:    ret
521;
522; VBITS_GE_512-LABEL: faddv_v8f64:
523; VBITS_GE_512:       // %bb.0:
524; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
525; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
526; VBITS_GE_512-NEXT:    faddv d1, p0, z1.d
527; VBITS_GE_512-NEXT:    fadd d0, d0, d1
528; VBITS_GE_512-NEXT:    ret
529  %op = load <8 x double>, ptr %a
530  %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
531  ret double %res
532}
533
; `fast` fadd reduction over a <16 x double> loaded from %a. All RUN lines
; expect ld1d + FADDV (vl16), then a scalar fadd of d0 and the reduced value.
534define double @faddv_v16f64(double %start, ptr %a) vscale_range(8,0) #0 {
535; CHECK-LABEL: faddv_v16f64:
536; CHECK:       // %bb.0:
537; CHECK-NEXT:    ptrue p0.d, vl16
538; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
539; CHECK-NEXT:    faddv d1, p0, z1.d
540; CHECK-NEXT:    fadd d0, d0, d1
541; CHECK-NEXT:    ret
542  %op = load <16 x double>, ptr %a
543  %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
544  ret double %res
545}
546
; `fast` fadd reduction over a <32 x double> loaded from %a. All RUN lines
; expect ld1d + FADDV (vl32), then a scalar fadd of d0 and the reduced value.
547define double @faddv_v32f64(double %start, ptr %a) vscale_range(16,0) #0 {
548; CHECK-LABEL: faddv_v32f64:
549; CHECK:       // %bb.0:
550; CHECK-NEXT:    ptrue p0.d, vl32
551; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
552; CHECK-NEXT:    faddv d1, p0, z1.d
553; CHECK-NEXT:    fadd d0, d0, d1
554; CHECK-NEXT:    ret
555  %op = load <32 x double>, ptr %a
556  %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
557  ret double %res
558}
559
560;
561; FMAXNMV
562;
563
564; Don't use SVE for 64-bit f16 vectors.
; The CHECK lines expect NEON fmaxnmv on v0.4h, not an SVE reduction.
565define half @fmaxv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
566; CHECK-LABEL: fmaxv_v4f16:
567; CHECK:       // %bb.0:
568; CHECK-NEXT:    fmaxnmv h0, v0.4h
569; CHECK-NEXT:    ret
570  %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
571  ret half %res
572}
573
574; Don't use SVE for 128-bit f16 vectors.
; The CHECK lines expect NEON fmaxnmv on v0.8h, not an SVE reduction.
575define half @fmaxv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
576; CHECK-LABEL: fmaxv_v8f16:
577; CHECK:       // %bb.0:
578; CHECK-NEXT:    fmaxnmv h0, v0.8h
579; CHECK-NEXT:    ret
580  %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
581  ret half %res
582}
583
; fmax reduction over a <16 x half> loaded from %a. All RUN lines expect
; ld1h + SVE FMAXNMV under a vl16 predicate.
584define half @fmaxv_v16f16(ptr %a) vscale_range(2,0) #0 {
585; CHECK-LABEL: fmaxv_v16f16:
586; CHECK:       // %bb.0:
587; CHECK-NEXT:    ptrue p0.h, vl16
588; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
589; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
590; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
591; CHECK-NEXT:    ret
592  %op = load <16 x half>, ptr %a
593  %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
594  ret half %res
595}
596
; fmax reduction over a <32 x half> loaded from %a, with no vscale_range
; attribute; the expected code differs per prefix. VBITS_GE_256 expects the two
; vl16 halves to be combined with a predicated FMAXNM before one FMAXNMV;
; VBITS_GE_512 expects a single vl32 FMAXNMV.
597define half @fmaxv_v32f16(ptr %a) #0 {
598; VBITS_GE_256-LABEL: fmaxv_v32f16:
599; VBITS_GE_256:       // %bb.0:
600; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
601; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
602; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
603; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
604; VBITS_GE_256-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
605; VBITS_GE_256-NEXT:    fmaxnmv h0, p0, z0.h
606; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
607; VBITS_GE_256-NEXT:    ret
608;
609; VBITS_GE_512-LABEL: fmaxv_v32f16:
610; VBITS_GE_512:       // %bb.0:
611; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
612; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
613; VBITS_GE_512-NEXT:    fmaxnmv h0, p0, z0.h
614; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
615; VBITS_GE_512-NEXT:    ret
616  %op = load <32 x half>, ptr %a
617  %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
618  ret half %res
619}
620
; fmax reduction over a <64 x half> loaded from %a. All RUN lines expect
; ld1h + SVE FMAXNMV under a vl64 predicate.
621define half @fmaxv_v64f16(ptr %a) vscale_range(8,0) #0 {
622; CHECK-LABEL: fmaxv_v64f16:
623; CHECK:       // %bb.0:
624; CHECK-NEXT:    ptrue p0.h, vl64
625; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
626; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
627; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
628; CHECK-NEXT:    ret
629  %op = load <64 x half>, ptr %a
630  %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
631  ret half %res
632}
633
; fmax reduction over a <128 x half> loaded from %a. All RUN lines expect
; ld1h + SVE FMAXNMV under a vl128 predicate.
634define half @fmaxv_v128f16(ptr %a) vscale_range(16,0) #0 {
635; CHECK-LABEL: fmaxv_v128f16:
636; CHECK:       // %bb.0:
637; CHECK-NEXT:    ptrue p0.h, vl128
638; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
639; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
640; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
641; CHECK-NEXT:    ret
642  %op = load <128 x half>, ptr %a
643  %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
644  ret half %res
645}
646
647; Don't use SVE for 64-bit f32 vectors.
; The CHECK lines expect the NEON pairwise fmaxnmp on v0.2s.
648define float @fmaxv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
649; CHECK-LABEL: fmaxv_v2f32:
650; CHECK:       // %bb.0:
651; CHECK-NEXT:    fmaxnmp s0, v0.2s
652; CHECK-NEXT:    ret
653  %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
654  ret float %res
655}
656
657; Don't use SVE for 128-bit f32 vectors.
; The CHECK lines expect NEON fmaxnmv on v0.4s.
658define float @fmaxv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
659; CHECK-LABEL: fmaxv_v4f32:
660; CHECK:       // %bb.0:
661; CHECK-NEXT:    fmaxnmv s0, v0.4s
662; CHECK-NEXT:    ret
663  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
664  ret float %res
665}
666
; fmax reduction over an <8 x float> loaded from %a. All RUN lines expect
; ld1w + SVE FMAXNMV under a vl8 predicate.
667define float @fmaxv_v8f32(ptr %a) vscale_range(2,0) #0 {
668; CHECK-LABEL: fmaxv_v8f32:
669; CHECK:       // %bb.0:
670; CHECK-NEXT:    ptrue p0.s, vl8
671; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
672; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
673; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
674; CHECK-NEXT:    ret
675  %op = load <8 x float>, ptr %a
676  %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
677  ret float %res
678}
679
; fmax reduction over a <16 x float> loaded from %a, with no vscale_range
; attribute; the expected code differs per prefix. VBITS_GE_256 expects the two
; vl8 halves to be combined with a predicated FMAXNM before one FMAXNMV;
; VBITS_GE_512 expects a single vl16 FMAXNMV.
680define float @fmaxv_v16f32(ptr %a) #0 {
681; VBITS_GE_256-LABEL: fmaxv_v16f32:
682; VBITS_GE_256:       // %bb.0:
683; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
684; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
685; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
686; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
687; VBITS_GE_256-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
688; VBITS_GE_256-NEXT:    fmaxnmv s0, p0, z0.s
689; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
690; VBITS_GE_256-NEXT:    ret
691;
692; VBITS_GE_512-LABEL: fmaxv_v16f32:
693; VBITS_GE_512:       // %bb.0:
694; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
695; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
696; VBITS_GE_512-NEXT:    fmaxnmv s0, p0, z0.s
697; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
698; VBITS_GE_512-NEXT:    ret
699  %op = load <16 x float>, ptr %a
700  %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
701  ret float %res
702}
703
; fmax reduction over a <32 x float> loaded from %a. All RUN lines expect
; ld1w + SVE FMAXNMV under a vl32 predicate.
704define float @fmaxv_v32f32(ptr %a) vscale_range(8,0) #0 {
705; CHECK-LABEL: fmaxv_v32f32:
706; CHECK:       // %bb.0:
707; CHECK-NEXT:    ptrue p0.s, vl32
708; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
709; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
710; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
711; CHECK-NEXT:    ret
712  %op = load <32 x float>, ptr %a
713  %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
714  ret float %res
715}
716
; fmax reduction over a <64 x float> loaded from %a. All RUN lines expect
; ld1w + SVE FMAXNMV under a vl64 predicate.
717define float @fmaxv_v64f32(ptr %a) vscale_range(16,0) #0 {
718; CHECK-LABEL: fmaxv_v64f32:
719; CHECK:       // %bb.0:
720; CHECK-NEXT:    ptrue p0.s, vl64
721; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
722; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
723; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
724; CHECK-NEXT:    ret
725  %op = load <64 x float>, ptr %a
726  %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
727  ret float %res
728}
729
730; Nothing to do for single element vectors.
; The CHECK lines expect a bare ret with no other instruction.
731define double @fmaxv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
732; CHECK-LABEL: fmaxv_v1f64:
733; CHECK:       // %bb.0:
734; CHECK-NEXT:    ret
735  %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
736  ret double %res
737}
738
739; Don't use SVE for 128-bit f64 vectors.
; The CHECK lines expect the NEON pairwise fmaxnmp on v0.2d.
740define double @fmaxv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
741; CHECK-LABEL: fmaxv_v2f64:
742; CHECK:       // %bb.0:
743; CHECK-NEXT:    fmaxnmp d0, v0.2d
744; CHECK-NEXT:    ret
745  %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
746  ret double %res
747}
748
; fmax reduction over a <4 x double> loaded from %a. All RUN lines expect
; ld1d + SVE FMAXNMV under a vl4 predicate.
749define double @fmaxv_v4f64(ptr %a) vscale_range(2,0) #0 {
750; CHECK-LABEL: fmaxv_v4f64:
751; CHECK:       // %bb.0:
752; CHECK-NEXT:    ptrue p0.d, vl4
753; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
754; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
755; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
756; CHECK-NEXT:    ret
757  %op = load <4 x double>, ptr %a
758  %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
759  ret double %res
760}
761
; fmax reduction over an <8 x double> loaded from %a, with no vscale_range
; attribute; the expected code differs per prefix. VBITS_GE_256 expects the two
; vl4 halves to be combined with a predicated FMAXNM before one FMAXNMV;
; VBITS_GE_512 expects a single vl8 FMAXNMV.
762define double @fmaxv_v8f64(ptr %a) #0 {
763; VBITS_GE_256-LABEL: fmaxv_v8f64:
764; VBITS_GE_256:       // %bb.0:
765; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
766; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
767; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
768; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
769; VBITS_GE_256-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
770; VBITS_GE_256-NEXT:    fmaxnmv d0, p0, z0.d
771; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
772; VBITS_GE_256-NEXT:    ret
773;
774; VBITS_GE_512-LABEL: fmaxv_v8f64:
775; VBITS_GE_512:       // %bb.0:
776; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
777; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
778; VBITS_GE_512-NEXT:    fmaxnmv d0, p0, z0.d
779; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
780; VBITS_GE_512-NEXT:    ret
781  %op = load <8 x double>, ptr %a
782  %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
783  ret double %res
784}
785
; fmax reduction over a <16 x double> loaded from %a. All RUN lines expect
; ld1d + SVE FMAXNMV under a vl16 predicate.
786define double @fmaxv_v16f64(ptr %a) vscale_range(8,0) #0 {
787; CHECK-LABEL: fmaxv_v16f64:
788; CHECK:       // %bb.0:
789; CHECK-NEXT:    ptrue p0.d, vl16
790; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
791; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
792; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
793; CHECK-NEXT:    ret
794  %op = load <16 x double>, ptr %a
795  %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
796  ret double %res
797}
798
; fmax reduction over a <32 x double> loaded from %a. All RUN lines expect
; ld1d + SVE FMAXNMV under a vl32 predicate.
799define double @fmaxv_v32f64(ptr %a) vscale_range(16,0) #0 {
800; CHECK-LABEL: fmaxv_v32f64:
801; CHECK:       // %bb.0:
802; CHECK-NEXT:    ptrue p0.d, vl32
803; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
804; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
805; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
806; CHECK-NEXT:    ret
807  %op = load <32 x double>, ptr %a
808  %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
809  ret double %res
810}
811
812;
813; FMINNMV
814;
815
816; Don't use SVE for 64-bit f16 vectors.
; The CHECK lines expect NEON fminnmv on v0.4h, not an SVE reduction.
817define half @fminv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
818; CHECK-LABEL: fminv_v4f16:
819; CHECK:       // %bb.0:
820; CHECK-NEXT:    fminnmv h0, v0.4h
821; CHECK-NEXT:    ret
822  %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
823  ret half %res
824}
825
826; Don't use SVE for 128-bit f16 vectors.
; The CHECK lines expect NEON fminnmv on v0.8h, not an SVE reduction.
827define half @fminv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
828; CHECK-LABEL: fminv_v8f16:
829; CHECK:       // %bb.0:
830; CHECK-NEXT:    fminnmv h0, v0.8h
831; CHECK-NEXT:    ret
832  %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
833  ret half %res
834}
835
; fmin reduction over a <16 x half> loaded from %a. All RUN lines expect
; ld1h + SVE FMINNMV under a vl16 predicate.
836define half @fminv_v16f16(ptr %a) vscale_range(2,0) #0 {
837; CHECK-LABEL: fminv_v16f16:
838; CHECK:       // %bb.0:
839; CHECK-NEXT:    ptrue p0.h, vl16
840; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
841; CHECK-NEXT:    fminnmv h0, p0, z0.h
842; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
843; CHECK-NEXT:    ret
844  %op = load <16 x half>, ptr %a
845  %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
846  ret half %res
847}
848
; No vscale_range: the 256-bit run loads two vl16 halves and merges them with FMINNM before the
; FMINNMV, while the 512-bit and 2048-bit runs use one vl32 load and a single FMINNMV.
849define half @fminv_v32f16(ptr %a) #0 {
850; VBITS_GE_256-LABEL: fminv_v32f16:
851; VBITS_GE_256:       // %bb.0:
852; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
853; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
854; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
855; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
856; VBITS_GE_256-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
857; VBITS_GE_256-NEXT:    fminnmv h0, p0, z0.h
858; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
859; VBITS_GE_256-NEXT:    ret
860;
861; VBITS_GE_512-LABEL: fminv_v32f16:
862; VBITS_GE_512:       // %bb.0:
863; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
864; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
865; VBITS_GE_512-NEXT:    fminnmv h0, p0, z0.h
866; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
867; VBITS_GE_512-NEXT:    ret
868  %op = load <32 x half>, ptr %a
869  %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
870  ret half %res
871}
872
; <64 x half> with vscale_range(8,0): all runs expect one vl64 predicate, one load and a single FMINNMV.
873define half @fminv_v64f16(ptr %a) vscale_range(8,0) #0 {
874; CHECK-LABEL: fminv_v64f16:
875; CHECK:       // %bb.0:
876; CHECK-NEXT:    ptrue p0.h, vl64
877; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
878; CHECK-NEXT:    fminnmv h0, p0, z0.h
879; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
880; CHECK-NEXT:    ret
881  %op = load <64 x half>, ptr %a
882  %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
883  ret half %res
884}
885
; <128 x half> with vscale_range(16,0): all runs expect one vl128 predicate, one load and a single FMINNMV.
886define half @fminv_v128f16(ptr %a) vscale_range(16,0) #0 {
887; CHECK-LABEL: fminv_v128f16:
888; CHECK:       // %bb.0:
889; CHECK-NEXT:    ptrue p0.h, vl128
890; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
891; CHECK-NEXT:    fminnmv h0, p0, z0.h
892; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
893; CHECK-NEXT:    ret
894  %op = load <128 x half>, ptr %a
895  %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
896  ret half %res
897}
898
899; Don't use SVE for 64-bit f32 vectors.
; The two lanes are expected to be reduced with one NEON pairwise FMINNMP.
900define float @fminv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
901; CHECK-LABEL: fminv_v2f32:
902; CHECK:       // %bb.0:
903; CHECK-NEXT:    fminnmp s0, v0.2s
904; CHECK-NEXT:    ret
905  %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
906  ret float %res
907}
908
909; Don't use SVE for 128-bit f32 vectors.
; A single NEON across-lanes FMINNMV is expected.
910define float @fminv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
911; CHECK-LABEL: fminv_v4f32:
912; CHECK:       // %bb.0:
913; CHECK-NEXT:    fminnmv s0, v0.4s
914; CHECK-NEXT:    ret
915  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
916  ret float %res
917}
918
; <8 x float> with vscale_range(2,0): all runs expect one vl8 predicate, one load and a single FMINNMV.
919define float @fminv_v8f32(ptr %a) vscale_range(2,0) #0 {
920; CHECK-LABEL: fminv_v8f32:
921; CHECK:       // %bb.0:
922; CHECK-NEXT:    ptrue p0.s, vl8
923; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
924; CHECK-NEXT:    fminnmv s0, p0, z0.s
925; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
926; CHECK-NEXT:    ret
927  %op = load <8 x float>, ptr %a
928  %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
929  ret float %res
930}
931
; No vscale_range: the 256-bit run loads two vl8 halves and merges them with FMINNM before the
; FMINNMV, while the 512-bit and 2048-bit runs use one vl16 load and a single FMINNMV.
932define float @fminv_v16f32(ptr %a) #0 {
933; VBITS_GE_256-LABEL: fminv_v16f32:
934; VBITS_GE_256:       // %bb.0:
935; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
936; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
937; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
938; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
939; VBITS_GE_256-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
940; VBITS_GE_256-NEXT:    fminnmv s0, p0, z0.s
941; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
942; VBITS_GE_256-NEXT:    ret
943;
944; VBITS_GE_512-LABEL: fminv_v16f32:
945; VBITS_GE_512:       // %bb.0:
946; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
947; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
948; VBITS_GE_512-NEXT:    fminnmv s0, p0, z0.s
949; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
950; VBITS_GE_512-NEXT:    ret
951  %op = load <16 x float>, ptr %a
952  %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
953  ret float %res
954}
955
; <32 x float> with vscale_range(8,0): all runs expect one vl32 predicate, one load and a single FMINNMV.
956define float @fminv_v32f32(ptr %a) vscale_range(8,0) #0 {
957; CHECK-LABEL: fminv_v32f32:
958; CHECK:       // %bb.0:
959; CHECK-NEXT:    ptrue p0.s, vl32
960; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
961; CHECK-NEXT:    fminnmv s0, p0, z0.s
962; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
963; CHECK-NEXT:    ret
964  %op = load <32 x float>, ptr %a
965  %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
966  ret float %res
967}
968
; <64 x float> with vscale_range(16,0): all runs expect one vl64 predicate, one load and a single FMINNMV.
969define float @fminv_v64f32(ptr %a) vscale_range(16,0) #0 {
970; CHECK-LABEL: fminv_v64f32:
971; CHECK:       // %bb.0:
972; CHECK-NEXT:    ptrue p0.s, vl64
973; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
974; CHECK-NEXT:    fminnmv s0, p0, z0.s
975; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
976; CHECK-NEXT:    ret
977  %op = load <64 x float>, ptr %a
978  %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
979  ret float %res
980}
981
982; Nothing to do for single element vectors.
; The expected function body is just a ret: no reduction instruction is emitted.
983define double @fminv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
984; CHECK-LABEL: fminv_v1f64:
985; CHECK:       // %bb.0:
986; CHECK-NEXT:    ret
987  %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
988  ret double %res
989}
990
991; Don't use SVE for 128-bit f64 vectors.
; The two lanes are expected to be reduced with one NEON pairwise FMINNMP.
992define double @fminv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
993; CHECK-LABEL: fminv_v2f64:
994; CHECK:       // %bb.0:
995; CHECK-NEXT:    fminnmp d0, v0.2d
996; CHECK-NEXT:    ret
997  %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
998  ret double %res
999}
1000
; <4 x double> with vscale_range(2,0): all runs expect one vl4 predicate, one load and a single FMINNMV.
1001define double @fminv_v4f64(ptr %a) vscale_range(2,0) #0 {
1002; CHECK-LABEL: fminv_v4f64:
1003; CHECK:       // %bb.0:
1004; CHECK-NEXT:    ptrue p0.d, vl4
1005; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1006; CHECK-NEXT:    fminnmv d0, p0, z0.d
1007; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1008; CHECK-NEXT:    ret
1009  %op = load <4 x double>, ptr %a
1010  %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
1011  ret double %res
1012}
1013
; No vscale_range: the 256-bit run loads two vl4 halves and merges them with FMINNM before the
; FMINNMV, while the 512-bit and 2048-bit runs use one vl8 load and a single FMINNMV.
1014define double @fminv_v8f64(ptr %a) #0 {
1015; VBITS_GE_256-LABEL: fminv_v8f64:
1016; VBITS_GE_256:       // %bb.0:
1017; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
1018; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
1019; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1020; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
1021; VBITS_GE_256-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
1022; VBITS_GE_256-NEXT:    fminnmv d0, p0, z0.d
1023; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
1024; VBITS_GE_256-NEXT:    ret
1025;
1026; VBITS_GE_512-LABEL: fminv_v8f64:
1027; VBITS_GE_512:       // %bb.0:
1028; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
1029; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
1030; VBITS_GE_512-NEXT:    fminnmv d0, p0, z0.d
1031; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
1032; VBITS_GE_512-NEXT:    ret
1033  %op = load <8 x double>, ptr %a
1034  %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
1035  ret double %res
1036}
1037
; <16 x double> with vscale_range(8,0): all runs expect one vl16 predicate, one load and a single FMINNMV.
1038define double @fminv_v16f64(ptr %a) vscale_range(8,0) #0 {
1039; CHECK-LABEL: fminv_v16f64:
1040; CHECK:       // %bb.0:
1041; CHECK-NEXT:    ptrue p0.d, vl16
1042; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1043; CHECK-NEXT:    fminnmv d0, p0, z0.d
1044; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1045; CHECK-NEXT:    ret
1046  %op = load <16 x double>, ptr %a
1047  %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
1048  ret double %res
1049}
1050
; <32 x double> with vscale_range(16,0): all runs expect one vl32 predicate, one load and a single FMINNMV.
1051define double @fminv_v32f64(ptr %a) vscale_range(16,0) #0 {
1052; CHECK-LABEL: fminv_v32f64:
1053; CHECK:       // %bb.0:
1054; CHECK-NEXT:    ptrue p0.d, vl32
1055; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1056; CHECK-NEXT:    fminnmv d0, p0, z0.d
1057; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1058; CHECK-NEXT:    ret
1059  %op = load <32 x double>, ptr %a
1060  %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
1061  ret double %res
1062}
1063
1064;
1065; FMAXV
1066;
1067
; <4 x half> passed in a register: a single NEON FMAXV is expected, with no SVE predicate or load.
1068define half @fmaximumv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
1069; CHECK-LABEL: fmaximumv_v4f16:
1070; CHECK:       // %bb.0:
1071; CHECK-NEXT:    fmaxv h0, v0.4h
1072; CHECK-NEXT:    ret
1073  %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
1074  ret half %res
1075}
1076
; <8 x half> passed in a register: a single NEON FMAXV is expected, with no SVE predicate or load.
1077define half @fmaximumv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
1078; CHECK-LABEL: fmaximumv_v8f16:
1079; CHECK:       // %bb.0:
1080; CHECK-NEXT:    fmaxv h0, v0.8h
1081; CHECK-NEXT:    ret
1082  %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
1083  ret half %res
1084}
1085
; <16 x half> with vscale_range(2,0): all runs expect one vl16 predicate, one load and a single FMAXV.
1086define half @fmaximumv_v16f16(ptr %a) vscale_range(2,0) #0 {
1087; CHECK-LABEL: fmaximumv_v16f16:
1088; CHECK:       // %bb.0:
1089; CHECK-NEXT:    ptrue p0.h, vl16
1090; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1091; CHECK-NEXT:    fmaxv h0, p0, z0.h
1092; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
1093; CHECK-NEXT:    ret
1094  %op = load <16 x half>, ptr %a
1095  %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
1096  ret half %res
1097}
1098
; No vscale_range: the 256-bit run loads two vl16 halves and merges them with FMAX before the
; FMAXV, while the 512-bit and 2048-bit runs use one vl32 load and a single FMAXV.
1099define half @fmaximumv_v32f16(ptr %a) #0 {
1100; VBITS_GE_256-LABEL: fmaximumv_v32f16:
1101; VBITS_GE_256:       // %bb.0:
1102; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
1103; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
1104; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1105; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
1106; VBITS_GE_256-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
1107; VBITS_GE_256-NEXT:    fmaxv h0, p0, z0.h
1108; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
1109; VBITS_GE_256-NEXT:    ret
1110;
1111; VBITS_GE_512-LABEL: fmaximumv_v32f16:
1112; VBITS_GE_512:       // %bb.0:
1113; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
1114; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
1115; VBITS_GE_512-NEXT:    fmaxv h0, p0, z0.h
1116; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
1117; VBITS_GE_512-NEXT:    ret
1118  %op = load <32 x half>, ptr %a
1119  %res = call half @llvm.vector.reduce.fmaximum.v32f16(<32 x half> %op)
1120  ret half %res
1121}
1122
; <64 x half> with vscale_range(8,0): all runs expect one vl64 predicate, one load and a single FMAXV.
1123define half @fmaximumv_v64f16(ptr %a) vscale_range(8,0) #0 {
1124; CHECK-LABEL: fmaximumv_v64f16:
1125; CHECK:       // %bb.0:
1126; CHECK-NEXT:    ptrue p0.h, vl64
1127; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1128; CHECK-NEXT:    fmaxv h0, p0, z0.h
1129; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
1130; CHECK-NEXT:    ret
1131  %op = load <64 x half>, ptr %a
1132  %res = call half @llvm.vector.reduce.fmaximum.v64f16(<64 x half> %op)
1133  ret half %res
1134}
1135
; <128 x half> with vscale_range(16,0): all runs expect one vl128 predicate, one load and a single FMAXV.
1136define half @fmaximumv_v128f16(ptr %a) vscale_range(16,0) #0 {
1137; CHECK-LABEL: fmaximumv_v128f16:
1138; CHECK:       // %bb.0:
1139; CHECK-NEXT:    ptrue p0.h, vl128
1140; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1141; CHECK-NEXT:    fmaxv h0, p0, z0.h
1142; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
1143; CHECK-NEXT:    ret
1144  %op = load <128 x half>, ptr %a
1145  %res = call half @llvm.vector.reduce.fmaximum.v128f16(<128 x half> %op)
1146  ret half %res
1147}
1148
1149; Don't use SVE for 64-bit f32 vectors.
; The two lanes are expected to be reduced with one NEON pairwise FMAXP.
1150define float @fmaximumv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
1151; CHECK-LABEL: fmaximumv_v2f32:
1152; CHECK:       // %bb.0:
1153; CHECK-NEXT:    fmaxp s0, v0.2s
1154; CHECK-NEXT:    ret
1155  %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
1156  ret float %res
1157}
1158
1159; Don't use SVE for 128-bit f32 vectors.
; A single NEON across-lanes FMAXV is expected.
1160define float @fmaximumv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
1161; CHECK-LABEL: fmaximumv_v4f32:
1162; CHECK:       // %bb.0:
1163; CHECK-NEXT:    fmaxv s0, v0.4s
1164; CHECK-NEXT:    ret
1165  %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
1166  ret float %res
1167}
1168
; <8 x float> with vscale_range(2,0): all runs expect one vl8 predicate, one load and a single FMAXV.
1169define float @fmaximumv_v8f32(ptr %a) vscale_range(2,0) #0 {
1170; CHECK-LABEL: fmaximumv_v8f32:
1171; CHECK:       // %bb.0:
1172; CHECK-NEXT:    ptrue p0.s, vl8
1173; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1174; CHECK-NEXT:    fmaxv s0, p0, z0.s
1175; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
1176; CHECK-NEXT:    ret
1177  %op = load <8 x float>, ptr %a
1178  %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
1179  ret float %res
1180}
1181
; No vscale_range: the 256-bit run loads two vl8 halves and merges them with FMAX before the
; FMAXV, while the 512-bit and 2048-bit runs use one vl16 load and a single FMAXV.
1182define float @fmaximumv_v16f32(ptr %a) #0 {
1183; VBITS_GE_256-LABEL: fmaximumv_v16f32:
1184; VBITS_GE_256:       // %bb.0:
1185; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
1186; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
1187; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1188; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
1189; VBITS_GE_256-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
1190; VBITS_GE_256-NEXT:    fmaxv s0, p0, z0.s
1191; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
1192; VBITS_GE_256-NEXT:    ret
1193;
1194; VBITS_GE_512-LABEL: fmaximumv_v16f32:
1195; VBITS_GE_512:       // %bb.0:
1196; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
1197; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
1198; VBITS_GE_512-NEXT:    fmaxv s0, p0, z0.s
1199; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
1200; VBITS_GE_512-NEXT:    ret
1201  %op = load <16 x float>, ptr %a
1202  %res = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> %op)
1203  ret float %res
1204}
1205
; <32 x float> with vscale_range(8,0): all runs expect one vl32 predicate, one load and a single FMAXV.
1206define float @fmaximumv_v32f32(ptr %a) vscale_range(8,0) #0 {
1207; CHECK-LABEL: fmaximumv_v32f32:
1208; CHECK:       // %bb.0:
1209; CHECK-NEXT:    ptrue p0.s, vl32
1210; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1211; CHECK-NEXT:    fmaxv s0, p0, z0.s
1212; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
1213; CHECK-NEXT:    ret
1214  %op = load <32 x float>, ptr %a
1215  %res = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> %op)
1216  ret float %res
1217}
1218
; <64 x float> with vscale_range(16,0): all runs expect one vl64 predicate, one load and a single FMAXV.
1219define float @fmaximumv_v64f32(ptr %a) vscale_range(16,0) #0 {
1220; CHECK-LABEL: fmaximumv_v64f32:
1221; CHECK:       // %bb.0:
1222; CHECK-NEXT:    ptrue p0.s, vl64
1223; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1224; CHECK-NEXT:    fmaxv s0, p0, z0.s
1225; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
1226; CHECK-NEXT:    ret
1227  %op = load <64 x float>, ptr %a
1228  %res = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> %op)
1229  ret float %res
1230}
1231
1232; Nothing to do for single element vectors.
; The expected function body is just a ret: no reduction instruction is emitted.
1233define double @fmaximumv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
1234; CHECK-LABEL: fmaximumv_v1f64:
1235; CHECK:       // %bb.0:
1236; CHECK-NEXT:    ret
1237  %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
1238  ret double %res
1239}
1240
1241; Don't use SVE for 128-bit f64 vectors.
; The two lanes are expected to be reduced with one NEON pairwise FMAXP.
1242define double @fmaximumv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
1243; CHECK-LABEL: fmaximumv_v2f64:
1244; CHECK:       // %bb.0:
1245; CHECK-NEXT:    fmaxp d0, v0.2d
1246; CHECK-NEXT:    ret
1247  %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
1248  ret double %res
1249}
1250
; <4 x double> with vscale_range(2,0): all runs expect one vl4 predicate, one load and a single FMAXV.
1251define double @fmaximumv_v4f64(ptr %a) vscale_range(2,0) #0 {
1252; CHECK-LABEL: fmaximumv_v4f64:
1253; CHECK:       // %bb.0:
1254; CHECK-NEXT:    ptrue p0.d, vl4
1255; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1256; CHECK-NEXT:    fmaxv d0, p0, z0.d
1257; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1258; CHECK-NEXT:    ret
1259  %op = load <4 x double>, ptr %a
1260  %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
1261  ret double %res
1262}
1263
; No vscale_range: the 256-bit run loads two vl4 halves and merges them with FMAX before the
; FMAXV, while the 512-bit and 2048-bit runs use one vl8 load and a single FMAXV.
1264define double @fmaximumv_v8f64(ptr %a) #0 {
1265; VBITS_GE_256-LABEL: fmaximumv_v8f64:
1266; VBITS_GE_256:       // %bb.0:
1267; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
1268; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
1269; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1270; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
1271; VBITS_GE_256-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
1272; VBITS_GE_256-NEXT:    fmaxv d0, p0, z0.d
1273; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
1274; VBITS_GE_256-NEXT:    ret
1275;
1276; VBITS_GE_512-LABEL: fmaximumv_v8f64:
1277; VBITS_GE_512:       // %bb.0:
1278; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
1279; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
1280; VBITS_GE_512-NEXT:    fmaxv d0, p0, z0.d
1281; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
1282; VBITS_GE_512-NEXT:    ret
1283  %op = load <8 x double>, ptr %a
1284  %res = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> %op)
1285  ret double %res
1286}
1287
; <16 x double> with vscale_range(8,0): all runs expect one vl16 predicate, one load and a single FMAXV.
1288define double @fmaximumv_v16f64(ptr %a) vscale_range(8,0) #0 {
1289; CHECK-LABEL: fmaximumv_v16f64:
1290; CHECK:       // %bb.0:
1291; CHECK-NEXT:    ptrue p0.d, vl16
1292; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1293; CHECK-NEXT:    fmaxv d0, p0, z0.d
1294; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1295; CHECK-NEXT:    ret
1296  %op = load <16 x double>, ptr %a
1297  %res = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> %op)
1298  ret double %res
1299}
1300
; <32 x double> with vscale_range(16,0): all runs expect one vl32 predicate, one load and a single FMAXV.
1301define double @fmaximumv_v32f64(ptr %a) vscale_range(16,0) #0 {
1302; CHECK-LABEL: fmaximumv_v32f64:
1303; CHECK:       // %bb.0:
1304; CHECK-NEXT:    ptrue p0.d, vl32
1305; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1306; CHECK-NEXT:    fmaxv d0, p0, z0.d
1307; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1308; CHECK-NEXT:    ret
1309  %op = load <32 x double>, ptr %a
1310  %res = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %op)
1311  ret double %res
1312}
1313
1314;
1315; FMINV
1316;
1317
; <4 x half> passed in a register: a single NEON FMINV is expected, with no SVE predicate or load.
1318define half @fminimumv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
1319; CHECK-LABEL: fminimumv_v4f16:
1320; CHECK:       // %bb.0:
1321; CHECK-NEXT:    fminv h0, v0.4h
1322; CHECK-NEXT:    ret
1323  %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
1324  ret half %res
1325}
1326
; <8 x half> passed in a register: a single NEON FMINV is expected, with no SVE predicate or load.
1327define half @fminimumv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
1328; CHECK-LABEL: fminimumv_v8f16:
1329; CHECK:       // %bb.0:
1330; CHECK-NEXT:    fminv h0, v0.8h
1331; CHECK-NEXT:    ret
1332  %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
1333  ret half %res
1334}
1335
; <16 x half> with vscale_range(2,0): all runs expect one vl16 predicate, one load and a single FMINV.
1336define half @fminimumv_v16f16(ptr %a) vscale_range(2,0) #0 {
1337; CHECK-LABEL: fminimumv_v16f16:
1338; CHECK:       // %bb.0:
1339; CHECK-NEXT:    ptrue p0.h, vl16
1340; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1341; CHECK-NEXT:    fminv h0, p0, z0.h
1342; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
1343; CHECK-NEXT:    ret
1344  %op = load <16 x half>, ptr %a
1345  %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
1346  ret half %res
1347}
1348
; No vscale_range: the 256-bit run loads two vl16 halves and merges them with FMIN before the
; FMINV, while the 512-bit and 2048-bit runs use one vl32 load and a single FMINV.
1349define half @fminimumv_v32f16(ptr %a) #0 {
1350; VBITS_GE_256-LABEL: fminimumv_v32f16:
1351; VBITS_GE_256:       // %bb.0:
1352; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
1353; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
1354; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1355; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
1356; VBITS_GE_256-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
1357; VBITS_GE_256-NEXT:    fminv h0, p0, z0.h
1358; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
1359; VBITS_GE_256-NEXT:    ret
1360;
1361; VBITS_GE_512-LABEL: fminimumv_v32f16:
1362; VBITS_GE_512:       // %bb.0:
1363; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
1364; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
1365; VBITS_GE_512-NEXT:    fminv h0, p0, z0.h
1366; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
1367; VBITS_GE_512-NEXT:    ret
1368  %op = load <32 x half>, ptr %a
1369  %res = call half @llvm.vector.reduce.fminimum.v32f16(<32 x half> %op)
1370  ret half %res
1371}
1372
; <64 x half> with vscale_range(8,0): all runs expect one vl64 predicate, one load and a single FMINV.
1373define half @fminimumv_v64f16(ptr %a) vscale_range(8,0) #0 {
1374; CHECK-LABEL: fminimumv_v64f16:
1375; CHECK:       // %bb.0:
1376; CHECK-NEXT:    ptrue p0.h, vl64
1377; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1378; CHECK-NEXT:    fminv h0, p0, z0.h
1379; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
1380; CHECK-NEXT:    ret
1381  %op = load <64 x half>, ptr %a
1382  %res = call half @llvm.vector.reduce.fminimum.v64f16(<64 x half> %op)
1383  ret half %res
1384}
1385
; <128 x half> with vscale_range(16,0): all runs expect one vl128 predicate, one load and a single FMINV.
1386define half @fminimumv_v128f16(ptr %a) vscale_range(16,0) #0 {
1387; CHECK-LABEL: fminimumv_v128f16:
1388; CHECK:       // %bb.0:
1389; CHECK-NEXT:    ptrue p0.h, vl128
1390; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1391; CHECK-NEXT:    fminv h0, p0, z0.h
1392; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
1393; CHECK-NEXT:    ret
1394  %op = load <128 x half>, ptr %a
1395  %res = call half @llvm.vector.reduce.fminimum.v128f16(<128 x half> %op)
1396  ret half %res
1397}
1398
1399; Don't use SVE for 64-bit f32 vectors.
; The two lanes are expected to be reduced with one NEON pairwise FMINP.
1400define float @fminimumv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
1401; CHECK-LABEL: fminimumv_v2f32:
1402; CHECK:       // %bb.0:
1403; CHECK-NEXT:    fminp s0, v0.2s
1404; CHECK-NEXT:    ret
1405  %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
1406  ret float %res
1407}
1408
1409; Don't use SVE for 128-bit f32 vectors.
; A single NEON across-lanes FMINV is expected.
1410define float @fminimumv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
1411; CHECK-LABEL: fminimumv_v4f32:
1412; CHECK:       // %bb.0:
1413; CHECK-NEXT:    fminv s0, v0.4s
1414; CHECK-NEXT:    ret
1415  %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
1416  ret float %res
1417}
1418
; <8 x float> with vscale_range(2,0): all runs expect one vl8 predicate, one load and a single FMINV.
1419define float @fminimumv_v8f32(ptr %a) vscale_range(2,0) #0 {
1420; CHECK-LABEL: fminimumv_v8f32:
1421; CHECK:       // %bb.0:
1422; CHECK-NEXT:    ptrue p0.s, vl8
1423; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1424; CHECK-NEXT:    fminv s0, p0, z0.s
1425; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
1426; CHECK-NEXT:    ret
1427  %op = load <8 x float>, ptr %a
1428  %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
1429  ret float %res
1430}
1431
; No vscale_range: the 256-bit run loads two vl8 halves and merges them with FMIN before the
; FMINV, while the 512-bit and 2048-bit runs use one vl16 load and a single FMINV.
1432define float @fminimumv_v16f32(ptr %a) #0 {
1433; VBITS_GE_256-LABEL: fminimumv_v16f32:
1434; VBITS_GE_256:       // %bb.0:
1435; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
1436; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
1437; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1438; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
1439; VBITS_GE_256-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
1440; VBITS_GE_256-NEXT:    fminv s0, p0, z0.s
1441; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
1442; VBITS_GE_256-NEXT:    ret
1443;
1444; VBITS_GE_512-LABEL: fminimumv_v16f32:
1445; VBITS_GE_512:       // %bb.0:
1446; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
1447; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
1448; VBITS_GE_512-NEXT:    fminv s0, p0, z0.s
1449; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
1450; VBITS_GE_512-NEXT:    ret
1451  %op = load <16 x float>, ptr %a
1452  %res = call float @llvm.vector.reduce.fminimum.v16f32(<16 x float> %op)
1453  ret float %res
1454}
1455
; <32 x float> with vscale_range(8,0): all runs expect one vl32 predicate, one load and a single FMINV.
1456define float @fminimumv_v32f32(ptr %a) vscale_range(8,0) #0 {
1457; CHECK-LABEL: fminimumv_v32f32:
1458; CHECK:       // %bb.0:
1459; CHECK-NEXT:    ptrue p0.s, vl32
1460; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1461; CHECK-NEXT:    fminv s0, p0, z0.s
1462; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
1463; CHECK-NEXT:    ret
1464  %op = load <32 x float>, ptr %a
1465  %res = call float @llvm.vector.reduce.fminimum.v32f32(<32 x float> %op)
1466  ret float %res
1467}
1468
; <64 x float> with vscale_range(16,0): all runs expect one vl64 predicate, one load and a single FMINV.
1469define float @fminimumv_v64f32(ptr %a) vscale_range(16,0) #0 {
1470; CHECK-LABEL: fminimumv_v64f32:
1471; CHECK:       // %bb.0:
1472; CHECK-NEXT:    ptrue p0.s, vl64
1473; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1474; CHECK-NEXT:    fminv s0, p0, z0.s
1475; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
1476; CHECK-NEXT:    ret
1477  %op = load <64 x float>, ptr %a
1478  %res = call float @llvm.vector.reduce.fminimum.v64f32(<64 x float> %op)
1479  ret float %res
1480}
1481
1482; Nothing to do for single element vectors.
; The expected function body is just a ret: no reduction instruction is emitted.
1483define double @fminimumv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
1484; CHECK-LABEL: fminimumv_v1f64:
1485; CHECK:       // %bb.0:
1486; CHECK-NEXT:    ret
1487  %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a)
1488  ret double %res
1489}
1490
1491; Don't use SVE for 128-bit f64 vectors.
; The two lanes are expected to be reduced with one NEON pairwise FMINP.
1492define double @fminimumv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
1493; CHECK-LABEL: fminimumv_v2f64:
1494; CHECK:       // %bb.0:
1495; CHECK-NEXT:    fminp d0, v0.2d
1496; CHECK-NEXT:    ret
1497  %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
1498  ret double %res
1499}
1500
; <4 x double> with vscale_range(2,0): all runs expect one vl4 predicate, one load and a single FMINV.
1501define double @fminimumv_v4f64(ptr %a) vscale_range(2,0) #0 {
1502; CHECK-LABEL: fminimumv_v4f64:
1503; CHECK:       // %bb.0:
1504; CHECK-NEXT:    ptrue p0.d, vl4
1505; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1506; CHECK-NEXT:    fminv d0, p0, z0.d
1507; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1508; CHECK-NEXT:    ret
1509  %op = load <4 x double>, ptr %a
1510  %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
1511  ret double %res
1512}
1513
; No vscale_range: the 256-bit run loads two vl4 halves and merges them with FMIN before the
; FMINV, while the 512-bit and 2048-bit runs use one vl8 load and a single FMINV.
1514define double @fminimumv_v8f64(ptr %a) #0 {
1515; VBITS_GE_256-LABEL: fminimumv_v8f64:
1516; VBITS_GE_256:       // %bb.0:
1517; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
1518; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
1519; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1520; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
1521; VBITS_GE_256-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
1522; VBITS_GE_256-NEXT:    fminv d0, p0, z0.d
1523; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
1524; VBITS_GE_256-NEXT:    ret
1525;
1526; VBITS_GE_512-LABEL: fminimumv_v8f64:
1527; VBITS_GE_512:       // %bb.0:
1528; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
1529; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
1530; VBITS_GE_512-NEXT:    fminv d0, p0, z0.d
1531; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
1532; VBITS_GE_512-NEXT:    ret
1533  %op = load <8 x double>, ptr %a
1534  %res = call double @llvm.vector.reduce.fminimum.v8f64(<8 x double> %op)
1535  ret double %res
1536}
1537
; <16 x double> with vscale_range(8,0): all runs expect one vl16 predicate, one load and a single FMINV.
1538define double @fminimumv_v16f64(ptr %a) vscale_range(8,0) #0 {
1539; CHECK-LABEL: fminimumv_v16f64:
1540; CHECK:       // %bb.0:
1541; CHECK-NEXT:    ptrue p0.d, vl16
1542; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1543; CHECK-NEXT:    fminv d0, p0, z0.d
1544; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1545; CHECK-NEXT:    ret
1546  %op = load <16 x double>, ptr %a
1547  %res = call double @llvm.vector.reduce.fminimum.v16f64(<16 x double> %op)
1548  ret double %res
1549}
1550
; <32 x double> with vscale_range(16,0): all runs expect one vl32 predicate, one load and a single FMINV.
1551define double @fminimumv_v32f64(ptr %a) vscale_range(16,0) #0 {
1552; CHECK-LABEL: fminimumv_v32f64:
1553; CHECK:       // %bb.0:
1554; CHECK-NEXT:    ptrue p0.d, vl32
1555; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1556; CHECK-NEXT:    fminv d0, p0, z0.d
1557; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1558; CHECK-NEXT:    ret
1559  %op = load <32 x double>, ptr %a
1560  %res = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %op)
1561  ret double %res
1562}
1563
; Attribute group #0: enables the SVE target feature for every function that references it.
1564attributes #0 = { "target-features"="+sve" }
1565
; Floating-point add reduction intrinsics; each takes a scalar start value followed by the vector operand.
1566declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
1567declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
1568declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
1569declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
1570declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
1571declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)
1572
1573declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
1574declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
1575declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
1576declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
1577declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
1578declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)
1579
1580declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
1581declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
1582declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
1583declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
1584declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
1585declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)
1586
; llvm.vector.reduce.fmax intrinsics (vector operand only) for the f16, f32 and f64 vector types.
1587declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
1588declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
1589declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
1590declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
1591declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
1592declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)
1593
1594declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
1595declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
1596declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
1597declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
1598declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
1599declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)
1600
1601declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
1602declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
1603declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
1604declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
1605declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
1606declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)
1607
; llvm.vector.reduce.fmin intrinsics (vector operand only) called by the fminv_* tests.
1608declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
1609declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
1610declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
1611declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
1612declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
1613declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)
1614
1615declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
1616declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
1617declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
1618declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
1619declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
1620declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)
1621
1622declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
1623declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
1624declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
1625declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
1626declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
1627declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)
1628
; llvm.vector.reduce.fmaximum intrinsics (vector operand only) called by the fmaximumv_* tests.
1629declare half @llvm.vector.reduce.fmaximum.v4f16(<4 x half>)
1630declare half @llvm.vector.reduce.fmaximum.v8f16(<8 x half>)
1631declare half @llvm.vector.reduce.fmaximum.v16f16(<16 x half>)
1632declare half @llvm.vector.reduce.fmaximum.v32f16(<32 x half>)
1633declare half @llvm.vector.reduce.fmaximum.v64f16(<64 x half>)
1634declare half @llvm.vector.reduce.fmaximum.v128f16(<128 x half>)
1635
1636declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
1637declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
1638declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>)
1639declare float @llvm.vector.reduce.fmaximum.v16f32(<16 x float>)
1640declare float @llvm.vector.reduce.fmaximum.v32f32(<32 x float>)
1641declare float @llvm.vector.reduce.fmaximum.v64f32(<64 x float>)
1642
1643declare double @llvm.vector.reduce.fmaximum.v1f64(<1 x double>)
1644declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
1645declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
1646declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>)
1647declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>)
1648declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
1649
; llvm.vector.reduce.fminimum intrinsics (vector operand only) called by the fminimumv_* tests.
1650declare half @llvm.vector.reduce.fminimum.v4f16(<4 x half>)
1651declare half @llvm.vector.reduce.fminimum.v8f16(<8 x half>)
1652declare half @llvm.vector.reduce.fminimum.v16f16(<16 x half>)
1653declare half @llvm.vector.reduce.fminimum.v32f16(<32 x half>)
1654declare half @llvm.vector.reduce.fminimum.v64f16(<64 x half>)
1655declare half @llvm.vector.reduce.fminimum.v128f16(<128 x half>)
1656
1657declare float @llvm.vector.reduce.fminimum.v2f32(<2 x float>)
1658declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
1659declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>)
1660declare float @llvm.vector.reduce.fminimum.v16f32(<16 x float>)
1661declare float @llvm.vector.reduce.fminimum.v32f32(<32 x float>)
1662declare float @llvm.vector.reduce.fminimum.v64f32(<64 x float>)
1663
1664declare double @llvm.vector.reduce.fminimum.v1f64(<1 x double>)
1665declare double @llvm.vector.reduce.fminimum.v2f64(<2 x double>)
1666declare double @llvm.vector.reduce.fminimum.v4f64(<4 x double>)
1667declare double @llvm.vector.reduce.fminimum.v8f64(<8 x double>)
1668declare double @llvm.vector.reduce.fminimum.v16f64(<16 x double>)
1669declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)
1670