; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
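; Note: the 2048-bit run reuses the VBITS_GE_512 check lines because the
; widest type exercised without a vscale_range attribute is 512 bits, so any
; larger minimum vector length generates identical code.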

target triple = "aarch64-unknown-linux-gnu"

;
; UADDV
;
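; SVE's UADDV always widens the accumulated sum to a 64-bit result in a D
; register, which is why the i8/i16/i32 cases below read the result back
; with "fmov w0, s0" (the low 32 bits of d0).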

; Don't use SVE for 64-bit vectors.
define i8 @uaddv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uaddv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @uaddv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
  ret i8 %res
}

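; When only a 256-bit vector length is guaranteed, the 512-bit operand is
; loaded as two halves which are added together before a single reduction.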
define i8 @uaddv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.b, z1.b, z0.b
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uaddv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uaddv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @uaddv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.h, z1.h, z0.h
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uaddv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uaddv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @uaddv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.s, z1.s, z0.s
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uaddv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Don't use SVE for 128-bit vectors.
define i64 @uaddv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uaddv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMAXV
;
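; Unlike UADDV, the SVE min/max reductions produce an element-sized result
; (b0/h0/s0/d0) rather than widening to a 64-bit accumulator.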

; Don't use SVE for 64-bit vectors.
define i8 @smaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @smaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @smaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    smaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    smaxv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smaxv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    smaxv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    smaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    smaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @smaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @smaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @smaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    smaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    smaxv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smaxv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    smaxv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    smaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    smaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @smaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @smaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @smaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    smaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    smaxv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smaxv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    smaxv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    smaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    smaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @smaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector SMAXV support. Use SVE.
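; The "kill" annotation below (here and in the analogous min/max cases)
; simply re-declares q0 as the low 128 bits of z0; it tracks register
; liveness and moves no data.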
623define i64 @smaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
624; CHECK-LABEL: smaxv_v2i64:
625; CHECK:       // %bb.0:
626; CHECK-NEXT:    ptrue p0.d, vl2
627; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
628; CHECK-NEXT:    smaxv d0, p0, z0.d
629; CHECK-NEXT:    fmov x0, d0
630; CHECK-NEXT:    ret
631  %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
632  ret i64 %res
633}
634
635define i64 @smaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
636; CHECK-LABEL: smaxv_v4i64:
637; CHECK:       // %bb.0:
638; CHECK-NEXT:    ptrue p0.d, vl4
639; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
640; CHECK-NEXT:    smaxv d0, p0, z0.d
641; CHECK-NEXT:    fmov x0, d0
642; CHECK-NEXT:    ret
643  %op = load <4 x i64>, ptr %a
644  %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
645  ret i64 %res
646}
647
648define i64 @smaxv_v8i64(ptr %a) #0 {
649; VBITS_GE_256-LABEL: smaxv_v8i64:
650; VBITS_GE_256:       // %bb.0:
651; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
652; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
653; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
654; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
655; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z1.d
656; VBITS_GE_256-NEXT:    smaxv d0, p0, z0.d
657; VBITS_GE_256-NEXT:    fmov x0, d0
658; VBITS_GE_256-NEXT:    ret
659;
660; VBITS_GE_512-LABEL: smaxv_v8i64:
661; VBITS_GE_512:       // %bb.0:
662; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
663; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
664; VBITS_GE_512-NEXT:    smaxv d0, p0, z0.d
665; VBITS_GE_512-NEXT:    fmov x0, d0
666; VBITS_GE_512-NEXT:    ret
667  %op = load <8 x i64>, ptr %a
668  %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
669  ret i64 %res
670}
671
672define i64 @smaxv_v16i64(ptr %a) vscale_range(8,0) #0 {
673; CHECK-LABEL: smaxv_v16i64:
674; CHECK:       // %bb.0:
675; CHECK-NEXT:    ptrue p0.d, vl16
676; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
677; CHECK-NEXT:    smaxv d0, p0, z0.d
678; CHECK-NEXT:    fmov x0, d0
679; CHECK-NEXT:    ret
680  %op = load <16 x i64>, ptr %a
681  %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
682  ret i64 %res
683}
684
685define i64 @smaxv_v32i64(ptr %a) vscale_range(16,0) #0 {
686; CHECK-LABEL: smaxv_v32i64:
687; CHECK:       // %bb.0:
688; CHECK-NEXT:    ptrue p0.d, vl32
689; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
690; CHECK-NEXT:    smaxv d0, p0, z0.d
691; CHECK-NEXT:    fmov x0, d0
692; CHECK-NEXT:    ret
693  %op = load <32 x i64>, ptr %a
694  %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
695  ret i64 %res
696}
697
698;
699; SMINV
700;
701
702; Don't use SVE for 64-bit vectors.
703define i8 @sminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
704; CHECK-LABEL: sminv_v8i8:
705; CHECK:       // %bb.0:
706; CHECK-NEXT:    sminv b0, v0.8b
707; CHECK-NEXT:    fmov w0, s0
708; CHECK-NEXT:    ret
709  %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
710  ret i8 %res
711}
712
713; Don't use SVE for 128-bit vectors.
714define i8 @sminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
715; CHECK-LABEL: sminv_v16i8:
716; CHECK:       // %bb.0:
717; CHECK-NEXT:    sminv b0, v0.16b
718; CHECK-NEXT:    fmov w0, s0
719; CHECK-NEXT:    ret
720  %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
721  ret i8 %res
722}
723
724define i8 @sminv_v32i8(ptr %a) vscale_range(2,0) #0 {
725; CHECK-LABEL: sminv_v32i8:
726; CHECK:       // %bb.0:
727; CHECK-NEXT:    ptrue p0.b, vl32
728; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
729; CHECK-NEXT:    sminv b0, p0, z0.b
730; CHECK-NEXT:    fmov w0, s0
731; CHECK-NEXT:    ret
732  %op = load <32 x i8>, ptr %a
733  %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
734  ret i8 %res
735}
736
737define i8 @sminv_v64i8(ptr %a) #0 {
738; VBITS_GE_256-LABEL: sminv_v64i8:
739; VBITS_GE_256:       // %bb.0:
740; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
741; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
742; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
743; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
744; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z1.b
745; VBITS_GE_256-NEXT:    sminv b0, p0, z0.b
746; VBITS_GE_256-NEXT:    fmov w0, s0
747; VBITS_GE_256-NEXT:    ret
748;
749; VBITS_GE_512-LABEL: sminv_v64i8:
750; VBITS_GE_512:       // %bb.0:
751; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
752; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
753; VBITS_GE_512-NEXT:    sminv b0, p0, z0.b
754; VBITS_GE_512-NEXT:    fmov w0, s0
755; VBITS_GE_512-NEXT:    ret
756  %op = load <64 x i8>, ptr %a
757  %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
758  ret i8 %res
759}
760
761define i8 @sminv_v128i8(ptr %a) vscale_range(8,0) #0 {
762; CHECK-LABEL: sminv_v128i8:
763; CHECK:       // %bb.0:
764; CHECK-NEXT:    ptrue p0.b, vl128
765; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
766; CHECK-NEXT:    sminv b0, p0, z0.b
767; CHECK-NEXT:    fmov w0, s0
768; CHECK-NEXT:    ret
769  %op = load <128 x i8>, ptr %a
770  %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
771  ret i8 %res
772}
773
774define i8 @sminv_v256i8(ptr %a) vscale_range(16,0) #0 {
775; CHECK-LABEL: sminv_v256i8:
776; CHECK:       // %bb.0:
777; CHECK-NEXT:    ptrue p0.b, vl256
778; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
779; CHECK-NEXT:    sminv b0, p0, z0.b
780; CHECK-NEXT:    fmov w0, s0
781; CHECK-NEXT:    ret
782  %op = load <256 x i8>, ptr %a
783  %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
784  ret i8 %res
785}
786
787; Don't use SVE for 64-bit vectors.
788define i16 @sminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
789; CHECK-LABEL: sminv_v4i16:
790; CHECK:       // %bb.0:
791; CHECK-NEXT:    sminv h0, v0.4h
792; CHECK-NEXT:    fmov w0, s0
793; CHECK-NEXT:    ret
794  %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
795  ret i16 %res
796}
797
798; Don't use SVE for 128-bit vectors.
799define i16 @sminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
800; CHECK-LABEL: sminv_v8i16:
801; CHECK:       // %bb.0:
802; CHECK-NEXT:    sminv h0, v0.8h
803; CHECK-NEXT:    fmov w0, s0
804; CHECK-NEXT:    ret
805  %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
806  ret i16 %res
807}
808
809define i16 @sminv_v16i16(ptr %a) vscale_range(2,0) #0 {
810; CHECK-LABEL: sminv_v16i16:
811; CHECK:       // %bb.0:
812; CHECK-NEXT:    ptrue p0.h, vl16
813; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
814; CHECK-NEXT:    sminv h0, p0, z0.h
815; CHECK-NEXT:    fmov w0, s0
816; CHECK-NEXT:    ret
817  %op = load <16 x i16>, ptr %a
818  %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
819  ret i16 %res
820}
821
822define i16 @sminv_v32i16(ptr %a) #0 {
823; VBITS_GE_256-LABEL: sminv_v32i16:
824; VBITS_GE_256:       // %bb.0:
825; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
826; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
827; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
828; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
829; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z1.h
830; VBITS_GE_256-NEXT:    sminv h0, p0, z0.h
831; VBITS_GE_256-NEXT:    fmov w0, s0
832; VBITS_GE_256-NEXT:    ret
833;
834; VBITS_GE_512-LABEL: sminv_v32i16:
835; VBITS_GE_512:       // %bb.0:
836; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
837; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
838; VBITS_GE_512-NEXT:    sminv h0, p0, z0.h
839; VBITS_GE_512-NEXT:    fmov w0, s0
840; VBITS_GE_512-NEXT:    ret
841  %op = load <32 x i16>, ptr %a
842  %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
843  ret i16 %res
844}
845
846define i16 @sminv_v64i16(ptr %a) vscale_range(8,0) #0 {
847; CHECK-LABEL: sminv_v64i16:
848; CHECK:       // %bb.0:
849; CHECK-NEXT:    ptrue p0.h, vl64
850; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
851; CHECK-NEXT:    sminv h0, p0, z0.h
852; CHECK-NEXT:    fmov w0, s0
853; CHECK-NEXT:    ret
854  %op = load <64 x i16>, ptr %a
855  %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
856  ret i16 %res
857}
858
859define i16 @sminv_v128i16(ptr %a) vscale_range(16,0) #0 {
860; CHECK-LABEL: sminv_v128i16:
861; CHECK:       // %bb.0:
862; CHECK-NEXT:    ptrue p0.h, vl128
863; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
864; CHECK-NEXT:    sminv h0, p0, z0.h
865; CHECK-NEXT:    fmov w0, s0
866; CHECK-NEXT:    ret
867  %op = load <128 x i16>, ptr %a
868  %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
869  ret i16 %res
870}
871
872; Don't use SVE for 64-bit vectors.
873define i32 @sminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
874; CHECK-LABEL: sminv_v2i32:
875; CHECK:       // %bb.0:
876; CHECK-NEXT:    sminp v0.2s, v0.2s, v0.2s
877; CHECK-NEXT:    fmov w0, s0
878; CHECK-NEXT:    ret
879  %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
880  ret i32 %res
881}
882
883; Don't use SVE for 128-bit vectors.
884define i32 @sminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
885; CHECK-LABEL: sminv_v4i32:
886; CHECK:       // %bb.0:
887; CHECK-NEXT:    sminv s0, v0.4s
888; CHECK-NEXT:    fmov w0, s0
889; CHECK-NEXT:    ret
890  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
891  ret i32 %res
892}
893
894define i32 @sminv_v8i32(ptr %a) vscale_range(2,0) #0 {
895; CHECK-LABEL: sminv_v8i32:
896; CHECK:       // %bb.0:
897; CHECK-NEXT:    ptrue p0.s, vl8
898; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
899; CHECK-NEXT:    sminv s0, p0, z0.s
900; CHECK-NEXT:    fmov w0, s0
901; CHECK-NEXT:    ret
902  %op = load <8 x i32>, ptr %a
903  %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
904  ret i32 %res
905}
906
907define i32 @sminv_v16i32(ptr %a) #0 {
908; VBITS_GE_256-LABEL: sminv_v16i32:
909; VBITS_GE_256:       // %bb.0:
910; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
911; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
912; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
913; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
914; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z1.s
915; VBITS_GE_256-NEXT:    sminv s0, p0, z0.s
916; VBITS_GE_256-NEXT:    fmov w0, s0
917; VBITS_GE_256-NEXT:    ret
918;
919; VBITS_GE_512-LABEL: sminv_v16i32:
920; VBITS_GE_512:       // %bb.0:
921; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
922; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
923; VBITS_GE_512-NEXT:    sminv s0, p0, z0.s
924; VBITS_GE_512-NEXT:    fmov w0, s0
925; VBITS_GE_512-NEXT:    ret
926  %op = load <16 x i32>, ptr %a
927  %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
928  ret i32 %res
929}
930
931define i32 @sminv_v32i32(ptr %a) vscale_range(8,0) #0 {
932; CHECK-LABEL: sminv_v32i32:
933; CHECK:       // %bb.0:
934; CHECK-NEXT:    ptrue p0.s, vl32
935; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
936; CHECK-NEXT:    sminv s0, p0, z0.s
937; CHECK-NEXT:    fmov w0, s0
938; CHECK-NEXT:    ret
939  %op = load <32 x i32>, ptr %a
940  %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
941  ret i32 %res
942}
943
944define i32 @sminv_v64i32(ptr %a) vscale_range(16,0) #0 {
945; CHECK-LABEL: sminv_v64i32:
946; CHECK:       // %bb.0:
947; CHECK-NEXT:    ptrue p0.s, vl64
948; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
949; CHECK-NEXT:    sminv s0, p0, z0.s
950; CHECK-NEXT:    fmov w0, s0
951; CHECK-NEXT:    ret
952  %op = load <64 x i32>, ptr %a
953  %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
954  ret i32 %res
955}
956
957; Nothing to do for single element vectors.
958define i64 @sminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
959; CHECK-LABEL: sminv_v1i64:
960; CHECK:       // %bb.0:
961; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
962; CHECK-NEXT:    fmov x0, d0
963; CHECK-NEXT:    ret
964  %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
965  ret i64 %res
966}
967
968; No NEON 64-bit vector SMINV support. Use SVE.
969define i64 @sminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
970; CHECK-LABEL: sminv_v2i64:
971; CHECK:       // %bb.0:
972; CHECK-NEXT:    ptrue p0.d, vl2
973; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
974; CHECK-NEXT:    sminv d0, p0, z0.d
975; CHECK-NEXT:    fmov x0, d0
976; CHECK-NEXT:    ret
977  %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
978  ret i64 %res
979}
980
981define i64 @sminv_v4i64(ptr %a) vscale_range(2,0) #0 {
982; CHECK-LABEL: sminv_v4i64:
983; CHECK:       // %bb.0:
984; CHECK-NEXT:    ptrue p0.d, vl4
985; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
986; CHECK-NEXT:    sminv d0, p0, z0.d
987; CHECK-NEXT:    fmov x0, d0
988; CHECK-NEXT:    ret
989  %op = load <4 x i64>, ptr %a
990  %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
991  ret i64 %res
992}
993
994define i64 @sminv_v8i64(ptr %a) #0 {
995; VBITS_GE_256-LABEL: sminv_v8i64:
996; VBITS_GE_256:       // %bb.0:
997; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
998; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
999; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1000; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
1001; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z1.d
1002; VBITS_GE_256-NEXT:    sminv d0, p0, z0.d
1003; VBITS_GE_256-NEXT:    fmov x0, d0
1004; VBITS_GE_256-NEXT:    ret
1005;
1006; VBITS_GE_512-LABEL: sminv_v8i64:
1007; VBITS_GE_512:       // %bb.0:
1008; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
1009; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
1010; VBITS_GE_512-NEXT:    sminv d0, p0, z0.d
1011; VBITS_GE_512-NEXT:    fmov x0, d0
1012; VBITS_GE_512-NEXT:    ret
1013  %op = load <8 x i64>, ptr %a
1014  %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
1015  ret i64 %res
1016}
1017
1018define i64 @sminv_v16i64(ptr %a) vscale_range(8,0) #0 {
1019; CHECK-LABEL: sminv_v16i64:
1020; CHECK:       // %bb.0:
1021; CHECK-NEXT:    ptrue p0.d, vl16
1022; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1023; CHECK-NEXT:    sminv d0, p0, z0.d
1024; CHECK-NEXT:    fmov x0, d0
1025; CHECK-NEXT:    ret
1026  %op = load <16 x i64>, ptr %a
1027  %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
1028  ret i64 %res
1029}
1030
1031define i64 @sminv_v32i64(ptr %a) vscale_range(16,0) #0 {
1032; CHECK-LABEL: sminv_v32i64:
1033; CHECK:       // %bb.0:
1034; CHECK-NEXT:    ptrue p0.d, vl32
1035; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1036; CHECK-NEXT:    sminv d0, p0, z0.d
1037; CHECK-NEXT:    fmov x0, d0
1038; CHECK-NEXT:    ret
1039  %op = load <32 x i64>, ptr %a
1040  %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
1041  ret i64 %res
1042}
1043
1044;
1045; UMAXV
1046;
1047
1048; Don't use SVE for 64-bit vectors.
1049define i8 @umaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
1050; CHECK-LABEL: umaxv_v8i8:
1051; CHECK:       // %bb.0:
1052; CHECK-NEXT:    umaxv b0, v0.8b
1053; CHECK-NEXT:    fmov w0, s0
1054; CHECK-NEXT:    ret
1055  %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
1056  ret i8 %res
1057}
1058
1059; Don't use SVE for 128-bit vectors.
1060define i8 @umaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
1061; CHECK-LABEL: umaxv_v16i8:
1062; CHECK:       // %bb.0:
1063; CHECK-NEXT:    umaxv b0, v0.16b
1064; CHECK-NEXT:    fmov w0, s0
1065; CHECK-NEXT:    ret
1066  %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
1067  ret i8 %res
1068}
1069
1070define i8 @umaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
1071; CHECK-LABEL: umaxv_v32i8:
1072; CHECK:       // %bb.0:
1073; CHECK-NEXT:    ptrue p0.b, vl32
1074; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
1075; CHECK-NEXT:    umaxv b0, p0, z0.b
1076; CHECK-NEXT:    fmov w0, s0
1077; CHECK-NEXT:    ret
1078  %op = load <32 x i8>, ptr %a
1079  %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
1080  ret i8 %res
1081}
1082
1083define i8 @umaxv_v64i8(ptr %a) #0 {
1084; VBITS_GE_256-LABEL: umaxv_v64i8:
1085; VBITS_GE_256:       // %bb.0:
1086; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
1087; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
1088; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
1089; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
1090; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z1.b
1091; VBITS_GE_256-NEXT:    umaxv b0, p0, z0.b
1092; VBITS_GE_256-NEXT:    fmov w0, s0
1093; VBITS_GE_256-NEXT:    ret
1094;
1095; VBITS_GE_512-LABEL: umaxv_v64i8:
1096; VBITS_GE_512:       // %bb.0:
1097; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
1098; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
1099; VBITS_GE_512-NEXT:    umaxv b0, p0, z0.b
1100; VBITS_GE_512-NEXT:    fmov w0, s0
1101; VBITS_GE_512-NEXT:    ret
1102  %op = load <64 x i8>, ptr %a
1103  %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
1104  ret i8 %res
1105}
1106
1107define i8 @umaxv_v128i8(ptr %a) vscale_range(8,0) #0 {
1108; CHECK-LABEL: umaxv_v128i8:
1109; CHECK:       // %bb.0:
1110; CHECK-NEXT:    ptrue p0.b, vl128
1111; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
1112; CHECK-NEXT:    umaxv b0, p0, z0.b
1113; CHECK-NEXT:    fmov w0, s0
1114; CHECK-NEXT:    ret
1115  %op = load <128 x i8>, ptr %a
1116  %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
1117  ret i8 %res
1118}
1119
1120define i8 @umaxv_v256i8(ptr %a) vscale_range(16,0) #0 {
1121; CHECK-LABEL: umaxv_v256i8:
1122; CHECK:       // %bb.0:
1123; CHECK-NEXT:    ptrue p0.b, vl256
1124; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
1125; CHECK-NEXT:    umaxv b0, p0, z0.b
1126; CHECK-NEXT:    fmov w0, s0
1127; CHECK-NEXT:    ret
1128  %op = load <256 x i8>, ptr %a
1129  %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
1130  ret i8 %res
1131}
1132
1133; Don't use SVE for 64-bit vectors.
1134define i16 @umaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
1135; CHECK-LABEL: umaxv_v4i16:
1136; CHECK:       // %bb.0:
1137; CHECK-NEXT:    umaxv h0, v0.4h
1138; CHECK-NEXT:    fmov w0, s0
1139; CHECK-NEXT:    ret
1140  %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
1141  ret i16 %res
1142}
1143
1144; Don't use SVE for 128-bit vectors.
1145define i16 @umaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
1146; CHECK-LABEL: umaxv_v8i16:
1147; CHECK:       // %bb.0:
1148; CHECK-NEXT:    umaxv h0, v0.8h
1149; CHECK-NEXT:    fmov w0, s0
1150; CHECK-NEXT:    ret
1151  %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
1152  ret i16 %res
1153}
1154
1155define i16 @umaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
1156; CHECK-LABEL: umaxv_v16i16:
1157; CHECK:       // %bb.0:
1158; CHECK-NEXT:    ptrue p0.h, vl16
1159; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1160; CHECK-NEXT:    umaxv h0, p0, z0.h
1161; CHECK-NEXT:    fmov w0, s0
1162; CHECK-NEXT:    ret
1163  %op = load <16 x i16>, ptr %a
1164  %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
1165  ret i16 %res
1166}
1167
1168define i16 @umaxv_v32i16(ptr %a) #0 {
1169; VBITS_GE_256-LABEL: umaxv_v32i16:
1170; VBITS_GE_256:       // %bb.0:
1171; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
1172; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
1173; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1174; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
1175; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z1.h
1176; VBITS_GE_256-NEXT:    umaxv h0, p0, z0.h
1177; VBITS_GE_256-NEXT:    fmov w0, s0
1178; VBITS_GE_256-NEXT:    ret
1179;
1180; VBITS_GE_512-LABEL: umaxv_v32i16:
1181; VBITS_GE_512:       // %bb.0:
1182; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
1183; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
1184; VBITS_GE_512-NEXT:    umaxv h0, p0, z0.h
1185; VBITS_GE_512-NEXT:    fmov w0, s0
1186; VBITS_GE_512-NEXT:    ret
1187  %op = load <32 x i16>, ptr %a
1188  %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
1189  ret i16 %res
1190}
1191
1192define i16 @umaxv_v64i16(ptr %a) vscale_range(8,0) #0 {
1193; CHECK-LABEL: umaxv_v64i16:
1194; CHECK:       // %bb.0:
1195; CHECK-NEXT:    ptrue p0.h, vl64
1196; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1197; CHECK-NEXT:    umaxv h0, p0, z0.h
1198; CHECK-NEXT:    fmov w0, s0
1199; CHECK-NEXT:    ret
1200  %op = load <64 x i16>, ptr %a
1201  %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
1202  ret i16 %res
1203}
1204
1205define i16 @umaxv_v128i16(ptr %a) vscale_range(16,0) #0 {
1206; CHECK-LABEL: umaxv_v128i16:
1207; CHECK:       // %bb.0:
1208; CHECK-NEXT:    ptrue p0.h, vl128
1209; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1210; CHECK-NEXT:    umaxv h0, p0, z0.h
1211; CHECK-NEXT:    fmov w0, s0
1212; CHECK-NEXT:    ret
1213  %op = load <128 x i16>, ptr %a
1214  %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
1215  ret i16 %res
1216}
1217
1218; Don't use SVE for 64-bit vectors.
1219define i32 @umaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
1220; CHECK-LABEL: umaxv_v2i32:
1221; CHECK:       // %bb.0:
1222; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
1223; CHECK-NEXT:    fmov w0, s0
1224; CHECK-NEXT:    ret
1225  %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
1226  ret i32 %res
1227}
1228
1229; Don't use SVE for 128-bit vectors.
1230define i32 @umaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
1231; CHECK-LABEL: umaxv_v4i32:
1232; CHECK:       // %bb.0:
1233; CHECK-NEXT:    umaxv s0, v0.4s
1234; CHECK-NEXT:    fmov w0, s0
1235; CHECK-NEXT:    ret
1236  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
1237  ret i32 %res
1238}
1239
1240define i32 @umaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
1241; CHECK-LABEL: umaxv_v8i32:
1242; CHECK:       // %bb.0:
1243; CHECK-NEXT:    ptrue p0.s, vl8
1244; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1245; CHECK-NEXT:    umaxv s0, p0, z0.s
1246; CHECK-NEXT:    fmov w0, s0
1247; CHECK-NEXT:    ret
1248  %op = load <8 x i32>, ptr %a
1249  %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
1250  ret i32 %res
1251}
1252
1253define i32 @umaxv_v16i32(ptr %a) #0 {
1254; VBITS_GE_256-LABEL: umaxv_v16i32:
1255; VBITS_GE_256:       // %bb.0:
1256; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
1257; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
1258; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1259; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
1260; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z1.s
1261; VBITS_GE_256-NEXT:    umaxv s0, p0, z0.s
1262; VBITS_GE_256-NEXT:    fmov w0, s0
1263; VBITS_GE_256-NEXT:    ret
1264;
1265; VBITS_GE_512-LABEL: umaxv_v16i32:
1266; VBITS_GE_512:       // %bb.0:
1267; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
1268; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
1269; VBITS_GE_512-NEXT:    umaxv s0, p0, z0.s
1270; VBITS_GE_512-NEXT:    fmov w0, s0
1271; VBITS_GE_512-NEXT:    ret
1272  %op = load <16 x i32>, ptr %a
1273  %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
1274  ret i32 %res
1275}
1276
1277define i32 @umaxv_v32i32(ptr %a) vscale_range(8,0) #0 {
1278; CHECK-LABEL: umaxv_v32i32:
1279; CHECK:       // %bb.0:
1280; CHECK-NEXT:    ptrue p0.s, vl32
1281; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1282; CHECK-NEXT:    umaxv s0, p0, z0.s
1283; CHECK-NEXT:    fmov w0, s0
1284; CHECK-NEXT:    ret
1285  %op = load <32 x i32>, ptr %a
1286  %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
1287  ret i32 %res
1288}
1289
1290define i32 @umaxv_v64i32(ptr %a) vscale_range(16,0) #0 {
1291; CHECK-LABEL: umaxv_v64i32:
1292; CHECK:       // %bb.0:
1293; CHECK-NEXT:    ptrue p0.s, vl64
1294; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1295; CHECK-NEXT:    umaxv s0, p0, z0.s
1296; CHECK-NEXT:    fmov w0, s0
1297; CHECK-NEXT:    ret
1298  %op = load <64 x i32>, ptr %a
1299  %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
1300  ret i32 %res
1301}
1302
1303; Nothing to do for single element vectors.
1304define i64 @umaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
1305; CHECK-LABEL: umaxv_v1i64:
1306; CHECK:       // %bb.0:
1307; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
1308; CHECK-NEXT:    fmov x0, d0
1309; CHECK-NEXT:    ret
1310  %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
1311  ret i64 %res
1312}
1313
1314; No NEON 64-bit vector UMAXV support. Use SVE.
1315define i64 @umaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
1316; CHECK-LABEL: umaxv_v2i64:
1317; CHECK:       // %bb.0:
1318; CHECK-NEXT:    ptrue p0.d, vl2
1319; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
1320; CHECK-NEXT:    umaxv d0, p0, z0.d
1321; CHECK-NEXT:    fmov x0, d0
1322; CHECK-NEXT:    ret
1323  %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
1324  ret i64 %res
1325}
1326
1327define i64 @umaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
1328; CHECK-LABEL: umaxv_v4i64:
1329; CHECK:       // %bb.0:
1330; CHECK-NEXT:    ptrue p0.d, vl4
1331; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1332; CHECK-NEXT:    umaxv d0, p0, z0.d
1333; CHECK-NEXT:    fmov x0, d0
1334; CHECK-NEXT:    ret
1335  %op = load <4 x i64>, ptr %a
1336  %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
1337  ret i64 %res
1338}
1339
1340define i64 @umaxv_v8i64(ptr %a) #0 {
1341; VBITS_GE_256-LABEL: umaxv_v8i64:
1342; VBITS_GE_256:       // %bb.0:
1343; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
1344; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
1345; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1346; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
1347; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z1.d
1348; VBITS_GE_256-NEXT:    umaxv d0, p0, z0.d
1349; VBITS_GE_256-NEXT:    fmov x0, d0
1350; VBITS_GE_256-NEXT:    ret
1351;
1352; VBITS_GE_512-LABEL: umaxv_v8i64:
1353; VBITS_GE_512:       // %bb.0:
1354; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
1355; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
1356; VBITS_GE_512-NEXT:    umaxv d0, p0, z0.d
1357; VBITS_GE_512-NEXT:    fmov x0, d0
1358; VBITS_GE_512-NEXT:    ret
1359  %op = load <8 x i64>, ptr %a
1360  %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
1361  ret i64 %res
1362}
1363
1364define i64 @umaxv_v16i64(ptr %a) vscale_range(8,0) #0 {
1365; CHECK-LABEL: umaxv_v16i64:
1366; CHECK:       // %bb.0:
1367; CHECK-NEXT:    ptrue p0.d, vl16
1368; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1369; CHECK-NEXT:    umaxv d0, p0, z0.d
1370; CHECK-NEXT:    fmov x0, d0
1371; CHECK-NEXT:    ret
1372  %op = load <16 x i64>, ptr %a
1373  %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
1374  ret i64 %res
1375}
1376
1377define i64 @umaxv_v32i64(ptr %a) vscale_range(16,0) #0 {
1378; CHECK-LABEL: umaxv_v32i64:
1379; CHECK:       // %bb.0:
1380; CHECK-NEXT:    ptrue p0.d, vl32
1381; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1382; CHECK-NEXT:    umaxv d0, p0, z0.d
1383; CHECK-NEXT:    fmov x0, d0
1384; CHECK-NEXT:    ret
1385  %op = load <32 x i64>, ptr %a
1386  %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
1387  ret i64 %res
1388}
1389
1390;
1391; UMINV
1392;
1393
1394; Don't use SVE for 64-bit vectors.
1395define i8 @uminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
1396; CHECK-LABEL: uminv_v8i8:
1397; CHECK:       // %bb.0:
1398; CHECK-NEXT:    uminv b0, v0.8b
1399; CHECK-NEXT:    fmov w0, s0
1400; CHECK-NEXT:    ret
1401  %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
1402  ret i8 %res
1403}
1404
1405; Don't use SVE for 128-bit vectors.
1406define i8 @uminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
1407; CHECK-LABEL: uminv_v16i8:
1408; CHECK:       // %bb.0:
1409; CHECK-NEXT:    uminv b0, v0.16b
1410; CHECK-NEXT:    fmov w0, s0
1411; CHECK-NEXT:    ret
1412  %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
1413  ret i8 %res
1414}
1415
1416define i8 @uminv_v32i8(ptr %a) vscale_range(2,0) #0 {
1417; CHECK-LABEL: uminv_v32i8:
1418; CHECK:       // %bb.0:
1419; CHECK-NEXT:    ptrue p0.b, vl32
1420; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
1421; CHECK-NEXT:    uminv b0, p0, z0.b
1422; CHECK-NEXT:    fmov w0, s0
1423; CHECK-NEXT:    ret
1424  %op = load <32 x i8>, ptr %a
1425  %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
1426  ret i8 %res
1427}
1428
1429define i8 @uminv_v64i8(ptr %a) #0 {
1430; VBITS_GE_256-LABEL: uminv_v64i8:
1431; VBITS_GE_256:       // %bb.0:
1432; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
1433; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
1434; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
1435; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
1436; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z1.b
1437; VBITS_GE_256-NEXT:    uminv b0, p0, z0.b
1438; VBITS_GE_256-NEXT:    fmov w0, s0
1439; VBITS_GE_256-NEXT:    ret
1440;
1441; VBITS_GE_512-LABEL: uminv_v64i8:
1442; VBITS_GE_512:       // %bb.0:
1443; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
1444; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
1445; VBITS_GE_512-NEXT:    uminv b0, p0, z0.b
1446; VBITS_GE_512-NEXT:    fmov w0, s0
1447; VBITS_GE_512-NEXT:    ret
1448  %op = load <64 x i8>, ptr %a
1449  %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
1450  ret i8 %res
1451}
1452
1453define i8 @uminv_v128i8(ptr %a) vscale_range(8,0) #0 {
1454; CHECK-LABEL: uminv_v128i8:
1455; CHECK:       // %bb.0:
1456; CHECK-NEXT:    ptrue p0.b, vl128
1457; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
1458; CHECK-NEXT:    uminv b0, p0, z0.b
1459; CHECK-NEXT:    fmov w0, s0
1460; CHECK-NEXT:    ret
1461  %op = load <128 x i8>, ptr %a
1462  %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
1463  ret i8 %res
1464}
1465
1466define i8 @uminv_v256i8(ptr %a) vscale_range(16,0) #0 {
1467; CHECK-LABEL: uminv_v256i8:
1468; CHECK:       // %bb.0:
1469; CHECK-NEXT:    ptrue p0.b, vl256
1470; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
1471; CHECK-NEXT:    uminv b0, p0, z0.b
1472; CHECK-NEXT:    fmov w0, s0
1473; CHECK-NEXT:    ret
1474  %op = load <256 x i8>, ptr %a
1475  %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
1476  ret i8 %res
1477}
1478
1479; Don't use SVE for 64-bit vectors.
1480define i16 @uminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
1481; CHECK-LABEL: uminv_v4i16:
1482; CHECK:       // %bb.0:
1483; CHECK-NEXT:    uminv h0, v0.4h
1484; CHECK-NEXT:    fmov w0, s0
1485; CHECK-NEXT:    ret
1486  %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
1487  ret i16 %res
1488}
1489
1490; Don't use SVE for 128-bit vectors.
1491define i16 @uminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
1492; CHECK-LABEL: uminv_v8i16:
1493; CHECK:       // %bb.0:
1494; CHECK-NEXT:    uminv h0, v0.8h
1495; CHECK-NEXT:    fmov w0, s0
1496; CHECK-NEXT:    ret
1497  %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
1498  ret i16 %res
1499}
1500
1501define i16 @uminv_v16i16(ptr %a) vscale_range(2,0) #0 {
1502; CHECK-LABEL: uminv_v16i16:
1503; CHECK:       // %bb.0:
1504; CHECK-NEXT:    ptrue p0.h, vl16
1505; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1506; CHECK-NEXT:    uminv h0, p0, z0.h
1507; CHECK-NEXT:    fmov w0, s0
1508; CHECK-NEXT:    ret
1509  %op = load <16 x i16>, ptr %a
1510  %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
1511  ret i16 %res
1512}
1513
1514define i16 @uminv_v32i16(ptr %a) #0 {
1515; VBITS_GE_256-LABEL: uminv_v32i16:
1516; VBITS_GE_256:       // %bb.0:
1517; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
1518; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
1519; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1520; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
1521; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z1.h
1522; VBITS_GE_256-NEXT:    uminv h0, p0, z0.h
1523; VBITS_GE_256-NEXT:    fmov w0, s0
1524; VBITS_GE_256-NEXT:    ret
1525;
1526; VBITS_GE_512-LABEL: uminv_v32i16:
1527; VBITS_GE_512:       // %bb.0:
1528; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
1529; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
1530; VBITS_GE_512-NEXT:    uminv h0, p0, z0.h
1531; VBITS_GE_512-NEXT:    fmov w0, s0
1532; VBITS_GE_512-NEXT:    ret
1533  %op = load <32 x i16>, ptr %a
1534  %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
1535  ret i16 %res
1536}
1537
1538define i16 @uminv_v64i16(ptr %a) vscale_range(8,0) #0 {
1539; CHECK-LABEL: uminv_v64i16:
1540; CHECK:       // %bb.0:
1541; CHECK-NEXT:    ptrue p0.h, vl64
1542; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1543; CHECK-NEXT:    uminv h0, p0, z0.h
1544; CHECK-NEXT:    fmov w0, s0
1545; CHECK-NEXT:    ret
1546  %op = load <64 x i16>, ptr %a
1547  %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
1548  ret i16 %res
1549}
1550
1551define i16 @uminv_v128i16(ptr %a) vscale_range(16,0) #0 {
1552; CHECK-LABEL: uminv_v128i16:
1553; CHECK:       // %bb.0:
1554; CHECK-NEXT:    ptrue p0.h, vl128
1555; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
1556; CHECK-NEXT:    uminv h0, p0, z0.h
1557; CHECK-NEXT:    fmov w0, s0
1558; CHECK-NEXT:    ret
1559  %op = load <128 x i16>, ptr %a
1560  %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
1561  ret i16 %res
1562}
1563
1564; Don't use SVE for 64-bit vectors.
1565define i32 @uminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
1566; CHECK-LABEL: uminv_v2i32:
1567; CHECK:       // %bb.0:
1568; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
1569; CHECK-NEXT:    fmov w0, s0
1570; CHECK-NEXT:    ret
1571  %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
1572  ret i32 %res
1573}
1574
1575; Don't use SVE for 128-bit vectors.
1576define i32 @uminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
1577; CHECK-LABEL: uminv_v4i32:
1578; CHECK:       // %bb.0:
1579; CHECK-NEXT:    uminv s0, v0.4s
1580; CHECK-NEXT:    fmov w0, s0
1581; CHECK-NEXT:    ret
1582  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
1583  ret i32 %res
1584}
1585
1586define i32 @uminv_v8i32(ptr %a) vscale_range(2,0) #0 {
1587; CHECK-LABEL: uminv_v8i32:
1588; CHECK:       // %bb.0:
1589; CHECK-NEXT:    ptrue p0.s, vl8
1590; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1591; CHECK-NEXT:    uminv s0, p0, z0.s
1592; CHECK-NEXT:    fmov w0, s0
1593; CHECK-NEXT:    ret
1594  %op = load <8 x i32>, ptr %a
1595  %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
1596  ret i32 %res
1597}
1598
1599define i32 @uminv_v16i32(ptr %a) #0 {
1600; VBITS_GE_256-LABEL: uminv_v16i32:
1601; VBITS_GE_256:       // %bb.0:
1602; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
1603; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
1604; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1605; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
1606; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z1.s
1607; VBITS_GE_256-NEXT:    uminv s0, p0, z0.s
1608; VBITS_GE_256-NEXT:    fmov w0, s0
1609; VBITS_GE_256-NEXT:    ret
1610;
1611; VBITS_GE_512-LABEL: uminv_v16i32:
1612; VBITS_GE_512:       // %bb.0:
1613; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
1614; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
1615; VBITS_GE_512-NEXT:    uminv s0, p0, z0.s
1616; VBITS_GE_512-NEXT:    fmov w0, s0
1617; VBITS_GE_512-NEXT:    ret
1618  %op = load <16 x i32>, ptr %a
1619  %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
1620  ret i32 %res
1621}
1622
1623define i32 @uminv_v32i32(ptr %a) vscale_range(8,0) #0 {
1624; CHECK-LABEL: uminv_v32i32:
1625; CHECK:       // %bb.0:
1626; CHECK-NEXT:    ptrue p0.s, vl32
1627; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1628; CHECK-NEXT:    uminv s0, p0, z0.s
1629; CHECK-NEXT:    fmov w0, s0
1630; CHECK-NEXT:    ret
1631  %op = load <32 x i32>, ptr %a
1632  %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
1633  ret i32 %res
1634}
1635
1636define i32 @uminv_v64i32(ptr %a) vscale_range(16,0) #0 {
1637; CHECK-LABEL: uminv_v64i32:
1638; CHECK:       // %bb.0:
1639; CHECK-NEXT:    ptrue p0.s, vl64
1640; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1641; CHECK-NEXT:    uminv s0, p0, z0.s
1642; CHECK-NEXT:    fmov w0, s0
1643; CHECK-NEXT:    ret
1644  %op = load <64 x i32>, ptr %a
1645  %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
1646  ret i32 %res
1647}
1648
1649; Nothing to do for single element vectors.
1650define i64 @uminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
1651; CHECK-LABEL: uminv_v1i64:
1652; CHECK:       // %bb.0:
1653; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
1654; CHECK-NEXT:    fmov x0, d0
1655; CHECK-NEXT:    ret
1656  %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
1657  ret i64 %res
1658}
1659
1660; No NEON 64-bit vector UMINV support. Use SVE.
define i64 @uminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uminv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    uminv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uminv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uminv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

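; Declarations for every reduction intrinsic exercised above.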
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)
