1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
3; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
4
5declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
6
7define i8 @vreduce_add_v1i8(<1 x i8> %v) {
8; CHECK-LABEL: vreduce_add_v1i8:
9; CHECK:       # %bb.0:
10; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
11; CHECK-NEXT:    vmv.x.s a0, v8
12; CHECK-NEXT:    ret
13  %red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v)
14  ret i8 %red
15}
16
17declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
18
19define i8 @vreduce_add_v2i8(ptr %x) {
20; CHECK-LABEL: vreduce_add_v2i8:
21; CHECK:       # %bb.0:
22; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
23; CHECK-NEXT:    vle8.v v8, (a0)
24; CHECK-NEXT:    vmv.s.x v9, zero
25; CHECK-NEXT:    vredsum.vs v8, v8, v9
26; CHECK-NEXT:    vmv.x.s a0, v8
27; CHECK-NEXT:    ret
28  %v = load <2 x i8>, ptr %x
29  %red = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v)
30  ret i8 %red
31}
32
33declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>)
34
35define i8 @vreduce_add_v3i8(ptr %x) {
36; CHECK-LABEL: vreduce_add_v3i8:
37; CHECK:       # %bb.0:
38; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
39; CHECK-NEXT:    vle8.v v8, (a0)
40; CHECK-NEXT:    vmv.s.x v9, zero
41; CHECK-NEXT:    vredsum.vs v8, v8, v9
42; CHECK-NEXT:    vmv.x.s a0, v8
43; CHECK-NEXT:    ret
44  %v = load <3 x i8>, ptr %x
45  %red = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %v)
46  ret i8 %red
47}
48
49declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
50
51define i8 @vreduce_add_v4i8(ptr %x) {
52; CHECK-LABEL: vreduce_add_v4i8:
53; CHECK:       # %bb.0:
54; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
55; CHECK-NEXT:    vle8.v v8, (a0)
56; CHECK-NEXT:    vmv.s.x v9, zero
57; CHECK-NEXT:    vredsum.vs v8, v8, v9
58; CHECK-NEXT:    vmv.x.s a0, v8
59; CHECK-NEXT:    ret
60  %v = load <4 x i8>, ptr %x
61  %red = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v)
62  ret i8 %red
63}
64
65declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
66
67define i8 @vreduce_add_v8i8(ptr %x) {
68; CHECK-LABEL: vreduce_add_v8i8:
69; CHECK:       # %bb.0:
70; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
71; CHECK-NEXT:    vle8.v v8, (a0)
72; CHECK-NEXT:    vmv.s.x v9, zero
73; CHECK-NEXT:    vredsum.vs v8, v8, v9
74; CHECK-NEXT:    vmv.x.s a0, v8
75; CHECK-NEXT:    ret
76  %v = load <8 x i8>, ptr %x
77  %red = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
78  ret i8 %red
79}
80
81declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
82
83define i8 @vreduce_add_v16i8(ptr %x) {
84; CHECK-LABEL: vreduce_add_v16i8:
85; CHECK:       # %bb.0:
86; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
87; CHECK-NEXT:    vle8.v v8, (a0)
88; CHECK-NEXT:    vmv.s.x v9, zero
89; CHECK-NEXT:    vredsum.vs v8, v8, v9
90; CHECK-NEXT:    vmv.x.s a0, v8
91; CHECK-NEXT:    ret
92  %v = load <16 x i8>, ptr %x
93  %red = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
94  ret i8 %red
95}
96
97declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
98
99define i8 @vreduce_add_v32i8(ptr %x) {
100; CHECK-LABEL: vreduce_add_v32i8:
101; CHECK:       # %bb.0:
102; CHECK-NEXT:    li a1, 32
103; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
104; CHECK-NEXT:    vle8.v v8, (a0)
105; CHECK-NEXT:    vmv.s.x v10, zero
106; CHECK-NEXT:    vredsum.vs v8, v8, v10
107; CHECK-NEXT:    vmv.x.s a0, v8
108; CHECK-NEXT:    ret
109  %v = load <32 x i8>, ptr %x
110  %red = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v)
111  ret i8 %red
112}
113
114declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
115
116define i8 @vreduce_add_v64i8(ptr %x) {
117; CHECK-LABEL: vreduce_add_v64i8:
118; CHECK:       # %bb.0:
119; CHECK-NEXT:    li a1, 64
120; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
121; CHECK-NEXT:    vle8.v v8, (a0)
122; CHECK-NEXT:    vmv.s.x v12, zero
123; CHECK-NEXT:    vredsum.vs v8, v8, v12
124; CHECK-NEXT:    vmv.x.s a0, v8
125; CHECK-NEXT:    ret
126  %v = load <64 x i8>, ptr %x
127  %red = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %v)
128  ret i8 %red
129}
130
131declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
132
133define i8 @vreduce_add_v128i8(ptr %x) {
134; CHECK-LABEL: vreduce_add_v128i8:
135; CHECK:       # %bb.0:
136; CHECK-NEXT:    li a1, 128
137; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
138; CHECK-NEXT:    vle8.v v8, (a0)
139; CHECK-NEXT:    vmv.s.x v16, zero
140; CHECK-NEXT:    vredsum.vs v8, v8, v16
141; CHECK-NEXT:    vmv.x.s a0, v8
142; CHECK-NEXT:    ret
143  %v = load <128 x i8>, ptr %x
144  %red = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %v)
145  ret i8 %red
146}
147
148declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)
149
150define i8 @vreduce_add_v256i8(ptr %x) {
151; CHECK-LABEL: vreduce_add_v256i8:
152; CHECK:       # %bb.0:
153; CHECK-NEXT:    li a1, 128
154; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
155; CHECK-NEXT:    vle8.v v8, (a0)
156; CHECK-NEXT:    addi a0, a0, 128
157; CHECK-NEXT:    vle8.v v16, (a0)
158; CHECK-NEXT:    vadd.vv v8, v8, v16
159; CHECK-NEXT:    vmv.s.x v16, zero
160; CHECK-NEXT:    vredsum.vs v8, v8, v16
161; CHECK-NEXT:    vmv.x.s a0, v8
162; CHECK-NEXT:    ret
163  %v = load <256 x i8>, ptr %x
164  %red = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %v)
165  ret i8 %red
166}
167
168declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
169
170define i16 @vreduce_add_v1i16(<1 x i16> %v) {
171; CHECK-LABEL: vreduce_add_v1i16:
172; CHECK:       # %bb.0:
173; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
174; CHECK-NEXT:    vmv.x.s a0, v8
175; CHECK-NEXT:    ret
176  %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %v)
177  ret i16 %red
178}
179
180define i16 @vwreduce_add_v1i16(<1 x i8> %v) {
181; CHECK-LABEL: vwreduce_add_v1i16:
182; CHECK:       # %bb.0:
183; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
184; CHECK-NEXT:    vsext.vf2 v9, v8
185; CHECK-NEXT:    vmv.x.s a0, v9
186; CHECK-NEXT:    ret
187  %e = sext <1 x i8> %v to <1 x i16>
188  %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
189  ret i16 %red
190}
191
192define i16 @vwreduce_uadd_v1i16(<1 x i8> %v) {
193; CHECK-LABEL: vwreduce_uadd_v1i16:
194; CHECK:       # %bb.0:
195; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
196; CHECK-NEXT:    vzext.vf2 v9, v8
197; CHECK-NEXT:    vmv.x.s a0, v9
198; CHECK-NEXT:    ret
199  %e = zext <1 x i8> %v to <1 x i16>
200  %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
201  ret i16 %red
202}
203
204declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
205
206define i16 @vreduce_add_v2i16(ptr %x) {
207; CHECK-LABEL: vreduce_add_v2i16:
208; CHECK:       # %bb.0:
209; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
210; CHECK-NEXT:    vle16.v v8, (a0)
211; CHECK-NEXT:    vmv.s.x v9, zero
212; CHECK-NEXT:    vredsum.vs v8, v8, v9
213; CHECK-NEXT:    vmv.x.s a0, v8
214; CHECK-NEXT:    ret
215  %v = load <2 x i16>, ptr %x
216  %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %v)
217  ret i16 %red
218}
219
220define i16 @vwreduce_add_v2i16(ptr %x) {
221; CHECK-LABEL: vwreduce_add_v2i16:
222; CHECK:       # %bb.0:
223; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
224; CHECK-NEXT:    vle8.v v8, (a0)
225; CHECK-NEXT:    vmv.s.x v9, zero
226; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
227; CHECK-NEXT:    vwredsum.vs v8, v8, v9
228; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
229; CHECK-NEXT:    vmv.x.s a0, v8
230; CHECK-NEXT:    ret
231  %v = load <2 x i8>, ptr %x
232  %e = sext <2 x i8> %v to <2 x i16>
233  %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
234  ret i16 %red
235}
236
237define i16 @vwreduce_uadd_v2i16(ptr %x) {
238; CHECK-LABEL: vwreduce_uadd_v2i16:
239; CHECK:       # %bb.0:
240; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
241; CHECK-NEXT:    vle8.v v8, (a0)
242; CHECK-NEXT:    vmv.s.x v9, zero
243; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
244; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
245; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
246; CHECK-NEXT:    vmv.x.s a0, v8
247; CHECK-NEXT:    ret
248  %v = load <2 x i8>, ptr %x
249  %e = zext <2 x i8> %v to <2 x i16>
250  %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
251  ret i16 %red
252}
253
254declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
255
256define i16 @vreduce_add_v4i16(ptr %x) {
257; CHECK-LABEL: vreduce_add_v4i16:
258; CHECK:       # %bb.0:
259; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
260; CHECK-NEXT:    vle16.v v8, (a0)
261; CHECK-NEXT:    vmv.s.x v9, zero
262; CHECK-NEXT:    vredsum.vs v8, v8, v9
263; CHECK-NEXT:    vmv.x.s a0, v8
264; CHECK-NEXT:    ret
265  %v = load <4 x i16>, ptr %x
266  %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
267  ret i16 %red
268}
269
270define i16 @vwreduce_add_v4i16(ptr %x) {
271; CHECK-LABEL: vwreduce_add_v4i16:
272; CHECK:       # %bb.0:
273; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
274; CHECK-NEXT:    vle8.v v8, (a0)
275; CHECK-NEXT:    vmv.s.x v9, zero
276; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
277; CHECK-NEXT:    vwredsum.vs v8, v8, v9
278; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
279; CHECK-NEXT:    vmv.x.s a0, v8
280; CHECK-NEXT:    ret
281  %v = load <4 x i8>, ptr %x
282  %e = sext <4 x i8> %v to <4 x i16>
283  %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
284  ret i16 %red
285}
286
287define i16 @vwreduce_uadd_v4i16(ptr %x) {
288; CHECK-LABEL: vwreduce_uadd_v4i16:
289; CHECK:       # %bb.0:
290; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
291; CHECK-NEXT:    vle8.v v8, (a0)
292; CHECK-NEXT:    vmv.s.x v9, zero
293; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
294; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
295; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
296; CHECK-NEXT:    vmv.x.s a0, v8
297; CHECK-NEXT:    ret
298  %v = load <4 x i8>, ptr %x
299  %e = zext <4 x i8> %v to <4 x i16>
300  %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
301  ret i16 %red
302}
303
304declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
305
306define i16 @vreduce_add_v8i16(ptr %x) {
307; CHECK-LABEL: vreduce_add_v8i16:
308; CHECK:       # %bb.0:
309; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
310; CHECK-NEXT:    vle16.v v8, (a0)
311; CHECK-NEXT:    vmv.s.x v9, zero
312; CHECK-NEXT:    vredsum.vs v8, v8, v9
313; CHECK-NEXT:    vmv.x.s a0, v8
314; CHECK-NEXT:    ret
315  %v = load <8 x i16>, ptr %x
316  %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
317  ret i16 %red
318}
319
320define i16 @vwreduce_add_v8i16(ptr %x) {
321; CHECK-LABEL: vwreduce_add_v8i16:
322; CHECK:       # %bb.0:
323; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
324; CHECK-NEXT:    vle8.v v8, (a0)
325; CHECK-NEXT:    vmv.s.x v9, zero
326; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
327; CHECK-NEXT:    vwredsum.vs v8, v8, v9
328; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
329; CHECK-NEXT:    vmv.x.s a0, v8
330; CHECK-NEXT:    ret
331  %v = load <8 x i8>, ptr %x
332  %e = sext <8 x i8> %v to <8 x i16>
333  %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
334  ret i16 %red
335}
336
337define i16 @vwreduce_uadd_v8i16(ptr %x) {
338; CHECK-LABEL: vwreduce_uadd_v8i16:
339; CHECK:       # %bb.0:
340; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
341; CHECK-NEXT:    vle8.v v8, (a0)
342; CHECK-NEXT:    vmv.s.x v9, zero
343; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
344; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
345; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
346; CHECK-NEXT:    vmv.x.s a0, v8
347; CHECK-NEXT:    ret
348  %v = load <8 x i8>, ptr %x
349  %e = zext <8 x i8> %v to <8 x i16>
350  %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
351  ret i16 %red
352}
353
354declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
355
356define i16 @vreduce_add_v16i16(ptr %x) {
357; CHECK-LABEL: vreduce_add_v16i16:
358; CHECK:       # %bb.0:
359; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
360; CHECK-NEXT:    vle16.v v8, (a0)
361; CHECK-NEXT:    vmv.s.x v10, zero
362; CHECK-NEXT:    vredsum.vs v8, v8, v10
363; CHECK-NEXT:    vmv.x.s a0, v8
364; CHECK-NEXT:    ret
365  %v = load <16 x i16>, ptr %x
366  %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v)
367  ret i16 %red
368}
369
370define i16 @vwreduce_add_v16i16(ptr %x) {
371; CHECK-LABEL: vwreduce_add_v16i16:
372; CHECK:       # %bb.0:
373; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
374; CHECK-NEXT:    vle8.v v8, (a0)
375; CHECK-NEXT:    vmv.s.x v9, zero
376; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
377; CHECK-NEXT:    vwredsum.vs v8, v8, v9
378; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
379; CHECK-NEXT:    vmv.x.s a0, v8
380; CHECK-NEXT:    ret
381  %v = load <16 x i8>, ptr %x
382  %e = sext <16 x i8> %v to <16 x i16>
383  %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
384  ret i16 %red
385}
386
387define i16 @vwreduce_uadd_v16i16(ptr %x) {
388; CHECK-LABEL: vwreduce_uadd_v16i16:
389; CHECK:       # %bb.0:
390; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
391; CHECK-NEXT:    vle8.v v8, (a0)
392; CHECK-NEXT:    vmv.s.x v9, zero
393; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
394; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
395; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
396; CHECK-NEXT:    vmv.x.s a0, v8
397; CHECK-NEXT:    ret
398  %v = load <16 x i8>, ptr %x
399  %e = zext <16 x i8> %v to <16 x i16>
400  %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
401  ret i16 %red
402}
403
404declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
405
406define i16 @vreduce_add_v32i16(ptr %x) {
407; CHECK-LABEL: vreduce_add_v32i16:
408; CHECK:       # %bb.0:
409; CHECK-NEXT:    li a1, 32
410; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
411; CHECK-NEXT:    vle16.v v8, (a0)
412; CHECK-NEXT:    vmv.s.x v12, zero
413; CHECK-NEXT:    vredsum.vs v8, v8, v12
414; CHECK-NEXT:    vmv.x.s a0, v8
415; CHECK-NEXT:    ret
416  %v = load <32 x i16>, ptr %x
417  %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %v)
418  ret i16 %red
419}
420
421define i16 @vwreduce_add_v32i16(ptr %x) {
422; CHECK-LABEL: vwreduce_add_v32i16:
423; CHECK:       # %bb.0:
424; CHECK-NEXT:    li a1, 32
425; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
426; CHECK-NEXT:    vle8.v v8, (a0)
427; CHECK-NEXT:    vmv.s.x v10, zero
428; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
429; CHECK-NEXT:    vwredsum.vs v8, v8, v10
430; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
431; CHECK-NEXT:    vmv.x.s a0, v8
432; CHECK-NEXT:    ret
433  %v = load <32 x i8>, ptr %x
434  %e = sext <32 x i8> %v to <32 x i16>
435  %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
436  ret i16 %red
437}
438
439define i16 @vwreduce_uadd_v32i16(ptr %x) {
440; CHECK-LABEL: vwreduce_uadd_v32i16:
441; CHECK:       # %bb.0:
442; CHECK-NEXT:    li a1, 32
443; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
444; CHECK-NEXT:    vle8.v v8, (a0)
445; CHECK-NEXT:    vmv.s.x v10, zero
446; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
447; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
448; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
449; CHECK-NEXT:    vmv.x.s a0, v8
450; CHECK-NEXT:    ret
451  %v = load <32 x i8>, ptr %x
452  %e = zext <32 x i8> %v to <32 x i16>
453  %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
454  ret i16 %red
455}
456
457declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
458
459define i16 @vreduce_add_v64i16(ptr %x) {
460; CHECK-LABEL: vreduce_add_v64i16:
461; CHECK:       # %bb.0:
462; CHECK-NEXT:    li a1, 64
463; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
464; CHECK-NEXT:    vle16.v v8, (a0)
465; CHECK-NEXT:    vmv.s.x v16, zero
466; CHECK-NEXT:    vredsum.vs v8, v8, v16
467; CHECK-NEXT:    vmv.x.s a0, v8
468; CHECK-NEXT:    ret
469  %v = load <64 x i16>, ptr %x
470  %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %v)
471  ret i16 %red
472}
473
474define i16 @vwreduce_add_v64i16(ptr %x) {
475; CHECK-LABEL: vwreduce_add_v64i16:
476; CHECK:       # %bb.0:
477; CHECK-NEXT:    li a1, 64
478; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
479; CHECK-NEXT:    vle8.v v8, (a0)
480; CHECK-NEXT:    vmv.s.x v12, zero
481; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
482; CHECK-NEXT:    vwredsum.vs v8, v8, v12
483; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
484; CHECK-NEXT:    vmv.x.s a0, v8
485; CHECK-NEXT:    ret
486  %v = load <64 x i8>, ptr %x
487  %e = sext <64 x i8> %v to <64 x i16>
488  %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
489  ret i16 %red
490}
491
492define i16 @vwreduce_uadd_v64i16(ptr %x) {
493; CHECK-LABEL: vwreduce_uadd_v64i16:
494; CHECK:       # %bb.0:
495; CHECK-NEXT:    li a1, 64
496; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
497; CHECK-NEXT:    vle8.v v8, (a0)
498; CHECK-NEXT:    vmv.s.x v12, zero
499; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
500; CHECK-NEXT:    vwredsumu.vs v8, v8, v12
501; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
502; CHECK-NEXT:    vmv.x.s a0, v8
503; CHECK-NEXT:    ret
504  %v = load <64 x i8>, ptr %x
505  %e = zext <64 x i8> %v to <64 x i16>
506  %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
507  ret i16 %red
508}
509
510declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
511
512define i16 @vreduce_add_v128i16(ptr %x) {
513; CHECK-LABEL: vreduce_add_v128i16:
514; CHECK:       # %bb.0:
515; CHECK-NEXT:    li a1, 64
516; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
517; CHECK-NEXT:    vle16.v v8, (a0)
518; CHECK-NEXT:    addi a0, a0, 128
519; CHECK-NEXT:    vle16.v v16, (a0)
520; CHECK-NEXT:    vadd.vv v8, v8, v16
521; CHECK-NEXT:    vmv.s.x v16, zero
522; CHECK-NEXT:    vredsum.vs v8, v8, v16
523; CHECK-NEXT:    vmv.x.s a0, v8
524; CHECK-NEXT:    ret
525  %v = load <128 x i16>, ptr %x
526  %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %v)
527  ret i16 %red
528}
529
530define i16 @vwreduce_add_v128i16(ptr %x) {
531; CHECK-LABEL: vwreduce_add_v128i16:
532; CHECK:       # %bb.0:
533; CHECK-NEXT:    li a1, 128
534; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
535; CHECK-NEXT:    vle8.v v8, (a0)
536; CHECK-NEXT:    li a0, 64
537; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
538; CHECK-NEXT:    vslidedown.vx v16, v8, a0
539; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
540; CHECK-NEXT:    vwadd.vv v24, v8, v16
541; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
542; CHECK-NEXT:    vmv.s.x v8, zero
543; CHECK-NEXT:    vredsum.vs v8, v24, v8
544; CHECK-NEXT:    vmv.x.s a0, v8
545; CHECK-NEXT:    ret
546  %v = load <128 x i8>, ptr %x
547  %e = sext <128 x i8> %v to <128 x i16>
548  %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
549  ret i16 %red
550}
551
552define i16 @vwreduce_uadd_v128i16(ptr %x) {
553; CHECK-LABEL: vwreduce_uadd_v128i16:
554; CHECK:       # %bb.0:
555; CHECK-NEXT:    li a1, 128
556; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
557; CHECK-NEXT:    vle8.v v8, (a0)
558; CHECK-NEXT:    li a0, 64
559; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
560; CHECK-NEXT:    vslidedown.vx v16, v8, a0
561; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
562; CHECK-NEXT:    vwaddu.vv v24, v8, v16
563; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
564; CHECK-NEXT:    vmv.s.x v8, zero
565; CHECK-NEXT:    vredsum.vs v8, v24, v8
566; CHECK-NEXT:    vmv.x.s a0, v8
567; CHECK-NEXT:    ret
568  %v = load <128 x i8>, ptr %x
569  %e = zext <128 x i8> %v to <128 x i16>
570  %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
571  ret i16 %red
572}
573
574declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
575
576define i32 @vreduce_add_v1i32(<1 x i32> %v) {
577; CHECK-LABEL: vreduce_add_v1i32:
578; CHECK:       # %bb.0:
579; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
580; CHECK-NEXT:    vmv.x.s a0, v8
581; CHECK-NEXT:    ret
582  %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %v)
583  ret i32 %red
584}
585
586define i32 @vwreduce_add_v1i32(<1 x i16> %v) {
587; CHECK-LABEL: vwreduce_add_v1i32:
588; CHECK:       # %bb.0:
589; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
590; CHECK-NEXT:    vsext.vf2 v9, v8
591; CHECK-NEXT:    vmv.x.s a0, v9
592; CHECK-NEXT:    ret
593  %e = sext <1 x i16> %v to <1 x i32>
594  %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
595  ret i32 %red
596}
597
598define i32 @vwreduce_uadd_v1i32(<1 x i16> %v) {
599; CHECK-LABEL: vwreduce_uadd_v1i32:
600; CHECK:       # %bb.0:
601; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
602; CHECK-NEXT:    vzext.vf2 v9, v8
603; CHECK-NEXT:    vmv.x.s a0, v9
604; CHECK-NEXT:    ret
605  %e = zext <1 x i16> %v to <1 x i32>
606  %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
607  ret i32 %red
608}
609
610declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
611
612define i32 @vreduce_add_v2i32(ptr %x) {
613; CHECK-LABEL: vreduce_add_v2i32:
614; CHECK:       # %bb.0:
615; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
616; CHECK-NEXT:    vle32.v v8, (a0)
617; CHECK-NEXT:    vmv.s.x v9, zero
618; CHECK-NEXT:    vredsum.vs v8, v8, v9
619; CHECK-NEXT:    vmv.x.s a0, v8
620; CHECK-NEXT:    ret
621  %v = load <2 x i32>, ptr %x
622  %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v)
623  ret i32 %red
624}
625
626define i32 @vwreduce_add_v2i32(ptr %x) {
627; CHECK-LABEL: vwreduce_add_v2i32:
628; CHECK:       # %bb.0:
629; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
630; CHECK-NEXT:    vle16.v v8, (a0)
631; CHECK-NEXT:    vmv.s.x v9, zero
632; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
633; CHECK-NEXT:    vwredsum.vs v8, v8, v9
634; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
635; CHECK-NEXT:    vmv.x.s a0, v8
636; CHECK-NEXT:    ret
637  %v = load <2 x i16>, ptr %x
638  %e = sext <2 x i16> %v to <2 x i32>
639  %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
640  ret i32 %red
641}
642
643define i32 @vwreduce_uadd_v2i32(ptr %x) {
644; CHECK-LABEL: vwreduce_uadd_v2i32:
645; CHECK:       # %bb.0:
646; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
647; CHECK-NEXT:    vle16.v v8, (a0)
648; CHECK-NEXT:    vmv.s.x v9, zero
649; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
650; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
651; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
652; CHECK-NEXT:    vmv.x.s a0, v8
653; CHECK-NEXT:    ret
654  %v = load <2 x i16>, ptr %x
655  %e = zext <2 x i16> %v to <2 x i32>
656  %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
657  ret i32 %red
658}
659
660declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
661
662define i32 @vreduce_add_v4i32(ptr %x) {
663; CHECK-LABEL: vreduce_add_v4i32:
664; CHECK:       # %bb.0:
665; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
666; CHECK-NEXT:    vle32.v v8, (a0)
667; CHECK-NEXT:    vmv.s.x v9, zero
668; CHECK-NEXT:    vredsum.vs v8, v8, v9
669; CHECK-NEXT:    vmv.x.s a0, v8
670; CHECK-NEXT:    ret
671  %v = load <4 x i32>, ptr %x
672  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
673  ret i32 %red
674}
675
676define i32 @vwreduce_add_v4i32(ptr %x) {
677; CHECK-LABEL: vwreduce_add_v4i32:
678; CHECK:       # %bb.0:
679; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
680; CHECK-NEXT:    vle16.v v8, (a0)
681; CHECK-NEXT:    vmv.s.x v9, zero
682; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
683; CHECK-NEXT:    vwredsum.vs v8, v8, v9
684; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
685; CHECK-NEXT:    vmv.x.s a0, v8
686; CHECK-NEXT:    ret
687  %v = load <4 x i16>, ptr %x
688  %e = sext <4 x i16> %v to <4 x i32>
689  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
690  ret i32 %red
691}
692
693define i32 @vwreduce_uadd_v4i32(ptr %x) {
694; CHECK-LABEL: vwreduce_uadd_v4i32:
695; CHECK:       # %bb.0:
696; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
697; CHECK-NEXT:    vle16.v v8, (a0)
698; CHECK-NEXT:    vmv.s.x v9, zero
699; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
700; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
701; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
702; CHECK-NEXT:    vmv.x.s a0, v8
703; CHECK-NEXT:    ret
704  %v = load <4 x i16>, ptr %x
705  %e = zext <4 x i16> %v to <4 x i32>
706  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
707  ret i32 %red
708}
709
710declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
711
712define i32 @vreduce_add_v8i32(ptr %x) {
713; CHECK-LABEL: vreduce_add_v8i32:
714; CHECK:       # %bb.0:
715; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
716; CHECK-NEXT:    vle32.v v8, (a0)
717; CHECK-NEXT:    vmv.s.x v10, zero
718; CHECK-NEXT:    vredsum.vs v8, v8, v10
719; CHECK-NEXT:    vmv.x.s a0, v8
720; CHECK-NEXT:    ret
721  %v = load <8 x i32>, ptr %x
722  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
723  ret i32 %red
724}
725
726define i32 @vwreduce_add_v8i32(ptr %x) {
727; CHECK-LABEL: vwreduce_add_v8i32:
728; CHECK:       # %bb.0:
729; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
730; CHECK-NEXT:    vle16.v v8, (a0)
731; CHECK-NEXT:    vmv.s.x v9, zero
732; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
733; CHECK-NEXT:    vwredsum.vs v8, v8, v9
734; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
735; CHECK-NEXT:    vmv.x.s a0, v8
736; CHECK-NEXT:    ret
737  %v = load <8 x i16>, ptr %x
738  %e = sext <8 x i16> %v to <8 x i32>
739  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
740  ret i32 %red
741}
742
743define i32 @vwreduce_uadd_v8i32(ptr %x) {
744; CHECK-LABEL: vwreduce_uadd_v8i32:
745; CHECK:       # %bb.0:
746; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
747; CHECK-NEXT:    vle16.v v8, (a0)
748; CHECK-NEXT:    vmv.s.x v9, zero
749; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
750; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
751; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
752; CHECK-NEXT:    vmv.x.s a0, v8
753; CHECK-NEXT:    ret
754  %v = load <8 x i16>, ptr %x
755  %e = zext <8 x i16> %v to <8 x i32>
756  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
757  ret i32 %red
758}
759
760declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
761
762define i32 @vreduce_add_v16i32(ptr %x) {
763; CHECK-LABEL: vreduce_add_v16i32:
764; CHECK:       # %bb.0:
765; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
766; CHECK-NEXT:    vle32.v v8, (a0)
767; CHECK-NEXT:    vmv.s.x v12, zero
768; CHECK-NEXT:    vredsum.vs v8, v8, v12
769; CHECK-NEXT:    vmv.x.s a0, v8
770; CHECK-NEXT:    ret
771  %v = load <16 x i32>, ptr %x
772  %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v)
773  ret i32 %red
774}
775
776define i32 @vwreduce_add_v16i32(ptr %x) {
777; CHECK-LABEL: vwreduce_add_v16i32:
778; CHECK:       # %bb.0:
779; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
780; CHECK-NEXT:    vle16.v v8, (a0)
781; CHECK-NEXT:    vmv.s.x v10, zero
782; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
783; CHECK-NEXT:    vwredsum.vs v8, v8, v10
784; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
785; CHECK-NEXT:    vmv.x.s a0, v8
786; CHECK-NEXT:    ret
787  %v = load <16 x i16>, ptr %x
788  %e = sext <16 x i16> %v to <16 x i32>
789  %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
790  ret i32 %red
791}
792
793define i32 @vwreduce_uadd_v16i32(ptr %x) {
794; CHECK-LABEL: vwreduce_uadd_v16i32:
795; CHECK:       # %bb.0:
796; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
797; CHECK-NEXT:    vle16.v v8, (a0)
798; CHECK-NEXT:    vmv.s.x v10, zero
799; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
800; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
801; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
802; CHECK-NEXT:    vmv.x.s a0, v8
803; CHECK-NEXT:    ret
804  %v = load <16 x i16>, ptr %x
805  %e = zext <16 x i16> %v to <16 x i32>
806  %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
807  ret i32 %red
808}
809
810declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
811
812define i32 @vreduce_add_v32i32(ptr %x) {
813; CHECK-LABEL: vreduce_add_v32i32:
814; CHECK:       # %bb.0:
815; CHECK-NEXT:    li a1, 32
816; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
817; CHECK-NEXT:    vle32.v v8, (a0)
818; CHECK-NEXT:    vmv.s.x v16, zero
819; CHECK-NEXT:    vredsum.vs v8, v8, v16
820; CHECK-NEXT:    vmv.x.s a0, v8
821; CHECK-NEXT:    ret
822  %v = load <32 x i32>, ptr %x
823  %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %v)
824  ret i32 %red
825}
826
827define i32 @vwreduce_add_v32i32(ptr %x) {
828; CHECK-LABEL: vwreduce_add_v32i32:
829; CHECK:       # %bb.0:
830; CHECK-NEXT:    li a1, 32
831; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
832; CHECK-NEXT:    vle16.v v8, (a0)
833; CHECK-NEXT:    vmv.s.x v12, zero
834; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
835; CHECK-NEXT:    vwredsum.vs v8, v8, v12
836; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
837; CHECK-NEXT:    vmv.x.s a0, v8
838; CHECK-NEXT:    ret
839  %v = load <32 x i16>, ptr %x
840  %e = sext <32 x i16> %v to <32 x i32>
841  %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
842  ret i32 %red
843}
844
845define i32 @vwreduce_uadd_v32i32(ptr %x) {
846; CHECK-LABEL: vwreduce_uadd_v32i32:
847; CHECK:       # %bb.0:
848; CHECK-NEXT:    li a1, 32
849; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
850; CHECK-NEXT:    vle16.v v8, (a0)
851; CHECK-NEXT:    vmv.s.x v12, zero
852; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
853; CHECK-NEXT:    vwredsumu.vs v8, v8, v12
854; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
855; CHECK-NEXT:    vmv.x.s a0, v8
856; CHECK-NEXT:    ret
857  %v = load <32 x i16>, ptr %x
858  %e = zext <32 x i16> %v to <32 x i32>
859  %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
860  ret i32 %red
861}
862
863declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
864
865define i32 @vreduce_add_v64i32(ptr %x) {
866; CHECK-LABEL: vreduce_add_v64i32:
867; CHECK:       # %bb.0:
868; CHECK-NEXT:    li a1, 32
869; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
870; CHECK-NEXT:    vle32.v v8, (a0)
871; CHECK-NEXT:    addi a0, a0, 128
872; CHECK-NEXT:    vle32.v v16, (a0)
873; CHECK-NEXT:    vadd.vv v8, v8, v16
874; CHECK-NEXT:    vmv.s.x v16, zero
875; CHECK-NEXT:    vredsum.vs v8, v8, v16
876; CHECK-NEXT:    vmv.x.s a0, v8
877; CHECK-NEXT:    ret
878  %v = load <64 x i32>, ptr %x
879  %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %v)
880  ret i32 %red
881}
882
883define i32 @vwreduce_add_v64i32(ptr %x) {
884; CHECK-LABEL: vwreduce_add_v64i32:
885; CHECK:       # %bb.0:
886; CHECK-NEXT:    li a1, 64
887; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
888; CHECK-NEXT:    vle16.v v8, (a0)
889; CHECK-NEXT:    li a0, 32
890; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
891; CHECK-NEXT:    vslidedown.vx v16, v8, a0
892; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
893; CHECK-NEXT:    vwadd.vv v24, v8, v16
894; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
895; CHECK-NEXT:    vmv.s.x v8, zero
896; CHECK-NEXT:    vredsum.vs v8, v24, v8
897; CHECK-NEXT:    vmv.x.s a0, v8
898; CHECK-NEXT:    ret
899  %v = load <64 x i16>, ptr %x
900  %e = sext <64 x i16> %v to <64 x i32>
901  %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
902  ret i32 %red
903}
904
905define i32 @vwreduce_uadd_v64i32(ptr %x) {
906; CHECK-LABEL: vwreduce_uadd_v64i32:
907; CHECK:       # %bb.0:
908; CHECK-NEXT:    li a1, 64
909; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
910; CHECK-NEXT:    vle16.v v8, (a0)
911; CHECK-NEXT:    li a0, 32
912; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
913; CHECK-NEXT:    vslidedown.vx v16, v8, a0
914; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
915; CHECK-NEXT:    vwaddu.vv v24, v8, v16
916; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
917; CHECK-NEXT:    vmv.s.x v8, zero
918; CHECK-NEXT:    vredsum.vs v8, v24, v8
919; CHECK-NEXT:    vmv.x.s a0, v8
920; CHECK-NEXT:    ret
921  %v = load <64 x i16>, ptr %x
922  %e = zext <64 x i16> %v to <64 x i32>
923  %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
924  ret i32 %red
925}
926
927declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
928
929define i64 @vreduce_add_v1i64(<1 x i64> %v) {
930; RV32-LABEL: vreduce_add_v1i64:
931; RV32:       # %bb.0:
932; RV32-NEXT:    li a0, 32
933; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
934; RV32-NEXT:    vsrl.vx v9, v8, a0
935; RV32-NEXT:    vmv.x.s a1, v9
936; RV32-NEXT:    vmv.x.s a0, v8
937; RV32-NEXT:    ret
938;
939; RV64-LABEL: vreduce_add_v1i64:
940; RV64:       # %bb.0:
941; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
942; RV64-NEXT:    vmv.x.s a0, v8
943; RV64-NEXT:    ret
944  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %v)
945  ret i64 %red
946}
947
948define i64 @vwreduce_add_v1i64(<1 x i32> %v) {
949; RV32-LABEL: vwreduce_add_v1i64:
950; RV32:       # %bb.0:
951; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
952; RV32-NEXT:    vsext.vf2 v9, v8
953; RV32-NEXT:    li a0, 32
954; RV32-NEXT:    vsrl.vx v8, v9, a0
955; RV32-NEXT:    vmv.x.s a1, v8
956; RV32-NEXT:    vmv.x.s a0, v9
957; RV32-NEXT:    ret
958;
959; RV64-LABEL: vwreduce_add_v1i64:
960; RV64:       # %bb.0:
961; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
962; RV64-NEXT:    vsext.vf2 v9, v8
963; RV64-NEXT:    vmv.x.s a0, v9
964; RV64-NEXT:    ret
965  %e = sext <1 x i32> %v to <1 x i64>
966  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
967  ret i64 %red
968}
969
970define i64 @vwreduce_uadd_v1i64(<1 x i32> %v) {
971; RV32-LABEL: vwreduce_uadd_v1i64:
972; RV32:       # %bb.0:
973; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
974; RV32-NEXT:    vzext.vf2 v9, v8
975; RV32-NEXT:    li a0, 32
976; RV32-NEXT:    vsrl.vx v8, v9, a0
977; RV32-NEXT:    vmv.x.s a1, v8
978; RV32-NEXT:    vmv.x.s a0, v9
979; RV32-NEXT:    ret
980;
981; RV64-LABEL: vwreduce_uadd_v1i64:
982; RV64:       # %bb.0:
983; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
984; RV64-NEXT:    vzext.vf2 v9, v8
985; RV64-NEXT:    vmv.x.s a0, v9
986; RV64-NEXT:    ret
987  %e = zext <1 x i32> %v to <1 x i64>
988  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
989  ret i64 %red
990}
991
992declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
993
994define i64 @vreduce_add_v2i64(ptr %x) {
995; RV32-LABEL: vreduce_add_v2i64:
996; RV32:       # %bb.0:
997; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
998; RV32-NEXT:    vle64.v v8, (a0)
999; RV32-NEXT:    vmv.s.x v9, zero
1000; RV32-NEXT:    li a1, 32
1001; RV32-NEXT:    vredsum.vs v8, v8, v9
1002; RV32-NEXT:    vmv.x.s a0, v8
1003; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1004; RV32-NEXT:    vsrl.vx v8, v8, a1
1005; RV32-NEXT:    vmv.x.s a1, v8
1006; RV32-NEXT:    ret
1007;
1008; RV64-LABEL: vreduce_add_v2i64:
1009; RV64:       # %bb.0:
1010; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
1011; RV64-NEXT:    vle64.v v8, (a0)
1012; RV64-NEXT:    vmv.s.x v9, zero
1013; RV64-NEXT:    vredsum.vs v8, v8, v9
1014; RV64-NEXT:    vmv.x.s a0, v8
1015; RV64-NEXT:    ret
1016  %v = load <2 x i64>, ptr %x
1017  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v)
1018  ret i64 %red
1019}
1020
1021define i64 @vwreduce_add_v2i64(ptr %x) {
1022; RV32-LABEL: vwreduce_add_v2i64:
1023; RV32:       # %bb.0:
1024; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
1025; RV32-NEXT:    vle32.v v8, (a0)
1026; RV32-NEXT:    vmv.s.x v9, zero
1027; RV32-NEXT:    li a1, 32
1028; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
1029; RV32-NEXT:    vwredsum.vs v8, v8, v9
1030; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1031; RV32-NEXT:    vmv.x.s a0, v8
1032; RV32-NEXT:    vsrl.vx v8, v8, a1
1033; RV32-NEXT:    vmv.x.s a1, v8
1034; RV32-NEXT:    ret
1035;
1036; RV64-LABEL: vwreduce_add_v2i64:
1037; RV64:       # %bb.0:
1038; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
1039; RV64-NEXT:    vle32.v v8, (a0)
1040; RV64-NEXT:    vmv.s.x v9, zero
1041; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
1042; RV64-NEXT:    vwredsum.vs v8, v8, v9
1043; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
1044; RV64-NEXT:    vmv.x.s a0, v8
1045; RV64-NEXT:    ret
1046  %v = load <2 x i32>, ptr %x
1047  %e = sext <2 x i32> %v to <2 x i64>
1048  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
1049  ret i64 %red
1050}
1051
1052define i64 @vwreduce_uadd_v2i64(ptr %x) {
1053; RV32-LABEL: vwreduce_uadd_v2i64:
1054; RV32:       # %bb.0:
1055; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
1056; RV32-NEXT:    vle32.v v8, (a0)
1057; RV32-NEXT:    vmv.s.x v9, zero
1058; RV32-NEXT:    li a1, 32
1059; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
1060; RV32-NEXT:    vwredsumu.vs v8, v8, v9
1061; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1062; RV32-NEXT:    vmv.x.s a0, v8
1063; RV32-NEXT:    vsrl.vx v8, v8, a1
1064; RV32-NEXT:    vmv.x.s a1, v8
1065; RV32-NEXT:    ret
1066;
1067; RV64-LABEL: vwreduce_uadd_v2i64:
1068; RV64:       # %bb.0:
1069; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
1070; RV64-NEXT:    vle32.v v8, (a0)
1071; RV64-NEXT:    vmv.s.x v9, zero
1072; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
1073; RV64-NEXT:    vwredsumu.vs v8, v8, v9
1074; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
1075; RV64-NEXT:    vmv.x.s a0, v8
1076; RV64-NEXT:    ret
1077  %v = load <2 x i32>, ptr %x
1078  %e = zext <2 x i32> %v to <2 x i64>
1079  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
1080  ret i64 %red
1081}
1082
1083declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
1084
1085define i64 @vreduce_add_v4i64(ptr %x) {
1086; RV32-LABEL: vreduce_add_v4i64:
1087; RV32:       # %bb.0:
1088; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
1089; RV32-NEXT:    vle64.v v8, (a0)
1090; RV32-NEXT:    vmv.s.x v10, zero
1091; RV32-NEXT:    li a1, 32
1092; RV32-NEXT:    vredsum.vs v8, v8, v10
1093; RV32-NEXT:    vmv.x.s a0, v8
1094; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1095; RV32-NEXT:    vsrl.vx v8, v8, a1
1096; RV32-NEXT:    vmv.x.s a1, v8
1097; RV32-NEXT:    ret
1098;
1099; RV64-LABEL: vreduce_add_v4i64:
1100; RV64:       # %bb.0:
1101; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
1102; RV64-NEXT:    vle64.v v8, (a0)
1103; RV64-NEXT:    vmv.s.x v10, zero
1104; RV64-NEXT:    vredsum.vs v8, v8, v10
1105; RV64-NEXT:    vmv.x.s a0, v8
1106; RV64-NEXT:    ret
1107  %v = load <4 x i64>, ptr %x
1108  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
1109  ret i64 %red
1110}
1111
1112define i64 @vwreduce_add_v4i64(ptr %x) {
1113; RV32-LABEL: vwreduce_add_v4i64:
1114; RV32:       # %bb.0:
1115; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
1116; RV32-NEXT:    vle32.v v8, (a0)
1117; RV32-NEXT:    vmv.s.x v9, zero
1118; RV32-NEXT:    li a1, 32
1119; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
1120; RV32-NEXT:    vwredsum.vs v8, v8, v9
1121; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1122; RV32-NEXT:    vmv.x.s a0, v8
1123; RV32-NEXT:    vsrl.vx v8, v8, a1
1124; RV32-NEXT:    vmv.x.s a1, v8
1125; RV32-NEXT:    ret
1126;
1127; RV64-LABEL: vwreduce_add_v4i64:
1128; RV64:       # %bb.0:
1129; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
1130; RV64-NEXT:    vle32.v v8, (a0)
1131; RV64-NEXT:    vmv.s.x v9, zero
1132; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
1133; RV64-NEXT:    vwredsum.vs v8, v8, v9
1134; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
1135; RV64-NEXT:    vmv.x.s a0, v8
1136; RV64-NEXT:    ret
1137  %v = load <4 x i32>, ptr %x
1138  %e = sext <4 x i32> %v to <4 x i64>
1139  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
1140  ret i64 %red
1141}
1142
1143define i64 @vwreduce_uadd_v4i64(ptr %x) {
1144; RV32-LABEL: vwreduce_uadd_v4i64:
1145; RV32:       # %bb.0:
1146; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
1147; RV32-NEXT:    vle32.v v8, (a0)
1148; RV32-NEXT:    vmv.s.x v9, zero
1149; RV32-NEXT:    li a1, 32
1150; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
1151; RV32-NEXT:    vwredsumu.vs v8, v8, v9
1152; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1153; RV32-NEXT:    vmv.x.s a0, v8
1154; RV32-NEXT:    vsrl.vx v8, v8, a1
1155; RV32-NEXT:    vmv.x.s a1, v8
1156; RV32-NEXT:    ret
1157;
1158; RV64-LABEL: vwreduce_uadd_v4i64:
1159; RV64:       # %bb.0:
1160; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
1161; RV64-NEXT:    vle32.v v8, (a0)
1162; RV64-NEXT:    vmv.s.x v9, zero
1163; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
1164; RV64-NEXT:    vwredsumu.vs v8, v8, v9
1165; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
1166; RV64-NEXT:    vmv.x.s a0, v8
1167; RV64-NEXT:    ret
1168  %v = load <4 x i32>, ptr %x
1169  %e = zext <4 x i32> %v to <4 x i64>
1170  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
1171  ret i64 %red
1172}
1173
1174declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
1175
1176define i64 @vreduce_add_v8i64(ptr %x) {
1177; RV32-LABEL: vreduce_add_v8i64:
1178; RV32:       # %bb.0:
1179; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
1180; RV32-NEXT:    vle64.v v8, (a0)
1181; RV32-NEXT:    vmv.s.x v12, zero
1182; RV32-NEXT:    li a1, 32
1183; RV32-NEXT:    vredsum.vs v8, v8, v12
1184; RV32-NEXT:    vmv.x.s a0, v8
1185; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1186; RV32-NEXT:    vsrl.vx v8, v8, a1
1187; RV32-NEXT:    vmv.x.s a1, v8
1188; RV32-NEXT:    ret
1189;
1190; RV64-LABEL: vreduce_add_v8i64:
1191; RV64:       # %bb.0:
1192; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
1193; RV64-NEXT:    vle64.v v8, (a0)
1194; RV64-NEXT:    vmv.s.x v12, zero
1195; RV64-NEXT:    vredsum.vs v8, v8, v12
1196; RV64-NEXT:    vmv.x.s a0, v8
1197; RV64-NEXT:    ret
1198  %v = load <8 x i64>, ptr %x
1199  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %v)
1200  ret i64 %red
1201}
1202
1203define i64 @vwreduce_add_v8i64(ptr %x) {
1204; RV32-LABEL: vwreduce_add_v8i64:
1205; RV32:       # %bb.0:
1206; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
1207; RV32-NEXT:    vle32.v v8, (a0)
1208; RV32-NEXT:    vmv.s.x v10, zero
1209; RV32-NEXT:    li a1, 32
1210; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
1211; RV32-NEXT:    vwredsum.vs v8, v8, v10
1212; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1213; RV32-NEXT:    vmv.x.s a0, v8
1214; RV32-NEXT:    vsrl.vx v8, v8, a1
1215; RV32-NEXT:    vmv.x.s a1, v8
1216; RV32-NEXT:    ret
1217;
1218; RV64-LABEL: vwreduce_add_v8i64:
1219; RV64:       # %bb.0:
1220; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
1221; RV64-NEXT:    vle32.v v8, (a0)
1222; RV64-NEXT:    vmv.s.x v10, zero
1223; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
1224; RV64-NEXT:    vwredsum.vs v8, v8, v10
1225; RV64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
1226; RV64-NEXT:    vmv.x.s a0, v8
1227; RV64-NEXT:    ret
1228  %v = load <8 x i32>, ptr %x
1229  %e = sext <8 x i32> %v to <8 x i64>
1230  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
1231  ret i64 %red
1232}
1233
1234define i64 @vwreduce_uadd_v8i64(ptr %x) {
1235; RV32-LABEL: vwreduce_uadd_v8i64:
1236; RV32:       # %bb.0:
1237; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
1238; RV32-NEXT:    vle32.v v8, (a0)
1239; RV32-NEXT:    vmv.s.x v10, zero
1240; RV32-NEXT:    li a1, 32
1241; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
1242; RV32-NEXT:    vwredsumu.vs v8, v8, v10
1243; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1244; RV32-NEXT:    vmv.x.s a0, v8
1245; RV32-NEXT:    vsrl.vx v8, v8, a1
1246; RV32-NEXT:    vmv.x.s a1, v8
1247; RV32-NEXT:    ret
1248;
1249; RV64-LABEL: vwreduce_uadd_v8i64:
1250; RV64:       # %bb.0:
1251; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
1252; RV64-NEXT:    vle32.v v8, (a0)
1253; RV64-NEXT:    vmv.s.x v10, zero
1254; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
1255; RV64-NEXT:    vwredsumu.vs v8, v8, v10
1256; RV64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
1257; RV64-NEXT:    vmv.x.s a0, v8
1258; RV64-NEXT:    ret
1259  %v = load <8 x i32>, ptr %x
1260  %e = zext <8 x i32> %v to <8 x i64>
1261  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
1262  ret i64 %red
1263}
1264
1265declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
1266
1267define i64 @vreduce_add_v16i64(ptr %x) {
1268; RV32-LABEL: vreduce_add_v16i64:
1269; RV32:       # %bb.0:
1270; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1271; RV32-NEXT:    vle64.v v8, (a0)
1272; RV32-NEXT:    vmv.s.x v16, zero
1273; RV32-NEXT:    li a1, 32
1274; RV32-NEXT:    vredsum.vs v8, v8, v16
1275; RV32-NEXT:    vmv.x.s a0, v8
1276; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1277; RV32-NEXT:    vsrl.vx v8, v8, a1
1278; RV32-NEXT:    vmv.x.s a1, v8
1279; RV32-NEXT:    ret
1280;
1281; RV64-LABEL: vreduce_add_v16i64:
1282; RV64:       # %bb.0:
1283; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1284; RV64-NEXT:    vle64.v v8, (a0)
1285; RV64-NEXT:    vmv.s.x v16, zero
1286; RV64-NEXT:    vredsum.vs v8, v8, v16
1287; RV64-NEXT:    vmv.x.s a0, v8
1288; RV64-NEXT:    ret
1289  %v = load <16 x i64>, ptr %x
1290  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %v)
1291  ret i64 %red
1292}
1293
1294define i64 @vwreduce_add_v16i64(ptr %x) {
1295; RV32-LABEL: vwreduce_add_v16i64:
1296; RV32:       # %bb.0:
1297; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1298; RV32-NEXT:    vle32.v v8, (a0)
1299; RV32-NEXT:    vmv.s.x v12, zero
1300; RV32-NEXT:    li a1, 32
1301; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
1302; RV32-NEXT:    vwredsum.vs v8, v8, v12
1303; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1304; RV32-NEXT:    vmv.x.s a0, v8
1305; RV32-NEXT:    vsrl.vx v8, v8, a1
1306; RV32-NEXT:    vmv.x.s a1, v8
1307; RV32-NEXT:    ret
1308;
1309; RV64-LABEL: vwreduce_add_v16i64:
1310; RV64:       # %bb.0:
1311; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1312; RV64-NEXT:    vle32.v v8, (a0)
1313; RV64-NEXT:    vmv.s.x v12, zero
1314; RV64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
1315; RV64-NEXT:    vwredsum.vs v8, v8, v12
1316; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1317; RV64-NEXT:    vmv.x.s a0, v8
1318; RV64-NEXT:    ret
1319  %v = load <16 x i32>, ptr %x
1320  %e = sext <16 x i32> %v to <16 x i64>
1321  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
1322  ret i64 %red
1323}
1324
1325define i64 @vwreduce_uadd_v16i64(ptr %x) {
1326; RV32-LABEL: vwreduce_uadd_v16i64:
1327; RV32:       # %bb.0:
1328; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1329; RV32-NEXT:    vle32.v v8, (a0)
1330; RV32-NEXT:    vmv.s.x v12, zero
1331; RV32-NEXT:    li a1, 32
1332; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
1333; RV32-NEXT:    vwredsumu.vs v8, v8, v12
1334; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1335; RV32-NEXT:    vmv.x.s a0, v8
1336; RV32-NEXT:    vsrl.vx v8, v8, a1
1337; RV32-NEXT:    vmv.x.s a1, v8
1338; RV32-NEXT:    ret
1339;
1340; RV64-LABEL: vwreduce_uadd_v16i64:
1341; RV64:       # %bb.0:
1342; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1343; RV64-NEXT:    vle32.v v8, (a0)
1344; RV64-NEXT:    vmv.s.x v12, zero
1345; RV64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
1346; RV64-NEXT:    vwredsumu.vs v8, v8, v12
1347; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1348; RV64-NEXT:    vmv.x.s a0, v8
1349; RV64-NEXT:    ret
1350  %v = load <16 x i32>, ptr %x
1351  %e = zext <16 x i32> %v to <16 x i64>
1352  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
1353  ret i64 %red
1354}
1355
1356declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
1357
1358define i64 @vreduce_add_v32i64(ptr %x) {
1359; RV32-LABEL: vreduce_add_v32i64:
1360; RV32:       # %bb.0:
1361; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1362; RV32-NEXT:    vle64.v v8, (a0)
1363; RV32-NEXT:    addi a0, a0, 128
1364; RV32-NEXT:    vle64.v v16, (a0)
1365; RV32-NEXT:    vadd.vv v8, v8, v16
1366; RV32-NEXT:    vmv.s.x v16, zero
1367; RV32-NEXT:    li a1, 32
1368; RV32-NEXT:    vredsum.vs v8, v8, v16
1369; RV32-NEXT:    vmv.x.s a0, v8
1370; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1371; RV32-NEXT:    vsrl.vx v8, v8, a1
1372; RV32-NEXT:    vmv.x.s a1, v8
1373; RV32-NEXT:    ret
1374;
1375; RV64-LABEL: vreduce_add_v32i64:
1376; RV64:       # %bb.0:
1377; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1378; RV64-NEXT:    vle64.v v8, (a0)
1379; RV64-NEXT:    addi a0, a0, 128
1380; RV64-NEXT:    vle64.v v16, (a0)
1381; RV64-NEXT:    vadd.vv v8, v8, v16
1382; RV64-NEXT:    vmv.s.x v16, zero
1383; RV64-NEXT:    vredsum.vs v8, v8, v16
1384; RV64-NEXT:    vmv.x.s a0, v8
1385; RV64-NEXT:    ret
1386  %v = load <32 x i64>, ptr %x
1387  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %v)
1388  ret i64 %red
1389}
1390
1391define i64 @vwreduce_add_v32i64(ptr %x) {
1392; RV32-LABEL: vwreduce_add_v32i64:
1393; RV32:       # %bb.0:
1394; RV32-NEXT:    li a1, 32
1395; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
1396; RV32-NEXT:    vle32.v v8, (a0)
1397; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1398; RV32-NEXT:    vslidedown.vi v16, v8, 16
1399; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1400; RV32-NEXT:    vwadd.vv v24, v8, v16
1401; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1402; RV32-NEXT:    vmv.s.x v8, zero
1403; RV32-NEXT:    vredsum.vs v8, v24, v8
1404; RV32-NEXT:    vmv.x.s a0, v8
1405; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1406; RV32-NEXT:    vsrl.vx v8, v8, a1
1407; RV32-NEXT:    vmv.x.s a1, v8
1408; RV32-NEXT:    ret
1409;
1410; RV64-LABEL: vwreduce_add_v32i64:
1411; RV64:       # %bb.0:
1412; RV64-NEXT:    li a1, 32
1413; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
1414; RV64-NEXT:    vle32.v v8, (a0)
1415; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1416; RV64-NEXT:    vslidedown.vi v16, v8, 16
1417; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1418; RV64-NEXT:    vwadd.vv v24, v8, v16
1419; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1420; RV64-NEXT:    vmv.s.x v8, zero
1421; RV64-NEXT:    vredsum.vs v8, v24, v8
1422; RV64-NEXT:    vmv.x.s a0, v8
1423; RV64-NEXT:    ret
1424  %v = load <32 x i32>, ptr %x
1425  %e = sext <32 x i32> %v to <32 x i64>
1426  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
1427  ret i64 %red
1428}
1429
1430define i64 @vwreduce_uadd_v32i64(ptr %x) {
1431; RV32-LABEL: vwreduce_uadd_v32i64:
1432; RV32:       # %bb.0:
1433; RV32-NEXT:    li a1, 32
1434; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
1435; RV32-NEXT:    vle32.v v8, (a0)
1436; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1437; RV32-NEXT:    vslidedown.vi v16, v8, 16
1438; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1439; RV32-NEXT:    vwaddu.vv v24, v8, v16
1440; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1441; RV32-NEXT:    vmv.s.x v8, zero
1442; RV32-NEXT:    vredsum.vs v8, v24, v8
1443; RV32-NEXT:    vmv.x.s a0, v8
1444; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1445; RV32-NEXT:    vsrl.vx v8, v8, a1
1446; RV32-NEXT:    vmv.x.s a1, v8
1447; RV32-NEXT:    ret
1448;
1449; RV64-LABEL: vwreduce_uadd_v32i64:
1450; RV64:       # %bb.0:
1451; RV64-NEXT:    li a1, 32
1452; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
1453; RV64-NEXT:    vle32.v v8, (a0)
1454; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1455; RV64-NEXT:    vslidedown.vi v16, v8, 16
1456; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1457; RV64-NEXT:    vwaddu.vv v24, v8, v16
1458; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1459; RV64-NEXT:    vmv.s.x v8, zero
1460; RV64-NEXT:    vredsum.vs v8, v24, v8
1461; RV64-NEXT:    vmv.x.s a0, v8
1462; RV64-NEXT:    ret
1463  %v = load <32 x i32>, ptr %x
1464  %e = zext <32 x i32> %v to <32 x i64>
1465  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
1466  ret i64 %red
1467}
1468
1469declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
1470
1471define i64 @vreduce_add_v64i64(ptr %x) nounwind {
1472; RV32-LABEL: vreduce_add_v64i64:
1473; RV32:       # %bb.0:
1474; RV32-NEXT:    addi a1, a0, 384
1475; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1476; RV32-NEXT:    vle64.v v24, (a1)
1477; RV32-NEXT:    addi a1, a0, 128
1478; RV32-NEXT:    vle64.v v0, (a1)
1479; RV32-NEXT:    vle64.v v8, (a0)
1480; RV32-NEXT:    addi a0, a0, 256
1481; RV32-NEXT:    vle64.v v16, (a0)
1482; RV32-NEXT:    vadd.vv v24, v0, v24
1483; RV32-NEXT:    vmv.s.x v7, zero
1484; RV32-NEXT:    li a1, 32
1485; RV32-NEXT:    vadd.vv v8, v8, v16
1486; RV32-NEXT:    vadd.vv v8, v8, v24
1487; RV32-NEXT:    vredsum.vs v8, v8, v7
1488; RV32-NEXT:    vmv.x.s a0, v8
1489; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1490; RV32-NEXT:    vsrl.vx v8, v8, a1
1491; RV32-NEXT:    vmv.x.s a1, v8
1492; RV32-NEXT:    ret
1493;
1494; RV64-LABEL: vreduce_add_v64i64:
1495; RV64:       # %bb.0:
1496; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
1497; RV64-NEXT:    vle64.v v8, (a0)
1498; RV64-NEXT:    addi a1, a0, 384
1499; RV64-NEXT:    vle64.v v16, (a1)
1500; RV64-NEXT:    addi a1, a0, 256
1501; RV64-NEXT:    addi a0, a0, 128
1502; RV64-NEXT:    vle64.v v24, (a0)
1503; RV64-NEXT:    vle64.v v0, (a1)
1504; RV64-NEXT:    vadd.vv v16, v24, v16
1505; RV64-NEXT:    vadd.vv v8, v8, v0
1506; RV64-NEXT:    vadd.vv v8, v8, v16
1507; RV64-NEXT:    vmv.s.x v16, zero
1508; RV64-NEXT:    vredsum.vs v8, v8, v16
1509; RV64-NEXT:    vmv.x.s a0, v8
1510; RV64-NEXT:    ret
1511  %v = load <64 x i64>, ptr %x
1512  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %v)
1513  ret i64 %red
1514}
1515
1516define i64 @vwreduce_add_v64i64(ptr %x) {
1517; RV32-LABEL: vwreduce_add_v64i64:
1518; RV32:       # %bb.0:
1519; RV32-NEXT:    addi a1, a0, 128
1520; RV32-NEXT:    li a2, 32
1521; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
1522; RV32-NEXT:    vle32.v v16, (a0)
1523; RV32-NEXT:    vle32.v v8, (a1)
1524; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1525; RV32-NEXT:    vslidedown.vi v0, v16, 16
1526; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1527; RV32-NEXT:    vwadd.vv v24, v16, v8
1528; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1529; RV32-NEXT:    vslidedown.vi v8, v8, 16
1530; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1531; RV32-NEXT:    vwadd.vv v16, v0, v8
1532; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1533; RV32-NEXT:    vadd.vv v8, v24, v16
1534; RV32-NEXT:    vmv.s.x v16, zero
1535; RV32-NEXT:    vredsum.vs v8, v8, v16
1536; RV32-NEXT:    vmv.x.s a0, v8
1537; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1538; RV32-NEXT:    vsrl.vx v8, v8, a2
1539; RV32-NEXT:    vmv.x.s a1, v8
1540; RV32-NEXT:    ret
1541;
1542; RV64-LABEL: vwreduce_add_v64i64:
1543; RV64:       # %bb.0:
1544; RV64-NEXT:    addi sp, sp, -16
1545; RV64-NEXT:    .cfi_def_cfa_offset 16
1546; RV64-NEXT:    csrr a1, vlenb
1547; RV64-NEXT:    slli a1, a1, 3
1548; RV64-NEXT:    sub sp, sp, a1
1549; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
1550; RV64-NEXT:    addi a1, a0, 128
1551; RV64-NEXT:    li a2, 32
1552; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
1553; RV64-NEXT:    vle32.v v8, (a0)
1554; RV64-NEXT:    vle32.v v16, (a1)
1555; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1556; RV64-NEXT:    vslidedown.vi v24, v8, 16
1557; RV64-NEXT:    addi a0, sp, 16
1558; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
1559; RV64-NEXT:    vslidedown.vi v0, v16, 16
1560; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1561; RV64-NEXT:    vwadd.vv v24, v8, v16
1562; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1563; RV64-NEXT:    vwadd.vv v8, v16, v0
1564; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1565; RV64-NEXT:    vadd.vv v8, v24, v8
1566; RV64-NEXT:    vmv.s.x v16, zero
1567; RV64-NEXT:    vredsum.vs v8, v8, v16
1568; RV64-NEXT:    vmv.x.s a0, v8
1569; RV64-NEXT:    csrr a1, vlenb
1570; RV64-NEXT:    slli a1, a1, 3
1571; RV64-NEXT:    add sp, sp, a1
1572; RV64-NEXT:    .cfi_def_cfa sp, 16
1573; RV64-NEXT:    addi sp, sp, 16
1574; RV64-NEXT:    .cfi_def_cfa_offset 0
1575; RV64-NEXT:    ret
1576  %v = load <64 x i32>, ptr %x
1577  %e = sext <64 x i32> %v to <64 x i64>
1578  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
1579  ret i64 %red
1580}
1581
1582define i64 @vwreduce_uadd_v64i64(ptr %x) {
1583; RV32-LABEL: vwreduce_uadd_v64i64:
1584; RV32:       # %bb.0:
1585; RV32-NEXT:    addi a1, a0, 128
1586; RV32-NEXT:    li a2, 32
1587; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
1588; RV32-NEXT:    vle32.v v16, (a0)
1589; RV32-NEXT:    vle32.v v8, (a1)
1590; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1591; RV32-NEXT:    vslidedown.vi v0, v16, 16
1592; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1593; RV32-NEXT:    vwaddu.vv v24, v16, v8
1594; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1595; RV32-NEXT:    vslidedown.vi v8, v8, 16
1596; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1597; RV32-NEXT:    vwaddu.vv v16, v0, v8
1598; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1599; RV32-NEXT:    vadd.vv v8, v24, v16
1600; RV32-NEXT:    vmv.s.x v16, zero
1601; RV32-NEXT:    vredsum.vs v8, v8, v16
1602; RV32-NEXT:    vmv.x.s a0, v8
1603; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1604; RV32-NEXT:    vsrl.vx v8, v8, a2
1605; RV32-NEXT:    vmv.x.s a1, v8
1606; RV32-NEXT:    ret
1607;
1608; RV64-LABEL: vwreduce_uadd_v64i64:
1609; RV64:       # %bb.0:
1610; RV64-NEXT:    addi sp, sp, -16
1611; RV64-NEXT:    .cfi_def_cfa_offset 16
1612; RV64-NEXT:    csrr a1, vlenb
1613; RV64-NEXT:    slli a1, a1, 3
1614; RV64-NEXT:    sub sp, sp, a1
1615; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
1616; RV64-NEXT:    addi a1, a0, 128
1617; RV64-NEXT:    li a2, 32
1618; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
1619; RV64-NEXT:    vle32.v v8, (a0)
1620; RV64-NEXT:    vle32.v v16, (a1)
1621; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
1622; RV64-NEXT:    vslidedown.vi v24, v8, 16
1623; RV64-NEXT:    addi a0, sp, 16
1624; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
1625; RV64-NEXT:    vslidedown.vi v0, v16, 16
1626; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1627; RV64-NEXT:    vwaddu.vv v24, v8, v16
1628; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1629; RV64-NEXT:    vwaddu.vv v8, v16, v0
1630; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
1631; RV64-NEXT:    vadd.vv v8, v24, v8
1632; RV64-NEXT:    vmv.s.x v16, zero
1633; RV64-NEXT:    vredsum.vs v8, v8, v16
1634; RV64-NEXT:    vmv.x.s a0, v8
1635; RV64-NEXT:    csrr a1, vlenb
1636; RV64-NEXT:    slli a1, a1, 3
1637; RV64-NEXT:    add sp, sp, a1
1638; RV64-NEXT:    .cfi_def_cfa sp, 16
1639; RV64-NEXT:    addi sp, sp, 16
1640; RV64-NEXT:    .cfi_def_cfa_offset 0
1641; RV64-NEXT:    ret
1642  %v = load <64 x i32>, ptr %x
1643  %e = zext <64 x i32> %v to <64 x i64>
1644  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
1645  ret i64 %red
1646}
1647
1648declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)
1649
1650define i8 @vreduce_and_v1i8(<1 x i8> %v) {
1651; CHECK-LABEL: vreduce_and_v1i8:
1652; CHECK:       # %bb.0:
1653; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
1654; CHECK-NEXT:    vmv.x.s a0, v8
1655; CHECK-NEXT:    ret
1656  %red = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %v)
1657  ret i8 %red
1658}
1659
1660declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>)
1661
1662define i8 @vreduce_and_v2i8(ptr %x) {
1663; CHECK-LABEL: vreduce_and_v2i8:
1664; CHECK:       # %bb.0:
1665; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
1666; CHECK-NEXT:    vle8.v v8, (a0)
1667; CHECK-NEXT:    vredand.vs v8, v8, v8
1668; CHECK-NEXT:    vmv.x.s a0, v8
1669; CHECK-NEXT:    ret
1670  %v = load <2 x i8>, ptr %x
1671  %red = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v)
1672  ret i8 %red
1673}
1674
1675declare i8 @llvm.vector.reduce.and.v3i8(<3 x i8>)
1676
1677define i8 @vreduce_and_v3i8(ptr %x) {
1678; CHECK-LABEL: vreduce_and_v3i8:
1679; CHECK:       # %bb.0:
1680; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
1681; CHECK-NEXT:    vle8.v v8, (a0)
1682; CHECK-NEXT:    li a0, -1
1683; CHECK-NEXT:    vmv.s.x v9, a0
1684; CHECK-NEXT:    vredand.vs v8, v8, v9
1685; CHECK-NEXT:    vmv.x.s a0, v8
1686; CHECK-NEXT:    ret
1687  %v = load <3 x i8>, ptr %x
1688  %red = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %v)
1689  ret i8 %red
1690}
1691
1693declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
1694
1695define i8 @vreduce_and_v4i8(ptr %x) {
1696; CHECK-LABEL: vreduce_and_v4i8:
1697; CHECK:       # %bb.0:
1698; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
1699; CHECK-NEXT:    vle8.v v8, (a0)
1700; CHECK-NEXT:    vredand.vs v8, v8, v8
1701; CHECK-NEXT:    vmv.x.s a0, v8
1702; CHECK-NEXT:    ret
1703  %v = load <4 x i8>, ptr %x
1704  %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
1705  ret i8 %red
1706}
1707
1708declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
1709
1710define i8 @vreduce_and_v8i8(ptr %x) {
1711; CHECK-LABEL: vreduce_and_v8i8:
1712; CHECK:       # %bb.0:
1713; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
1714; CHECK-NEXT:    vle8.v v8, (a0)
1715; CHECK-NEXT:    vredand.vs v8, v8, v8
1716; CHECK-NEXT:    vmv.x.s a0, v8
1717; CHECK-NEXT:    ret
1718  %v = load <8 x i8>, ptr %x
1719  %red = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v)
1720  ret i8 %red
1721}
1722
1723declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
1724
1725define i8 @vreduce_and_v16i8(ptr %x) {
1726; CHECK-LABEL: vreduce_and_v16i8:
1727; CHECK:       # %bb.0:
1728; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
1729; CHECK-NEXT:    vle8.v v8, (a0)
1730; CHECK-NEXT:    vredand.vs v8, v8, v8
1731; CHECK-NEXT:    vmv.x.s a0, v8
1732; CHECK-NEXT:    ret
1733  %v = load <16 x i8>, ptr %x
1734  %red = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %v)
1735  ret i8 %red
1736}
1737
1738declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
1739
1740define i8 @vreduce_and_v32i8(ptr %x) {
1741; CHECK-LABEL: vreduce_and_v32i8:
1742; CHECK:       # %bb.0:
1743; CHECK-NEXT:    li a1, 32
1744; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
1745; CHECK-NEXT:    vle8.v v8, (a0)
1746; CHECK-NEXT:    vredand.vs v8, v8, v8
1747; CHECK-NEXT:    vmv.x.s a0, v8
1748; CHECK-NEXT:    ret
1749  %v = load <32 x i8>, ptr %x
1750  %red = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v)
1751  ret i8 %red
1752}
1753
1754declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
1755
1756define i8 @vreduce_and_v64i8(ptr %x) {
1757; CHECK-LABEL: vreduce_and_v64i8:
1758; CHECK:       # %bb.0:
1759; CHECK-NEXT:    li a1, 64
1760; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
1761; CHECK-NEXT:    vle8.v v8, (a0)
1762; CHECK-NEXT:    vredand.vs v8, v8, v8
1763; CHECK-NEXT:    vmv.x.s a0, v8
1764; CHECK-NEXT:    ret
1765  %v = load <64 x i8>, ptr %x
1766  %red = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %v)
1767  ret i8 %red
1768}
1769
1770declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
1771
1772define i8 @vreduce_and_v128i8(ptr %x) {
1773; CHECK-LABEL: vreduce_and_v128i8:
1774; CHECK:       # %bb.0:
1775; CHECK-NEXT:    li a1, 128
1776; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
1777; CHECK-NEXT:    vle8.v v8, (a0)
1778; CHECK-NEXT:    vredand.vs v8, v8, v8
1779; CHECK-NEXT:    vmv.x.s a0, v8
1780; CHECK-NEXT:    ret
1781  %v = load <128 x i8>, ptr %x
1782  %red = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %v)
1783  ret i8 %red
1784}
1785
1786declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)
1787
1788define i8 @vreduce_and_v256i8(ptr %x) {
1789; CHECK-LABEL: vreduce_and_v256i8:
1790; CHECK:       # %bb.0:
1791; CHECK-NEXT:    li a1, 128
1792; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
1793; CHECK-NEXT:    vle8.v v8, (a0)
1794; CHECK-NEXT:    addi a0, a0, 128
1795; CHECK-NEXT:    vle8.v v16, (a0)
1796; CHECK-NEXT:    vand.vv v8, v8, v16
1797; CHECK-NEXT:    vredand.vs v8, v8, v8
1798; CHECK-NEXT:    vmv.x.s a0, v8
1799; CHECK-NEXT:    ret
1800  %v = load <256 x i8>, ptr %x
1801  %red = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %v)
1802  ret i8 %red
1803}
1804
1805declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16>)
1806
1807define i16 @vreduce_and_v1i16(<1 x i16> %v) {
1808; CHECK-LABEL: vreduce_and_v1i16:
1809; CHECK:       # %bb.0:
1810; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
1811; CHECK-NEXT:    vmv.x.s a0, v8
1812; CHECK-NEXT:    ret
1813  %red = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %v)
1814  ret i16 %red
1815}
1816
1817declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>)
1818
1819define i16 @vreduce_and_v2i16(ptr %x) {
1820; CHECK-LABEL: vreduce_and_v2i16:
1821; CHECK:       # %bb.0:
1822; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
1823; CHECK-NEXT:    vle16.v v8, (a0)
1824; CHECK-NEXT:    vredand.vs v8, v8, v8
1825; CHECK-NEXT:    vmv.x.s a0, v8
1826; CHECK-NEXT:    ret
1827  %v = load <2 x i16>, ptr %x
1828  %red = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v)
1829  ret i16 %red
1830}
1831
1832declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
1833
1834define i16 @vreduce_and_v4i16(ptr %x) {
1835; CHECK-LABEL: vreduce_and_v4i16:
1836; CHECK:       # %bb.0:
1837; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
1838; CHECK-NEXT:    vle16.v v8, (a0)
1839; CHECK-NEXT:    vredand.vs v8, v8, v8
1840; CHECK-NEXT:    vmv.x.s a0, v8
1841; CHECK-NEXT:    ret
1842  %v = load <4 x i16>, ptr %x
1843  %red = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v)
1844  ret i16 %red
1845}
1846
1847declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
1848
1849define i16 @vreduce_and_v8i16(ptr %x) {
1850; CHECK-LABEL: vreduce_and_v8i16:
1851; CHECK:       # %bb.0:
1852; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
1853; CHECK-NEXT:    vle16.v v8, (a0)
1854; CHECK-NEXT:    vredand.vs v8, v8, v8
1855; CHECK-NEXT:    vmv.x.s a0, v8
1856; CHECK-NEXT:    ret
1857  %v = load <8 x i16>, ptr %x
1858  %red = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v)
1859  ret i16 %red
1860}
1861
1862declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
1863
1864define i16 @vreduce_and_v16i16(ptr %x) {
1865; CHECK-LABEL: vreduce_and_v16i16:
1866; CHECK:       # %bb.0:
1867; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
1868; CHECK-NEXT:    vle16.v v8, (a0)
1869; CHECK-NEXT:    vredand.vs v8, v8, v8
1870; CHECK-NEXT:    vmv.x.s a0, v8
1871; CHECK-NEXT:    ret
1872  %v = load <16 x i16>, ptr %x
1873  %red = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v)
1874  ret i16 %red
1875}
1876
1877declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
1878
1879define i16 @vreduce_and_v32i16(ptr %x) {
1880; CHECK-LABEL: vreduce_and_v32i16:
1881; CHECK:       # %bb.0:
1882; CHECK-NEXT:    li a1, 32
1883; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
1884; CHECK-NEXT:    vle16.v v8, (a0)
1885; CHECK-NEXT:    vredand.vs v8, v8, v8
1886; CHECK-NEXT:    vmv.x.s a0, v8
1887; CHECK-NEXT:    ret
1888  %v = load <32 x i16>, ptr %x
1889  %red = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %v)
1890  ret i16 %red
1891}
1892
1893declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
1894
1895define i16 @vreduce_and_v64i16(ptr %x) {
1896; CHECK-LABEL: vreduce_and_v64i16:
1897; CHECK:       # %bb.0:
1898; CHECK-NEXT:    li a1, 64
1899; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
1900; CHECK-NEXT:    vle16.v v8, (a0)
1901; CHECK-NEXT:    vredand.vs v8, v8, v8
1902; CHECK-NEXT:    vmv.x.s a0, v8
1903; CHECK-NEXT:    ret
1904  %v = load <64 x i16>, ptr %x
1905  %red = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %v)
1906  ret i16 %red
1907}
1908
1909declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)
1910
1911define i16 @vreduce_and_v128i16(ptr %x) {
1912; CHECK-LABEL: vreduce_and_v128i16:
1913; CHECK:       # %bb.0:
1914; CHECK-NEXT:    li a1, 64
1915; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
1916; CHECK-NEXT:    vle16.v v8, (a0)
1917; CHECK-NEXT:    addi a0, a0, 128
1918; CHECK-NEXT:    vle16.v v16, (a0)
1919; CHECK-NEXT:    vand.vv v8, v8, v16
1920; CHECK-NEXT:    vredand.vs v8, v8, v8
1921; CHECK-NEXT:    vmv.x.s a0, v8
1922; CHECK-NEXT:    ret
1923  %v = load <128 x i16>, ptr %x
1924  %red = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %v)
1925  ret i16 %red
1926}
1927
1928declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32>)
1929
1930define i32 @vreduce_and_v1i32(<1 x i32> %v) {
1931; CHECK-LABEL: vreduce_and_v1i32:
1932; CHECK:       # %bb.0:
1933; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
1934; CHECK-NEXT:    vmv.x.s a0, v8
1935; CHECK-NEXT:    ret
1936  %red = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %v)
1937  ret i32 %red
1938}
1939
1940declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
1941
1942define i32 @vreduce_and_v2i32(ptr %x) {
1943; CHECK-LABEL: vreduce_and_v2i32:
1944; CHECK:       # %bb.0:
1945; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
1946; CHECK-NEXT:    vle32.v v8, (a0)
1947; CHECK-NEXT:    vredand.vs v8, v8, v8
1948; CHECK-NEXT:    vmv.x.s a0, v8
1949; CHECK-NEXT:    ret
1950  %v = load <2 x i32>, ptr %x
1951  %red = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v)
1952  ret i32 %red
1953}
1954
1955declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
1956
1957define i32 @vreduce_and_v4i32(ptr %x) {
1958; CHECK-LABEL: vreduce_and_v4i32:
1959; CHECK:       # %bb.0:
1960; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
1961; CHECK-NEXT:    vle32.v v8, (a0)
1962; CHECK-NEXT:    vredand.vs v8, v8, v8
1963; CHECK-NEXT:    vmv.x.s a0, v8
1964; CHECK-NEXT:    ret
1965  %v = load <4 x i32>, ptr %x
1966  %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v)
1967  ret i32 %red
1968}
1969
1970declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
1971
1972define i32 @vreduce_and_v8i32(ptr %x) {
1973; CHECK-LABEL: vreduce_and_v8i32:
1974; CHECK:       # %bb.0:
1975; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
1976; CHECK-NEXT:    vle32.v v8, (a0)
1977; CHECK-NEXT:    vredand.vs v8, v8, v8
1978; CHECK-NEXT:    vmv.x.s a0, v8
1979; CHECK-NEXT:    ret
1980  %v = load <8 x i32>, ptr %x
1981  %red = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v)
1982  ret i32 %red
1983}
1984
1985declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
1986
1987define i32 @vreduce_and_v16i32(ptr %x) {
1988; CHECK-LABEL: vreduce_and_v16i32:
1989; CHECK:       # %bb.0:
1990; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1991; CHECK-NEXT:    vle32.v v8, (a0)
1992; CHECK-NEXT:    vredand.vs v8, v8, v8
1993; CHECK-NEXT:    vmv.x.s a0, v8
1994; CHECK-NEXT:    ret
1995  %v = load <16 x i32>, ptr %x
1996  %red = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v)
1997  ret i32 %red
1998}
1999
2000declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
2001
2002define i32 @vreduce_and_v32i32(ptr %x) {
2003; CHECK-LABEL: vreduce_and_v32i32:
2004; CHECK:       # %bb.0:
2005; CHECK-NEXT:    li a1, 32
2006; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
2007; CHECK-NEXT:    vle32.v v8, (a0)
2008; CHECK-NEXT:    vredand.vs v8, v8, v8
2009; CHECK-NEXT:    vmv.x.s a0, v8
2010; CHECK-NEXT:    ret
2011  %v = load <32 x i32>, ptr %x
2012  %red = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %v)
2013  ret i32 %red
2014}
2015
2016declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)
2017
2018define i32 @vreduce_and_v64i32(ptr %x) {
2019; CHECK-LABEL: vreduce_and_v64i32:
2020; CHECK:       # %bb.0:
2021; CHECK-NEXT:    li a1, 32
2022; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
2023; CHECK-NEXT:    vle32.v v8, (a0)
2024; CHECK-NEXT:    addi a0, a0, 128
2025; CHECK-NEXT:    vle32.v v16, (a0)
2026; CHECK-NEXT:    vand.vv v8, v8, v16
2027; CHECK-NEXT:    vredand.vs v8, v8, v8
2028; CHECK-NEXT:    vmv.x.s a0, v8
2029; CHECK-NEXT:    ret
2030  %v = load <64 x i32>, ptr %x
2031  %red = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %v)
2032  ret i32 %red
2033}
2034
2035declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
2036
2037define i64 @vreduce_and_v1i64(<1 x i64> %v) {
2038; RV32-LABEL: vreduce_and_v1i64:
2039; RV32:       # %bb.0:
2040; RV32-NEXT:    li a0, 32
2041; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2042; RV32-NEXT:    vsrl.vx v9, v8, a0
2043; RV32-NEXT:    vmv.x.s a1, v9
2044; RV32-NEXT:    vmv.x.s a0, v8
2045; RV32-NEXT:    ret
2046;
2047; RV64-LABEL: vreduce_and_v1i64:
2048; RV64:       # %bb.0:
2049; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2050; RV64-NEXT:    vmv.x.s a0, v8
2051; RV64-NEXT:    ret
2052  %red = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %v)
2053  ret i64 %red
2054}
2055
2056declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
2057
2058define i64 @vreduce_and_v2i64(ptr %x) {
2059; RV32-LABEL: vreduce_and_v2i64:
2060; RV32:       # %bb.0:
2061; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
2062; RV32-NEXT:    vle64.v v8, (a0)
2063; RV32-NEXT:    li a0, 32
2064; RV32-NEXT:    vredand.vs v8, v8, v8
2065; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2066; RV32-NEXT:    vsrl.vx v9, v8, a0
2067; RV32-NEXT:    vmv.x.s a1, v9
2068; RV32-NEXT:    vmv.x.s a0, v8
2069; RV32-NEXT:    ret
2070;
2071; RV64-LABEL: vreduce_and_v2i64:
2072; RV64:       # %bb.0:
2073; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
2074; RV64-NEXT:    vle64.v v8, (a0)
2075; RV64-NEXT:    vredand.vs v8, v8, v8
2076; RV64-NEXT:    vmv.x.s a0, v8
2077; RV64-NEXT:    ret
2078  %v = load <2 x i64>, ptr %x
2079  %red = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v)
2080  ret i64 %red
2081}
2082
2083declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
2084
2085define i64 @vreduce_and_v4i64(ptr %x) {
2086; RV32-LABEL: vreduce_and_v4i64:
2087; RV32:       # %bb.0:
2088; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
2089; RV32-NEXT:    vle64.v v8, (a0)
2090; RV32-NEXT:    li a1, 32
2091; RV32-NEXT:    vredand.vs v8, v8, v8
2092; RV32-NEXT:    vmv.x.s a0, v8
2093; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2094; RV32-NEXT:    vsrl.vx v8, v8, a1
2095; RV32-NEXT:    vmv.x.s a1, v8
2096; RV32-NEXT:    ret
2097;
2098; RV64-LABEL: vreduce_and_v4i64:
2099; RV64:       # %bb.0:
2100; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
2101; RV64-NEXT:    vle64.v v8, (a0)
2102; RV64-NEXT:    vredand.vs v8, v8, v8
2103; RV64-NEXT:    vmv.x.s a0, v8
2104; RV64-NEXT:    ret
2105  %v = load <4 x i64>, ptr %x
2106  %red = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
2107  ret i64 %red
2108}
2109
2110declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
2111
2112define i64 @vreduce_and_v8i64(ptr %x) {
2113; RV32-LABEL: vreduce_and_v8i64:
2114; RV32:       # %bb.0:
2115; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
2116; RV32-NEXT:    vle64.v v8, (a0)
2117; RV32-NEXT:    li a1, 32
2118; RV32-NEXT:    vredand.vs v8, v8, v8
2119; RV32-NEXT:    vmv.x.s a0, v8
2120; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2121; RV32-NEXT:    vsrl.vx v8, v8, a1
2122; RV32-NEXT:    vmv.x.s a1, v8
2123; RV32-NEXT:    ret
2124;
2125; RV64-LABEL: vreduce_and_v8i64:
2126; RV64:       # %bb.0:
2127; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
2128; RV64-NEXT:    vle64.v v8, (a0)
2129; RV64-NEXT:    vredand.vs v8, v8, v8
2130; RV64-NEXT:    vmv.x.s a0, v8
2131; RV64-NEXT:    ret
2132  %v = load <8 x i64>, ptr %x
2133  %red = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %v)
2134  ret i64 %red
2135}
2136
2137declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
2138
2139define i64 @vreduce_and_v16i64(ptr %x) {
2140; RV32-LABEL: vreduce_and_v16i64:
2141; RV32:       # %bb.0:
2142; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2143; RV32-NEXT:    vle64.v v8, (a0)
2144; RV32-NEXT:    li a1, 32
2145; RV32-NEXT:    vredand.vs v8, v8, v8
2146; RV32-NEXT:    vmv.x.s a0, v8
2147; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2148; RV32-NEXT:    vsrl.vx v8, v8, a1
2149; RV32-NEXT:    vmv.x.s a1, v8
2150; RV32-NEXT:    ret
2151;
2152; RV64-LABEL: vreduce_and_v16i64:
2153; RV64:       # %bb.0:
2154; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2155; RV64-NEXT:    vle64.v v8, (a0)
2156; RV64-NEXT:    vredand.vs v8, v8, v8
2157; RV64-NEXT:    vmv.x.s a0, v8
2158; RV64-NEXT:    ret
2159  %v = load <16 x i64>, ptr %x
2160  %red = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %v)
2161  ret i64 %red
2162}
2163
2164declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)
2165
2166define i64 @vreduce_and_v32i64(ptr %x) {
2167; RV32-LABEL: vreduce_and_v32i64:
2168; RV32:       # %bb.0:
2169; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2170; RV32-NEXT:    vle64.v v8, (a0)
2171; RV32-NEXT:    addi a0, a0, 128
2172; RV32-NEXT:    vle64.v v16, (a0)
2173; RV32-NEXT:    li a1, 32
2174; RV32-NEXT:    vand.vv v8, v8, v16
2175; RV32-NEXT:    vredand.vs v8, v8, v8
2176; RV32-NEXT:    vmv.x.s a0, v8
2177; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2178; RV32-NEXT:    vsrl.vx v8, v8, a1
2179; RV32-NEXT:    vmv.x.s a1, v8
2180; RV32-NEXT:    ret
2181;
2182; RV64-LABEL: vreduce_and_v32i64:
2183; RV64:       # %bb.0:
2184; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2185; RV64-NEXT:    vle64.v v8, (a0)
2186; RV64-NEXT:    addi a0, a0, 128
2187; RV64-NEXT:    vle64.v v16, (a0)
2188; RV64-NEXT:    vand.vv v8, v8, v16
2189; RV64-NEXT:    vredand.vs v8, v8, v8
2190; RV64-NEXT:    vmv.x.s a0, v8
2191; RV64-NEXT:    ret
2192  %v = load <32 x i64>, ptr %x
2193  %red = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %v)
2194  ret i64 %red
2195}
2196
2197declare i64 @llvm.vector.reduce.and.v64i64(<64 x i64>)
2198
2199define i64 @vreduce_and_v64i64(ptr %x) nounwind {
2200; RV32-LABEL: vreduce_and_v64i64:
2201; RV32:       # %bb.0:
2202; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2203; RV32-NEXT:    vle64.v v8, (a0)
2204; RV32-NEXT:    addi a1, a0, 384
2205; RV32-NEXT:    vle64.v v16, (a1)
2206; RV32-NEXT:    addi a1, a0, 256
2207; RV32-NEXT:    addi a0, a0, 128
2208; RV32-NEXT:    vle64.v v0, (a0)
2209; RV32-NEXT:    vle64.v v24, (a1)
2210; RV32-NEXT:    li a1, 32
2211; RV32-NEXT:    vand.vv v16, v0, v16
2212; RV32-NEXT:    vand.vv v8, v8, v24
2213; RV32-NEXT:    vand.vv v8, v8, v16
2214; RV32-NEXT:    vredand.vs v8, v8, v8
2215; RV32-NEXT:    vmv.x.s a0, v8
2216; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2217; RV32-NEXT:    vsrl.vx v8, v8, a1
2218; RV32-NEXT:    vmv.x.s a1, v8
2219; RV32-NEXT:    ret
2220;
2221; RV64-LABEL: vreduce_and_v64i64:
2222; RV64:       # %bb.0:
2223; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2224; RV64-NEXT:    vle64.v v8, (a0)
2225; RV64-NEXT:    addi a1, a0, 384
2226; RV64-NEXT:    vle64.v v16, (a1)
2227; RV64-NEXT:    addi a1, a0, 256
2228; RV64-NEXT:    addi a0, a0, 128
2229; RV64-NEXT:    vle64.v v24, (a0)
2230; RV64-NEXT:    vle64.v v0, (a1)
2231; RV64-NEXT:    vand.vv v16, v24, v16
2232; RV64-NEXT:    vand.vv v8, v8, v0
2233; RV64-NEXT:    vand.vv v8, v8, v16
2234; RV64-NEXT:    vredand.vs v8, v8, v8
2235; RV64-NEXT:    vmv.x.s a0, v8
2236; RV64-NEXT:    ret
2237  %v = load <64 x i64>, ptr %x
2238  %red = call i64 @llvm.vector.reduce.and.v64i64(<64 x i64> %v)
2239  ret i64 %red
2240}
2241
2242declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>)
2243
2244define i8 @vreduce_or_v1i8(<1 x i8> %v) {
2245; CHECK-LABEL: vreduce_or_v1i8:
2246; CHECK:       # %bb.0:
2247; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
2248; CHECK-NEXT:    vmv.x.s a0, v8
2249; CHECK-NEXT:    ret
2250  %red = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> %v)
2251  ret i8 %red
2252}
2253
2254declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>)
2255
2256define i8 @vreduce_or_v2i8(ptr %x) {
2257; CHECK-LABEL: vreduce_or_v2i8:
2258; CHECK:       # %bb.0:
2259; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
2260; CHECK-NEXT:    vle8.v v8, (a0)
2261; CHECK-NEXT:    vredor.vs v8, v8, v8
2262; CHECK-NEXT:    vmv.x.s a0, v8
2263; CHECK-NEXT:    ret
2264  %v = load <2 x i8>, ptr %x
2265  %red = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v)
2266  ret i8 %red
2267}
2268
2269declare i8 @llvm.vector.reduce.or.v3i8(<3 x i8>)
2270
2271define i8 @vreduce_or_v3i8(ptr %x) {
2272; CHECK-LABEL: vreduce_or_v3i8:
2273; CHECK:       # %bb.0:
2274; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
2275; CHECK-NEXT:    vle8.v v8, (a0)
2276; CHECK-NEXT:    vmv.s.x v9, zero
2277; CHECK-NEXT:    vredor.vs v8, v8, v9
2278; CHECK-NEXT:    vmv.x.s a0, v8
2279; CHECK-NEXT:    ret
2280  %v = load <3 x i8>, ptr %x
2281  %red = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> %v)
2282  ret i8 %red
2283}
2284
2285declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
2286
2287define i8 @vreduce_or_v4i8(ptr %x) {
2288; CHECK-LABEL: vreduce_or_v4i8:
2289; CHECK:       # %bb.0:
2290; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
2291; CHECK-NEXT:    vle8.v v8, (a0)
2292; CHECK-NEXT:    vredor.vs v8, v8, v8
2293; CHECK-NEXT:    vmv.x.s a0, v8
2294; CHECK-NEXT:    ret
2295  %v = load <4 x i8>, ptr %x
2296  %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
2297  ret i8 %red
2298}
2299
2300declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
2301
2302define i8 @vreduce_or_v8i8(ptr %x) {
2303; CHECK-LABEL: vreduce_or_v8i8:
2304; CHECK:       # %bb.0:
2305; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
2306; CHECK-NEXT:    vle8.v v8, (a0)
2307; CHECK-NEXT:    vredor.vs v8, v8, v8
2308; CHECK-NEXT:    vmv.x.s a0, v8
2309; CHECK-NEXT:    ret
2310  %v = load <8 x i8>, ptr %x
2311  %red = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v)
2312  ret i8 %red
2313}
2314
2315declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
2316
2317define i8 @vreduce_or_v16i8(ptr %x) {
2318; CHECK-LABEL: vreduce_or_v16i8:
2319; CHECK:       # %bb.0:
2320; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
2321; CHECK-NEXT:    vle8.v v8, (a0)
2322; CHECK-NEXT:    vredor.vs v8, v8, v8
2323; CHECK-NEXT:    vmv.x.s a0, v8
2324; CHECK-NEXT:    ret
2325  %v = load <16 x i8>, ptr %x
2326  %red = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v)
2327  ret i8 %red
2328}
2329
2330declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
2331
2332define i8 @vreduce_or_v32i8(ptr %x) {
2333; CHECK-LABEL: vreduce_or_v32i8:
2334; CHECK:       # %bb.0:
2335; CHECK-NEXT:    li a1, 32
2336; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
2337; CHECK-NEXT:    vle8.v v8, (a0)
2338; CHECK-NEXT:    vredor.vs v8, v8, v8
2339; CHECK-NEXT:    vmv.x.s a0, v8
2340; CHECK-NEXT:    ret
2341  %v = load <32 x i8>, ptr %x
2342  %red = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v)
2343  ret i8 %red
2344}
2345
2346declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
2347
2348define i8 @vreduce_or_v64i8(ptr %x) {
2349; CHECK-LABEL: vreduce_or_v64i8:
2350; CHECK:       # %bb.0:
2351; CHECK-NEXT:    li a1, 64
2352; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
2353; CHECK-NEXT:    vle8.v v8, (a0)
2354; CHECK-NEXT:    vredor.vs v8, v8, v8
2355; CHECK-NEXT:    vmv.x.s a0, v8
2356; CHECK-NEXT:    ret
2357  %v = load <64 x i8>, ptr %x
2358  %red = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %v)
2359  ret i8 %red
2360}
2361
2362declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
2363
2364define i8 @vreduce_or_v128i8(ptr %x) {
2365; CHECK-LABEL: vreduce_or_v128i8:
2366; CHECK:       # %bb.0:
2367; CHECK-NEXT:    li a1, 128
2368; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
2369; CHECK-NEXT:    vle8.v v8, (a0)
2370; CHECK-NEXT:    vredor.vs v8, v8, v8
2371; CHECK-NEXT:    vmv.x.s a0, v8
2372; CHECK-NEXT:    ret
2373  %v = load <128 x i8>, ptr %x
2374  %red = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %v)
2375  ret i8 %red
2376}
2377
2378declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)
2379
2380define i8 @vreduce_or_v256i8(ptr %x) {
2381; CHECK-LABEL: vreduce_or_v256i8:
2382; CHECK:       # %bb.0:
2383; CHECK-NEXT:    li a1, 128
2384; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
2385; CHECK-NEXT:    vle8.v v8, (a0)
2386; CHECK-NEXT:    addi a0, a0, 128
2387; CHECK-NEXT:    vle8.v v16, (a0)
2388; CHECK-NEXT:    vor.vv v8, v8, v16
2389; CHECK-NEXT:    vredor.vs v8, v8, v8
2390; CHECK-NEXT:    vmv.x.s a0, v8
2391; CHECK-NEXT:    ret
2392  %v = load <256 x i8>, ptr %x
2393  %red = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %v)
2394  ret i8 %red
2395}
2396
2397declare i16 @llvm.vector.reduce.or.v1i16(<1 x i16>)
2398
2399define i16 @vreduce_or_v1i16(<1 x i16> %v) {
2400; CHECK-LABEL: vreduce_or_v1i16:
2401; CHECK:       # %bb.0:
2402; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
2403; CHECK-NEXT:    vmv.x.s a0, v8
2404; CHECK-NEXT:    ret
2405  %red = call i16 @llvm.vector.reduce.or.v1i16(<1 x i16> %v)
2406  ret i16 %red
2407}
2408
2409declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>)
2410
2411define i16 @vreduce_or_v2i16(ptr %x) {
2412; CHECK-LABEL: vreduce_or_v2i16:
2413; CHECK:       # %bb.0:
2414; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
2415; CHECK-NEXT:    vle16.v v8, (a0)
2416; CHECK-NEXT:    vredor.vs v8, v8, v8
2417; CHECK-NEXT:    vmv.x.s a0, v8
2418; CHECK-NEXT:    ret
2419  %v = load <2 x i16>, ptr %x
2420  %red = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v)
2421  ret i16 %red
2422}
2423
2424declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
2425
2426define i16 @vreduce_or_v4i16(ptr %x) {
2427; CHECK-LABEL: vreduce_or_v4i16:
2428; CHECK:       # %bb.0:
2429; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
2430; CHECK-NEXT:    vle16.v v8, (a0)
2431; CHECK-NEXT:    vredor.vs v8, v8, v8
2432; CHECK-NEXT:    vmv.x.s a0, v8
2433; CHECK-NEXT:    ret
2434  %v = load <4 x i16>, ptr %x
2435  %red = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v)
2436  ret i16 %red
2437}
2438
2439declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
2440
2441define i16 @vreduce_or_v8i16(ptr %x) {
2442; CHECK-LABEL: vreduce_or_v8i16:
2443; CHECK:       # %bb.0:
2444; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
2445; CHECK-NEXT:    vle16.v v8, (a0)
2446; CHECK-NEXT:    vredor.vs v8, v8, v8
2447; CHECK-NEXT:    vmv.x.s a0, v8
2448; CHECK-NEXT:    ret
2449  %v = load <8 x i16>, ptr %x
2450  %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v)
2451  ret i16 %red
2452}
2453
2454declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
2455
2456define i16 @vreduce_or_v16i16(ptr %x) {
2457; CHECK-LABEL: vreduce_or_v16i16:
2458; CHECK:       # %bb.0:
2459; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
2460; CHECK-NEXT:    vle16.v v8, (a0)
2461; CHECK-NEXT:    vredor.vs v8, v8, v8
2462; CHECK-NEXT:    vmv.x.s a0, v8
2463; CHECK-NEXT:    ret
2464  %v = load <16 x i16>, ptr %x
2465  %red = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v)
2466  ret i16 %red
2467}
2468
2469declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
2470
2471define i16 @vreduce_or_v32i16(ptr %x) {
2472; CHECK-LABEL: vreduce_or_v32i16:
2473; CHECK:       # %bb.0:
2474; CHECK-NEXT:    li a1, 32
2475; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
2476; CHECK-NEXT:    vle16.v v8, (a0)
2477; CHECK-NEXT:    vredor.vs v8, v8, v8
2478; CHECK-NEXT:    vmv.x.s a0, v8
2479; CHECK-NEXT:    ret
2480  %v = load <32 x i16>, ptr %x
2481  %red = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %v)
2482  ret i16 %red
2483}
2484
2485declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
2486
2487define i16 @vreduce_or_v64i16(ptr %x) {
2488; CHECK-LABEL: vreduce_or_v64i16:
2489; CHECK:       # %bb.0:
2490; CHECK-NEXT:    li a1, 64
2491; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
2492; CHECK-NEXT:    vle16.v v8, (a0)
2493; CHECK-NEXT:    vredor.vs v8, v8, v8
2494; CHECK-NEXT:    vmv.x.s a0, v8
2495; CHECK-NEXT:    ret
2496  %v = load <64 x i16>, ptr %x
2497  %red = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %v)
2498  ret i16 %red
2499}
2500
2501declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)
2502
2503define i16 @vreduce_or_v128i16(ptr %x) {
2504; CHECK-LABEL: vreduce_or_v128i16:
2505; CHECK:       # %bb.0:
2506; CHECK-NEXT:    li a1, 64
2507; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
2508; CHECK-NEXT:    vle16.v v8, (a0)
2509; CHECK-NEXT:    addi a0, a0, 128
2510; CHECK-NEXT:    vle16.v v16, (a0)
2511; CHECK-NEXT:    vor.vv v8, v8, v16
2512; CHECK-NEXT:    vredor.vs v8, v8, v8
2513; CHECK-NEXT:    vmv.x.s a0, v8
2514; CHECK-NEXT:    ret
2515  %v = load <128 x i16>, ptr %x
2516  %red = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %v)
2517  ret i16 %red
2518}
2519
2520declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32>)
2521
2522define i32 @vreduce_or_v1i32(<1 x i32> %v) {
2523; CHECK-LABEL: vreduce_or_v1i32:
2524; CHECK:       # %bb.0:
2525; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
2526; CHECK-NEXT:    vmv.x.s a0, v8
2527; CHECK-NEXT:    ret
2528  %red = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %v)
2529  ret i32 %red
2530}
2531
2532declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
2533
2534define i32 @vreduce_or_v2i32(ptr %x) {
2535; CHECK-LABEL: vreduce_or_v2i32:
2536; CHECK:       # %bb.0:
2537; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
2538; CHECK-NEXT:    vle32.v v8, (a0)
2539; CHECK-NEXT:    vredor.vs v8, v8, v8
2540; CHECK-NEXT:    vmv.x.s a0, v8
2541; CHECK-NEXT:    ret
2542  %v = load <2 x i32>, ptr %x
2543  %red = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v)
2544  ret i32 %red
2545}
2546
2547declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
2548
2549define i32 @vreduce_or_v4i32(ptr %x) {
2550; CHECK-LABEL: vreduce_or_v4i32:
2551; CHECK:       # %bb.0:
2552; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
2553; CHECK-NEXT:    vle32.v v8, (a0)
2554; CHECK-NEXT:    vredor.vs v8, v8, v8
2555; CHECK-NEXT:    vmv.x.s a0, v8
2556; CHECK-NEXT:    ret
2557  %v = load <4 x i32>, ptr %x
2558  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
2559  ret i32 %red
2560}
2561
2562declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
2563
2564define i32 @vreduce_or_v8i32(ptr %x) {
2565; CHECK-LABEL: vreduce_or_v8i32:
2566; CHECK:       # %bb.0:
2567; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
2568; CHECK-NEXT:    vle32.v v8, (a0)
2569; CHECK-NEXT:    vredor.vs v8, v8, v8
2570; CHECK-NEXT:    vmv.x.s a0, v8
2571; CHECK-NEXT:    ret
2572  %v = load <8 x i32>, ptr %x
2573  %red = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v)
2574  ret i32 %red
2575}
2576
2577declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
2578
2579define i32 @vreduce_or_v16i32(ptr %x) {
2580; CHECK-LABEL: vreduce_or_v16i32:
2581; CHECK:       # %bb.0:
2582; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
2583; CHECK-NEXT:    vle32.v v8, (a0)
2584; CHECK-NEXT:    vredor.vs v8, v8, v8
2585; CHECK-NEXT:    vmv.x.s a0, v8
2586; CHECK-NEXT:    ret
2587  %v = load <16 x i32>, ptr %x
2588  %red = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v)
2589  ret i32 %red
2590}
2591
2592declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
2593
2594define i32 @vreduce_or_v32i32(ptr %x) {
2595; CHECK-LABEL: vreduce_or_v32i32:
2596; CHECK:       # %bb.0:
2597; CHECK-NEXT:    li a1, 32
2598; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
2599; CHECK-NEXT:    vle32.v v8, (a0)
2600; CHECK-NEXT:    vredor.vs v8, v8, v8
2601; CHECK-NEXT:    vmv.x.s a0, v8
2602; CHECK-NEXT:    ret
2603  %v = load <32 x i32>, ptr %x
2604  %red = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %v)
2605  ret i32 %red
2606}
2607
2608declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)
2609
2610define i32 @vreduce_or_v64i32(ptr %x) {
2611; CHECK-LABEL: vreduce_or_v64i32:
2612; CHECK:       # %bb.0:
2613; CHECK-NEXT:    li a1, 32
2614; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
2615; CHECK-NEXT:    vle32.v v8, (a0)
2616; CHECK-NEXT:    addi a0, a0, 128
2617; CHECK-NEXT:    vle32.v v16, (a0)
2618; CHECK-NEXT:    vor.vv v8, v8, v16
2619; CHECK-NEXT:    vredor.vs v8, v8, v8
2620; CHECK-NEXT:    vmv.x.s a0, v8
2621; CHECK-NEXT:    ret
2622  %v = load <64 x i32>, ptr %x
2623  %red = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %v)
2624  ret i32 %red
2625}
2626
2627declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
2628
2629define i64 @vreduce_or_v1i64(<1 x i64> %v) {
2630; RV32-LABEL: vreduce_or_v1i64:
2631; RV32:       # %bb.0:
2632; RV32-NEXT:    li a0, 32
2633; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2634; RV32-NEXT:    vsrl.vx v9, v8, a0
2635; RV32-NEXT:    vmv.x.s a1, v9
2636; RV32-NEXT:    vmv.x.s a0, v8
2637; RV32-NEXT:    ret
2638;
2639; RV64-LABEL: vreduce_or_v1i64:
2640; RV64:       # %bb.0:
2641; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2642; RV64-NEXT:    vmv.x.s a0, v8
2643; RV64-NEXT:    ret
2644  %red = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %v)
2645  ret i64 %red
2646}
2647
2648declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
2649
2650define i64 @vreduce_or_v2i64(ptr %x) {
2651; RV32-LABEL: vreduce_or_v2i64:
2652; RV32:       # %bb.0:
2653; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
2654; RV32-NEXT:    vle64.v v8, (a0)
2655; RV32-NEXT:    li a0, 32
2656; RV32-NEXT:    vredor.vs v8, v8, v8
2657; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2658; RV32-NEXT:    vsrl.vx v9, v8, a0
2659; RV32-NEXT:    vmv.x.s a1, v9
2660; RV32-NEXT:    vmv.x.s a0, v8
2661; RV32-NEXT:    ret
2662;
2663; RV64-LABEL: vreduce_or_v2i64:
2664; RV64:       # %bb.0:
2665; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
2666; RV64-NEXT:    vle64.v v8, (a0)
2667; RV64-NEXT:    vredor.vs v8, v8, v8
2668; RV64-NEXT:    vmv.x.s a0, v8
2669; RV64-NEXT:    ret
2670  %v = load <2 x i64>, ptr %x
2671  %red = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v)
2672  ret i64 %red
2673}
2674
2675declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
2676
2677define i64 @vreduce_or_v4i64(ptr %x) {
2678; RV32-LABEL: vreduce_or_v4i64:
2679; RV32:       # %bb.0:
2680; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
2681; RV32-NEXT:    vle64.v v8, (a0)
2682; RV32-NEXT:    li a1, 32
2683; RV32-NEXT:    vredor.vs v8, v8, v8
2684; RV32-NEXT:    vmv.x.s a0, v8
2685; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2686; RV32-NEXT:    vsrl.vx v8, v8, a1
2687; RV32-NEXT:    vmv.x.s a1, v8
2688; RV32-NEXT:    ret
2689;
2690; RV64-LABEL: vreduce_or_v4i64:
2691; RV64:       # %bb.0:
2692; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
2693; RV64-NEXT:    vle64.v v8, (a0)
2694; RV64-NEXT:    vredor.vs v8, v8, v8
2695; RV64-NEXT:    vmv.x.s a0, v8
2696; RV64-NEXT:    ret
2697  %v = load <4 x i64>, ptr %x
2698  %red = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
2699  ret i64 %red
2700}
2701
2702declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
2703
2704define i64 @vreduce_or_v8i64(ptr %x) {
2705; RV32-LABEL: vreduce_or_v8i64:
2706; RV32:       # %bb.0:
2707; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
2708; RV32-NEXT:    vle64.v v8, (a0)
2709; RV32-NEXT:    li a1, 32
2710; RV32-NEXT:    vredor.vs v8, v8, v8
2711; RV32-NEXT:    vmv.x.s a0, v8
2712; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2713; RV32-NEXT:    vsrl.vx v8, v8, a1
2714; RV32-NEXT:    vmv.x.s a1, v8
2715; RV32-NEXT:    ret
2716;
2717; RV64-LABEL: vreduce_or_v8i64:
2718; RV64:       # %bb.0:
2719; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
2720; RV64-NEXT:    vle64.v v8, (a0)
2721; RV64-NEXT:    vredor.vs v8, v8, v8
2722; RV64-NEXT:    vmv.x.s a0, v8
2723; RV64-NEXT:    ret
2724  %v = load <8 x i64>, ptr %x
2725  %red = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %v)
2726  ret i64 %red
2727}
2728
2729declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
2730
2731define i64 @vreduce_or_v16i64(ptr %x) {
2732; RV32-LABEL: vreduce_or_v16i64:
2733; RV32:       # %bb.0:
2734; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2735; RV32-NEXT:    vle64.v v8, (a0)
2736; RV32-NEXT:    li a1, 32
2737; RV32-NEXT:    vredor.vs v8, v8, v8
2738; RV32-NEXT:    vmv.x.s a0, v8
2739; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2740; RV32-NEXT:    vsrl.vx v8, v8, a1
2741; RV32-NEXT:    vmv.x.s a1, v8
2742; RV32-NEXT:    ret
2743;
2744; RV64-LABEL: vreduce_or_v16i64:
2745; RV64:       # %bb.0:
2746; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2747; RV64-NEXT:    vle64.v v8, (a0)
2748; RV64-NEXT:    vredor.vs v8, v8, v8
2749; RV64-NEXT:    vmv.x.s a0, v8
2750; RV64-NEXT:    ret
2751  %v = load <16 x i64>, ptr %x
2752  %red = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %v)
2753  ret i64 %red
2754}
2755
2756declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)
2757
2758define i64 @vreduce_or_v32i64(ptr %x) {
2759; RV32-LABEL: vreduce_or_v32i64:
2760; RV32:       # %bb.0:
2761; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2762; RV32-NEXT:    vle64.v v8, (a0)
2763; RV32-NEXT:    addi a0, a0, 128
2764; RV32-NEXT:    vle64.v v16, (a0)
2765; RV32-NEXT:    li a1, 32
2766; RV32-NEXT:    vor.vv v8, v8, v16
2767; RV32-NEXT:    vredor.vs v8, v8, v8
2768; RV32-NEXT:    vmv.x.s a0, v8
2769; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2770; RV32-NEXT:    vsrl.vx v8, v8, a1
2771; RV32-NEXT:    vmv.x.s a1, v8
2772; RV32-NEXT:    ret
2773;
2774; RV64-LABEL: vreduce_or_v32i64:
2775; RV64:       # %bb.0:
2776; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2777; RV64-NEXT:    vle64.v v8, (a0)
2778; RV64-NEXT:    addi a0, a0, 128
2779; RV64-NEXT:    vle64.v v16, (a0)
2780; RV64-NEXT:    vor.vv v8, v8, v16
2781; RV64-NEXT:    vredor.vs v8, v8, v8
2782; RV64-NEXT:    vmv.x.s a0, v8
2783; RV64-NEXT:    ret
2784  %v = load <32 x i64>, ptr %x
2785  %red = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %v)
2786  ret i64 %red
2787}
2788
2789declare i64 @llvm.vector.reduce.or.v64i64(<64 x i64>)
2790
2791define i64 @vreduce_or_v64i64(ptr %x) nounwind {
2792; RV32-LABEL: vreduce_or_v64i64:
2793; RV32:       # %bb.0:
2794; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2795; RV32-NEXT:    vle64.v v8, (a0)
2796; RV32-NEXT:    addi a1, a0, 384
2797; RV32-NEXT:    vle64.v v16, (a1)
2798; RV32-NEXT:    addi a1, a0, 256
2799; RV32-NEXT:    addi a0, a0, 128
2800; RV32-NEXT:    vle64.v v0, (a0)
2801; RV32-NEXT:    vle64.v v24, (a1)
2802; RV32-NEXT:    li a1, 32
2803; RV32-NEXT:    vor.vv v16, v0, v16
2804; RV32-NEXT:    vor.vv v8, v8, v24
2805; RV32-NEXT:    vor.vv v8, v8, v16
2806; RV32-NEXT:    vredor.vs v8, v8, v8
2807; RV32-NEXT:    vmv.x.s a0, v8
2808; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
2809; RV32-NEXT:    vsrl.vx v8, v8, a1
2810; RV32-NEXT:    vmv.x.s a1, v8
2811; RV32-NEXT:    ret
2812;
2813; RV64-LABEL: vreduce_or_v64i64:
2814; RV64:       # %bb.0:
2815; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
2816; RV64-NEXT:    vle64.v v8, (a0)
2817; RV64-NEXT:    addi a1, a0, 384
2818; RV64-NEXT:    vle64.v v16, (a1)
2819; RV64-NEXT:    addi a1, a0, 256
2820; RV64-NEXT:    addi a0, a0, 128
2821; RV64-NEXT:    vle64.v v24, (a0)
2822; RV64-NEXT:    vle64.v v0, (a1)
2823; RV64-NEXT:    vor.vv v16, v24, v16
2824; RV64-NEXT:    vor.vv v8, v8, v0
2825; RV64-NEXT:    vor.vv v8, v8, v16
2826; RV64-NEXT:    vredor.vs v8, v8, v8
2827; RV64-NEXT:    vmv.x.s a0, v8
2828; RV64-NEXT:    ret
2829  %v = load <64 x i64>, ptr %x
2830  %red = call i64 @llvm.vector.reduce.or.v64i64(<64 x i64> %v)
2831  ret i64 %red
2832}
2833
2834declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>)
2835
2836define i8 @vreduce_xor_v1i8(<1 x i8> %v) {
2837; CHECK-LABEL: vreduce_xor_v1i8:
2838; CHECK:       # %bb.0:
2839; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
2840; CHECK-NEXT:    vmv.x.s a0, v8
2841; CHECK-NEXT:    ret
2842  %red = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> %v)
2843  ret i8 %red
2844}
2845
2846declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>)
2847
2848define i8 @vreduce_xor_v2i8(ptr %x) {
2849; CHECK-LABEL: vreduce_xor_v2i8:
2850; CHECK:       # %bb.0:
2851; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
2852; CHECK-NEXT:    vle8.v v8, (a0)
2853; CHECK-NEXT:    vmv.s.x v9, zero
2854; CHECK-NEXT:    vredxor.vs v8, v8, v9
2855; CHECK-NEXT:    vmv.x.s a0, v8
2856; CHECK-NEXT:    ret
2857  %v = load <2 x i8>, ptr %x
2858  %red = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %v)
2859  ret i8 %red
2860}
2861
2862declare i8 @llvm.vector.reduce.xor.v3i8(<3 x i8>)
2863
2864define i8 @vreduce_xor_v3i8(ptr %x) {
2865; CHECK-LABEL: vreduce_xor_v3i8:
2866; CHECK:       # %bb.0:
2867; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
2868; CHECK-NEXT:    vle8.v v8, (a0)
2869; CHECK-NEXT:    vmv.s.x v9, zero
2870; CHECK-NEXT:    vredxor.vs v8, v8, v9
2871; CHECK-NEXT:    vmv.x.s a0, v8
2872; CHECK-NEXT:    ret
2873  %v = load <3 x i8>, ptr %x
2874  %red = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> %v)
2875  ret i8 %red
2876}
2877
2878declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
2879
2880define i8 @vreduce_xor_v4i8(ptr %x) {
2881; CHECK-LABEL: vreduce_xor_v4i8:
2882; CHECK:       # %bb.0:
2883; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
2884; CHECK-NEXT:    vle8.v v8, (a0)
2885; CHECK-NEXT:    vmv.s.x v9, zero
2886; CHECK-NEXT:    vredxor.vs v8, v8, v9
2887; CHECK-NEXT:    vmv.x.s a0, v8
2888; CHECK-NEXT:    ret
2889  %v = load <4 x i8>, ptr %x
2890  %red = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %v)
2891  ret i8 %red
2892}
2893
2894declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
2895
2896define i8 @vreduce_xor_v8i8(ptr %x) {
2897; CHECK-LABEL: vreduce_xor_v8i8:
2898; CHECK:       # %bb.0:
2899; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
2900; CHECK-NEXT:    vle8.v v8, (a0)
2901; CHECK-NEXT:    vmv.s.x v9, zero
2902; CHECK-NEXT:    vredxor.vs v8, v8, v9
2903; CHECK-NEXT:    vmv.x.s a0, v8
2904; CHECK-NEXT:    ret
2905  %v = load <8 x i8>, ptr %x
2906  %red = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v)
2907  ret i8 %red
2908}
2909
2910declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
2911
2912define i8 @vreduce_xor_v16i8(ptr %x) {
2913; CHECK-LABEL: vreduce_xor_v16i8:
2914; CHECK:       # %bb.0:
2915; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
2916; CHECK-NEXT:    vle8.v v8, (a0)
2917; CHECK-NEXT:    vmv.s.x v9, zero
2918; CHECK-NEXT:    vredxor.vs v8, v8, v9
2919; CHECK-NEXT:    vmv.x.s a0, v8
2920; CHECK-NEXT:    ret
2921  %v = load <16 x i8>, ptr %x
2922  %red = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %v)
2923  ret i8 %red
2924}
2925
2926declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
2927
2928define i8 @vreduce_xor_v32i8(ptr %x) {
2929; CHECK-LABEL: vreduce_xor_v32i8:
2930; CHECK:       # %bb.0:
2931; CHECK-NEXT:    li a1, 32
2932; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
2933; CHECK-NEXT:    vle8.v v8, (a0)
2934; CHECK-NEXT:    vmv.s.x v10, zero
2935; CHECK-NEXT:    vredxor.vs v8, v8, v10
2936; CHECK-NEXT:    vmv.x.s a0, v8
2937; CHECK-NEXT:    ret
2938  %v = load <32 x i8>, ptr %x
2939  %red = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %v)
2940  ret i8 %red
2941}
2942
2943declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
2944
2945define i8 @vreduce_xor_v64i8(ptr %x) {
2946; CHECK-LABEL: vreduce_xor_v64i8:
2947; CHECK:       # %bb.0:
2948; CHECK-NEXT:    li a1, 64
2949; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
2950; CHECK-NEXT:    vle8.v v8, (a0)
2951; CHECK-NEXT:    vmv.s.x v12, zero
2952; CHECK-NEXT:    vredxor.vs v8, v8, v12
2953; CHECK-NEXT:    vmv.x.s a0, v8
2954; CHECK-NEXT:    ret
2955  %v = load <64 x i8>, ptr %x
2956  %red = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %v)
2957  ret i8 %red
2958}
2959
2960declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
2961
2962define i8 @vreduce_xor_v128i8(ptr %x) {
2963; CHECK-LABEL: vreduce_xor_v128i8:
2964; CHECK:       # %bb.0:
2965; CHECK-NEXT:    li a1, 128
2966; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
2967; CHECK-NEXT:    vle8.v v8, (a0)
2968; CHECK-NEXT:    vmv.s.x v16, zero
2969; CHECK-NEXT:    vredxor.vs v8, v8, v16
2970; CHECK-NEXT:    vmv.x.s a0, v8
2971; CHECK-NEXT:    ret
2972  %v = load <128 x i8>, ptr %x
2973  %red = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %v)
2974  ret i8 %red
2975}
2976
2977declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)
2978
2979define i8 @vreduce_xor_v256i8(ptr %x) {
2980; CHECK-LABEL: vreduce_xor_v256i8:
2981; CHECK:       # %bb.0:
2982; CHECK-NEXT:    li a1, 128
2983; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
2984; CHECK-NEXT:    vle8.v v8, (a0)
2985; CHECK-NEXT:    addi a0, a0, 128
2986; CHECK-NEXT:    vle8.v v16, (a0)
2987; CHECK-NEXT:    vxor.vv v8, v8, v16
2988; CHECK-NEXT:    vmv.s.x v16, zero
2989; CHECK-NEXT:    vredxor.vs v8, v8, v16
2990; CHECK-NEXT:    vmv.x.s a0, v8
2991; CHECK-NEXT:    ret
2992  %v = load <256 x i8>, ptr %x
2993  %red = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %v)
2994  ret i8 %red
2995}
2996
2997declare i16 @llvm.vector.reduce.xor.v1i16(<1 x i16>)
2998
2999define i16 @vreduce_xor_v1i16(<1 x i16> %v) {
3000; CHECK-LABEL: vreduce_xor_v1i16:
3001; CHECK:       # %bb.0:
3002; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
3003; CHECK-NEXT:    vmv.x.s a0, v8
3004; CHECK-NEXT:    ret
3005  %red = call i16 @llvm.vector.reduce.xor.v1i16(<1 x i16> %v)
3006  ret i16 %red
3007}
3008
3009declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>)
3010
3011define i16 @vreduce_xor_v2i16(ptr %x) {
3012; CHECK-LABEL: vreduce_xor_v2i16:
3013; CHECK:       # %bb.0:
3014; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
3015; CHECK-NEXT:    vle16.v v8, (a0)
3016; CHECK-NEXT:    vmv.s.x v9, zero
3017; CHECK-NEXT:    vredxor.vs v8, v8, v9
3018; CHECK-NEXT:    vmv.x.s a0, v8
3019; CHECK-NEXT:    ret
3020  %v = load <2 x i16>, ptr %x
3021  %red = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %v)
3022  ret i16 %red
3023}
3024
3025declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
3026
3027define i16 @vreduce_xor_v4i16(ptr %x) {
3028; CHECK-LABEL: vreduce_xor_v4i16:
3029; CHECK:       # %bb.0:
3030; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
3031; CHECK-NEXT:    vle16.v v8, (a0)
3032; CHECK-NEXT:    vmv.s.x v9, zero
3033; CHECK-NEXT:    vredxor.vs v8, v8, v9
3034; CHECK-NEXT:    vmv.x.s a0, v8
3035; CHECK-NEXT:    ret
3036  %v = load <4 x i16>, ptr %x
3037  %red = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v)
3038  ret i16 %red
3039}
3040
3041declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
3042
3043define i16 @vreduce_xor_v8i16(ptr %x) {
3044; CHECK-LABEL: vreduce_xor_v8i16:
3045; CHECK:       # %bb.0:
3046; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
3047; CHECK-NEXT:    vle16.v v8, (a0)
3048; CHECK-NEXT:    vmv.s.x v9, zero
3049; CHECK-NEXT:    vredxor.vs v8, v8, v9
3050; CHECK-NEXT:    vmv.x.s a0, v8
3051; CHECK-NEXT:    ret
3052  %v = load <8 x i16>, ptr %x
3053  %red = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %v)
3054  ret i16 %red
3055}
3056
3057declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
3058
3059define i16 @vreduce_xor_v16i16(ptr %x) {
3060; CHECK-LABEL: vreduce_xor_v16i16:
3061; CHECK:       # %bb.0:
3062; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
3063; CHECK-NEXT:    vle16.v v8, (a0)
3064; CHECK-NEXT:    vmv.s.x v10, zero
3065; CHECK-NEXT:    vredxor.vs v8, v8, v10
3066; CHECK-NEXT:    vmv.x.s a0, v8
3067; CHECK-NEXT:    ret
3068  %v = load <16 x i16>, ptr %x
3069  %red = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v)
3070  ret i16 %red
3071}
3072
3073declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
3074
3075define i16 @vreduce_xor_v32i16(ptr %x) {
3076; CHECK-LABEL: vreduce_xor_v32i16:
3077; CHECK:       # %bb.0:
3078; CHECK-NEXT:    li a1, 32
3079; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
3080; CHECK-NEXT:    vle16.v v8, (a0)
3081; CHECK-NEXT:    vmv.s.x v12, zero
3082; CHECK-NEXT:    vredxor.vs v8, v8, v12
3083; CHECK-NEXT:    vmv.x.s a0, v8
3084; CHECK-NEXT:    ret
3085  %v = load <32 x i16>, ptr %x
3086  %red = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %v)
3087  ret i16 %red
3088}
3089
3090declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
3091
3092define i16 @vreduce_xor_v64i16(ptr %x) {
3093; CHECK-LABEL: vreduce_xor_v64i16:
3094; CHECK:       # %bb.0:
3095; CHECK-NEXT:    li a1, 64
3096; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
3097; CHECK-NEXT:    vle16.v v8, (a0)
3098; CHECK-NEXT:    vmv.s.x v16, zero
3099; CHECK-NEXT:    vredxor.vs v8, v8, v16
3100; CHECK-NEXT:    vmv.x.s a0, v8
3101; CHECK-NEXT:    ret
3102  %v = load <64 x i16>, ptr %x
3103  %red = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %v)
3104  ret i16 %red
3105}
3106
3107declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)
3108
3109define i16 @vreduce_xor_v128i16(ptr %x) {
3110; CHECK-LABEL: vreduce_xor_v128i16:
3111; CHECK:       # %bb.0:
3112; CHECK-NEXT:    li a1, 64
3113; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
3114; CHECK-NEXT:    vle16.v v8, (a0)
3115; CHECK-NEXT:    addi a0, a0, 128
3116; CHECK-NEXT:    vle16.v v16, (a0)
3117; CHECK-NEXT:    vxor.vv v8, v8, v16
3118; CHECK-NEXT:    vmv.s.x v16, zero
3119; CHECK-NEXT:    vredxor.vs v8, v8, v16
3120; CHECK-NEXT:    vmv.x.s a0, v8
3121; CHECK-NEXT:    ret
3122  %v = load <128 x i16>, ptr %x
3123  %red = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %v)
3124  ret i16 %red
3125}
3126
3127declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32>)
3128
3129define i32 @vreduce_xor_v1i32(<1 x i32> %v) {
3130; CHECK-LABEL: vreduce_xor_v1i32:
3131; CHECK:       # %bb.0:
3132; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
3133; CHECK-NEXT:    vmv.x.s a0, v8
3134; CHECK-NEXT:    ret
3135  %red = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %v)
3136  ret i32 %red
3137}
3138
3139declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
3140
3141define i32 @vreduce_xor_v2i32(ptr %x) {
3142; CHECK-LABEL: vreduce_xor_v2i32:
3143; CHECK:       # %bb.0:
3144; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
3145; CHECK-NEXT:    vle32.v v8, (a0)
3146; CHECK-NEXT:    vmv.s.x v9, zero
3147; CHECK-NEXT:    vredxor.vs v8, v8, v9
3148; CHECK-NEXT:    vmv.x.s a0, v8
3149; CHECK-NEXT:    ret
3150  %v = load <2 x i32>, ptr %x
3151  %red = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v)
3152  ret i32 %red
3153}
3154
3155declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
3156
3157define i32 @vreduce_xor_v4i32(ptr %x) {
3158; CHECK-LABEL: vreduce_xor_v4i32:
3159; CHECK:       # %bb.0:
3160; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
3161; CHECK-NEXT:    vle32.v v8, (a0)
3162; CHECK-NEXT:    vmv.s.x v9, zero
3163; CHECK-NEXT:    vredxor.vs v8, v8, v9
3164; CHECK-NEXT:    vmv.x.s a0, v8
3165; CHECK-NEXT:    ret
3166  %v = load <4 x i32>, ptr %x
3167  %red = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v)
3168  ret i32 %red
3169}
3170
3171declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
3172
3173define i32 @vreduce_xor_v8i32(ptr %x) {
3174; CHECK-LABEL: vreduce_xor_v8i32:
3175; CHECK:       # %bb.0:
3176; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
3177; CHECK-NEXT:    vle32.v v8, (a0)
3178; CHECK-NEXT:    vmv.s.x v10, zero
3179; CHECK-NEXT:    vredxor.vs v8, v8, v10
3180; CHECK-NEXT:    vmv.x.s a0, v8
3181; CHECK-NEXT:    ret
3182  %v = load <8 x i32>, ptr %x
3183  %red = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v)
3184  ret i32 %red
3185}
3186
3187declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
3188
3189define i32 @vreduce_xor_v16i32(ptr %x) {
3190; CHECK-LABEL: vreduce_xor_v16i32:
3191; CHECK:       # %bb.0:
3192; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
3193; CHECK-NEXT:    vle32.v v8, (a0)
3194; CHECK-NEXT:    vmv.s.x v12, zero
3195; CHECK-NEXT:    vredxor.vs v8, v8, v12
3196; CHECK-NEXT:    vmv.x.s a0, v8
3197; CHECK-NEXT:    ret
3198  %v = load <16 x i32>, ptr %x
3199  %red = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v)
3200  ret i32 %red
3201}
3202
3203declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
3204
3205define i32 @vreduce_xor_v32i32(ptr %x) {
3206; CHECK-LABEL: vreduce_xor_v32i32:
3207; CHECK:       # %bb.0:
3208; CHECK-NEXT:    li a1, 32
3209; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
3210; CHECK-NEXT:    vle32.v v8, (a0)
3211; CHECK-NEXT:    vmv.s.x v16, zero
3212; CHECK-NEXT:    vredxor.vs v8, v8, v16
3213; CHECK-NEXT:    vmv.x.s a0, v8
3214; CHECK-NEXT:    ret
3215  %v = load <32 x i32>, ptr %x
3216  %red = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %v)
3217  ret i32 %red
3218}
3219
3220declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)
3221
3222define i32 @vreduce_xor_v64i32(ptr %x) {
3223; CHECK-LABEL: vreduce_xor_v64i32:
3224; CHECK:       # %bb.0:
3225; CHECK-NEXT:    li a1, 32
3226; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
3227; CHECK-NEXT:    vle32.v v8, (a0)
3228; CHECK-NEXT:    addi a0, a0, 128
3229; CHECK-NEXT:    vle32.v v16, (a0)
3230; CHECK-NEXT:    vxor.vv v8, v8, v16
3231; CHECK-NEXT:    vmv.s.x v16, zero
3232; CHECK-NEXT:    vredxor.vs v8, v8, v16
3233; CHECK-NEXT:    vmv.x.s a0, v8
3234; CHECK-NEXT:    ret
3235  %v = load <64 x i32>, ptr %x
3236  %red = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %v)
3237  ret i32 %red
3238}
3239
3240declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
3241
3242define i64 @vreduce_xor_v1i64(<1 x i64> %v) {
3243; RV32-LABEL: vreduce_xor_v1i64:
3244; RV32:       # %bb.0:
3245; RV32-NEXT:    li a0, 32
3246; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3247; RV32-NEXT:    vsrl.vx v9, v8, a0
3248; RV32-NEXT:    vmv.x.s a1, v9
3249; RV32-NEXT:    vmv.x.s a0, v8
3250; RV32-NEXT:    ret
3251;
3252; RV64-LABEL: vreduce_xor_v1i64:
3253; RV64:       # %bb.0:
3254; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3255; RV64-NEXT:    vmv.x.s a0, v8
3256; RV64-NEXT:    ret
3257  %red = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %v)
3258  ret i64 %red
3259}
3260
3261declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
3262
3263define i64 @vreduce_xor_v2i64(ptr %x) {
3264; RV32-LABEL: vreduce_xor_v2i64:
3265; RV32:       # %bb.0:
3266; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
3267; RV32-NEXT:    vle64.v v8, (a0)
3268; RV32-NEXT:    vmv.s.x v9, zero
3269; RV32-NEXT:    li a1, 32
3270; RV32-NEXT:    vredxor.vs v8, v8, v9
3271; RV32-NEXT:    vmv.x.s a0, v8
3272; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3273; RV32-NEXT:    vsrl.vx v8, v8, a1
3274; RV32-NEXT:    vmv.x.s a1, v8
3275; RV32-NEXT:    ret
3276;
3277; RV64-LABEL: vreduce_xor_v2i64:
3278; RV64:       # %bb.0:
3279; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
3280; RV64-NEXT:    vle64.v v8, (a0)
3281; RV64-NEXT:    vmv.s.x v9, zero
3282; RV64-NEXT:    vredxor.vs v8, v8, v9
3283; RV64-NEXT:    vmv.x.s a0, v8
3284; RV64-NEXT:    ret
3285  %v = load <2 x i64>, ptr %x
3286  %red = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v)
3287  ret i64 %red
3288}
3289
3290declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
3291
3292define i64 @vreduce_xor_v4i64(ptr %x) {
3293; RV32-LABEL: vreduce_xor_v4i64:
3294; RV32:       # %bb.0:
3295; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
3296; RV32-NEXT:    vle64.v v8, (a0)
3297; RV32-NEXT:    vmv.s.x v10, zero
3298; RV32-NEXT:    li a1, 32
3299; RV32-NEXT:    vredxor.vs v8, v8, v10
3300; RV32-NEXT:    vmv.x.s a0, v8
3301; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3302; RV32-NEXT:    vsrl.vx v8, v8, a1
3303; RV32-NEXT:    vmv.x.s a1, v8
3304; RV32-NEXT:    ret
3305;
3306; RV64-LABEL: vreduce_xor_v4i64:
3307; RV64:       # %bb.0:
3308; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
3309; RV64-NEXT:    vle64.v v8, (a0)
3310; RV64-NEXT:    vmv.s.x v10, zero
3311; RV64-NEXT:    vredxor.vs v8, v8, v10
3312; RV64-NEXT:    vmv.x.s a0, v8
3313; RV64-NEXT:    ret
3314  %v = load <4 x i64>, ptr %x
3315  %red = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
3316  ret i64 %red
3317}
3318
3319declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
3320
3321define i64 @vreduce_xor_v8i64(ptr %x) {
3322; RV32-LABEL: vreduce_xor_v8i64:
3323; RV32:       # %bb.0:
3324; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
3325; RV32-NEXT:    vle64.v v8, (a0)
3326; RV32-NEXT:    vmv.s.x v12, zero
3327; RV32-NEXT:    li a1, 32
3328; RV32-NEXT:    vredxor.vs v8, v8, v12
3329; RV32-NEXT:    vmv.x.s a0, v8
3330; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3331; RV32-NEXT:    vsrl.vx v8, v8, a1
3332; RV32-NEXT:    vmv.x.s a1, v8
3333; RV32-NEXT:    ret
3334;
3335; RV64-LABEL: vreduce_xor_v8i64:
3336; RV64:       # %bb.0:
3337; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
3338; RV64-NEXT:    vle64.v v8, (a0)
3339; RV64-NEXT:    vmv.s.x v12, zero
3340; RV64-NEXT:    vredxor.vs v8, v8, v12
3341; RV64-NEXT:    vmv.x.s a0, v8
3342; RV64-NEXT:    ret
3343  %v = load <8 x i64>, ptr %x
3344  %red = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %v)
3345  ret i64 %red
3346}
3347
3348declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
3349
3350define i64 @vreduce_xor_v16i64(ptr %x) {
3351; RV32-LABEL: vreduce_xor_v16i64:
3352; RV32:       # %bb.0:
3353; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3354; RV32-NEXT:    vle64.v v8, (a0)
3355; RV32-NEXT:    vmv.s.x v16, zero
3356; RV32-NEXT:    li a1, 32
3357; RV32-NEXT:    vredxor.vs v8, v8, v16
3358; RV32-NEXT:    vmv.x.s a0, v8
3359; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3360; RV32-NEXT:    vsrl.vx v8, v8, a1
3361; RV32-NEXT:    vmv.x.s a1, v8
3362; RV32-NEXT:    ret
3363;
3364; RV64-LABEL: vreduce_xor_v16i64:
3365; RV64:       # %bb.0:
3366; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3367; RV64-NEXT:    vle64.v v8, (a0)
3368; RV64-NEXT:    vmv.s.x v16, zero
3369; RV64-NEXT:    vredxor.vs v8, v8, v16
3370; RV64-NEXT:    vmv.x.s a0, v8
3371; RV64-NEXT:    ret
3372  %v = load <16 x i64>, ptr %x
3373  %red = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %v)
3374  ret i64 %red
3375}
3376
3377declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)
3378
3379define i64 @vreduce_xor_v32i64(ptr %x) {
3380; RV32-LABEL: vreduce_xor_v32i64:
3381; RV32:       # %bb.0:
3382; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3383; RV32-NEXT:    vle64.v v8, (a0)
3384; RV32-NEXT:    addi a0, a0, 128
3385; RV32-NEXT:    vle64.v v16, (a0)
3386; RV32-NEXT:    vxor.vv v8, v8, v16
3387; RV32-NEXT:    vmv.s.x v16, zero
3388; RV32-NEXT:    li a1, 32
3389; RV32-NEXT:    vredxor.vs v8, v8, v16
3390; RV32-NEXT:    vmv.x.s a0, v8
3391; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3392; RV32-NEXT:    vsrl.vx v8, v8, a1
3393; RV32-NEXT:    vmv.x.s a1, v8
3394; RV32-NEXT:    ret
3395;
3396; RV64-LABEL: vreduce_xor_v32i64:
3397; RV64:       # %bb.0:
3398; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3399; RV64-NEXT:    vle64.v v8, (a0)
3400; RV64-NEXT:    addi a0, a0, 128
3401; RV64-NEXT:    vle64.v v16, (a0)
3402; RV64-NEXT:    vxor.vv v8, v8, v16
3403; RV64-NEXT:    vmv.s.x v16, zero
3404; RV64-NEXT:    vredxor.vs v8, v8, v16
3405; RV64-NEXT:    vmv.x.s a0, v8
3406; RV64-NEXT:    ret
3407  %v = load <32 x i64>, ptr %x
3408  %red = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %v)
3409  ret i64 %red
3410}
3411
3412declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>)
3413
3414define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
3415; RV32-LABEL: vreduce_xor_v64i64:
3416; RV32:       # %bb.0:
3417; RV32-NEXT:    addi a1, a0, 384
3418; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3419; RV32-NEXT:    vle64.v v24, (a1)
3420; RV32-NEXT:    addi a1, a0, 128
3421; RV32-NEXT:    vle64.v v0, (a1)
3422; RV32-NEXT:    vle64.v v8, (a0)
3423; RV32-NEXT:    addi a0, a0, 256
3424; RV32-NEXT:    vle64.v v16, (a0)
3425; RV32-NEXT:    vxor.vv v24, v0, v24
3426; RV32-NEXT:    vmv.s.x v7, zero
3427; RV32-NEXT:    li a1, 32
3428; RV32-NEXT:    vxor.vv v8, v8, v16
3429; RV32-NEXT:    vxor.vv v8, v8, v24
3430; RV32-NEXT:    vredxor.vs v8, v8, v7
3431; RV32-NEXT:    vmv.x.s a0, v8
3432; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3433; RV32-NEXT:    vsrl.vx v8, v8, a1
3434; RV32-NEXT:    vmv.x.s a1, v8
3435; RV32-NEXT:    ret
3436;
3437; RV64-LABEL: vreduce_xor_v64i64:
3438; RV64:       # %bb.0:
3439; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3440; RV64-NEXT:    vle64.v v8, (a0)
3441; RV64-NEXT:    addi a1, a0, 384
3442; RV64-NEXT:    vle64.v v16, (a1)
3443; RV64-NEXT:    addi a1, a0, 256
3444; RV64-NEXT:    addi a0, a0, 128
3445; RV64-NEXT:    vle64.v v24, (a0)
3446; RV64-NEXT:    vle64.v v0, (a1)
3447; RV64-NEXT:    vxor.vv v16, v24, v16
3448; RV64-NEXT:    vxor.vv v8, v8, v0
3449; RV64-NEXT:    vxor.vv v8, v8, v16
3450; RV64-NEXT:    vmv.s.x v16, zero
3451; RV64-NEXT:    vredxor.vs v8, v8, v16
3452; RV64-NEXT:    vmv.x.s a0, v8
3453; RV64-NEXT:    ret
3454  %v = load <64 x i64>, ptr %x
3455  %red = call i64 @llvm.vector.reduce.xor.v64i64(<64 x i64> %v)
3456  ret i64 %red
3457}
3458
3459declare i8 @llvm.vector.reduce.smin.v1i8(<1 x i8>)
3460
3461define i8 @vreduce_smin_v1i8(<1 x i8> %v) {
3462; CHECK-LABEL: vreduce_smin_v1i8:
3463; CHECK:       # %bb.0:
3464; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
3465; CHECK-NEXT:    vmv.x.s a0, v8
3466; CHECK-NEXT:    ret
3467  %red = call i8 @llvm.vector.reduce.smin.v1i8(<1 x i8> %v)
3468  ret i8 %red
3469}
3470
3471declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>)
3472
3473define i8 @vreduce_smin_v2i8(ptr %x) {
3474; CHECK-LABEL: vreduce_smin_v2i8:
3475; CHECK:       # %bb.0:
3476; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
3477; CHECK-NEXT:    vle8.v v8, (a0)
3478; CHECK-NEXT:    vredmin.vs v8, v8, v8
3479; CHECK-NEXT:    vmv.x.s a0, v8
3480; CHECK-NEXT:    ret
3481  %v = load <2 x i8>, ptr %x
3482  %red = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %v)
3483  ret i8 %red
3484}
3485
3486declare i8 @llvm.vector.reduce.smin.v3i8(<3 x i8>)
3487
3488define i8 @vreduce_smin_v3i8(ptr %x) {
3489; CHECK-LABEL: vreduce_smin_v3i8:
3490; CHECK:       # %bb.0:
3491; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
3492; CHECK-NEXT:    vle8.v v8, (a0)
3493; CHECK-NEXT:    li a0, 127
3494; CHECK-NEXT:    vmv.s.x v9, a0
3495; CHECK-NEXT:    vredmin.vs v8, v8, v9
3496; CHECK-NEXT:    vmv.x.s a0, v8
3497; CHECK-NEXT:    ret
3498  %v = load <3 x i8>, ptr %x
3499  %red = call i8 @llvm.vector.reduce.smin.v3i8(<3 x i8> %v)
3500  ret i8 %red
3501}
3502
3503declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>)
3504
3505define i8 @vreduce_smin_v4i8(ptr %x) {
3506; CHECK-LABEL: vreduce_smin_v4i8:
3507; CHECK:       # %bb.0:
3508; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
3509; CHECK-NEXT:    vle8.v v8, (a0)
3510; CHECK-NEXT:    vredmin.vs v8, v8, v8
3511; CHECK-NEXT:    vmv.x.s a0, v8
3512; CHECK-NEXT:    ret
3513  %v = load <4 x i8>, ptr %x
3514  %red = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v)
3515  ret i8 %red
3516}
3517
3518declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
3519
3520define i8 @vreduce_smin_v8i8(ptr %x) {
3521; CHECK-LABEL: vreduce_smin_v8i8:
3522; CHECK:       # %bb.0:
3523; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
3524; CHECK-NEXT:    vle8.v v8, (a0)
3525; CHECK-NEXT:    vredmin.vs v8, v8, v8
3526; CHECK-NEXT:    vmv.x.s a0, v8
3527; CHECK-NEXT:    ret
3528  %v = load <8 x i8>, ptr %x
3529  %red = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
3530  ret i8 %red
3531}
3532
3533declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
3534
3535define i8 @vreduce_smin_v16i8(ptr %x) {
3536; CHECK-LABEL: vreduce_smin_v16i8:
3537; CHECK:       # %bb.0:
3538; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
3539; CHECK-NEXT:    vle8.v v8, (a0)
3540; CHECK-NEXT:    vredmin.vs v8, v8, v8
3541; CHECK-NEXT:    vmv.x.s a0, v8
3542; CHECK-NEXT:    ret
3543  %v = load <16 x i8>, ptr %x
3544  %red = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
3545  ret i8 %red
3546}
3547
3548declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
3549
3550define i8 @vreduce_smin_v32i8(ptr %x) {
3551; CHECK-LABEL: vreduce_smin_v32i8:
3552; CHECK:       # %bb.0:
3553; CHECK-NEXT:    li a1, 32
3554; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
3555; CHECK-NEXT:    vle8.v v8, (a0)
3556; CHECK-NEXT:    vredmin.vs v8, v8, v8
3557; CHECK-NEXT:    vmv.x.s a0, v8
3558; CHECK-NEXT:    ret
3559  %v = load <32 x i8>, ptr %x
3560  %red = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %v)
3561  ret i8 %red
3562}
3563
3564declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
3565
3566define i8 @vreduce_smin_v64i8(ptr %x) {
3567; CHECK-LABEL: vreduce_smin_v64i8:
3568; CHECK:       # %bb.0:
3569; CHECK-NEXT:    li a1, 64
3570; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
3571; CHECK-NEXT:    vle8.v v8, (a0)
3572; CHECK-NEXT:    vredmin.vs v8, v8, v8
3573; CHECK-NEXT:    vmv.x.s a0, v8
3574; CHECK-NEXT:    ret
3575  %v = load <64 x i8>, ptr %x
3576  %red = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %v)
3577  ret i8 %red
3578}
3579
3580declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
3581
3582define i8 @vreduce_smin_v128i8(ptr %x) {
3583; CHECK-LABEL: vreduce_smin_v128i8:
3584; CHECK:       # %bb.0:
3585; CHECK-NEXT:    li a1, 128
3586; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
3587; CHECK-NEXT:    vle8.v v8, (a0)
3588; CHECK-NEXT:    vredmin.vs v8, v8, v8
3589; CHECK-NEXT:    vmv.x.s a0, v8
3590; CHECK-NEXT:    ret
3591  %v = load <128 x i8>, ptr %x
3592  %red = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %v)
3593  ret i8 %red
3594}
3595
3596declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)
3597
3598define i8 @vreduce_smin_v256i8(ptr %x) {
3599; CHECK-LABEL: vreduce_smin_v256i8:
3600; CHECK:       # %bb.0:
3601; CHECK-NEXT:    li a1, 128
3602; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
3603; CHECK-NEXT:    vle8.v v8, (a0)
3604; CHECK-NEXT:    addi a0, a0, 128
3605; CHECK-NEXT:    vle8.v v16, (a0)
3606; CHECK-NEXT:    vmin.vv v8, v8, v16
3607; CHECK-NEXT:    vredmin.vs v8, v8, v8
3608; CHECK-NEXT:    vmv.x.s a0, v8
3609; CHECK-NEXT:    ret
3610  %v = load <256 x i8>, ptr %x
3611  %red = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %v)
3612  ret i8 %red
3613}
3614
3615declare i16 @llvm.vector.reduce.smin.v1i16(<1 x i16>)
3616
3617define i16 @vreduce_smin_v1i16(<1 x i16> %v) {
3618; CHECK-LABEL: vreduce_smin_v1i16:
3619; CHECK:       # %bb.0:
3620; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
3621; CHECK-NEXT:    vmv.x.s a0, v8
3622; CHECK-NEXT:    ret
3623  %red = call i16 @llvm.vector.reduce.smin.v1i16(<1 x i16> %v)
3624  ret i16 %red
3625}
3626
3627declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>)
3628
3629define i16 @vreduce_smin_v2i16(ptr %x) {
3630; CHECK-LABEL: vreduce_smin_v2i16:
3631; CHECK:       # %bb.0:
3632; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
3633; CHECK-NEXT:    vle16.v v8, (a0)
3634; CHECK-NEXT:    vredmin.vs v8, v8, v8
3635; CHECK-NEXT:    vmv.x.s a0, v8
3636; CHECK-NEXT:    ret
3637  %v = load <2 x i16>, ptr %x
3638  %red = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %v)
3639  ret i16 %red
3640}
3641
3642declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
3643
3644define i16 @vreduce_smin_v4i16(ptr %x) {
3645; CHECK-LABEL: vreduce_smin_v4i16:
3646; CHECK:       # %bb.0:
3647; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
3648; CHECK-NEXT:    vle16.v v8, (a0)
3649; CHECK-NEXT:    vredmin.vs v8, v8, v8
3650; CHECK-NEXT:    vmv.x.s a0, v8
3651; CHECK-NEXT:    ret
3652  %v = load <4 x i16>, ptr %x
3653  %red = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
3654  ret i16 %red
3655}
3656
3657declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
3658
3659define i16 @vreduce_smin_v8i16(ptr %x) {
3660; CHECK-LABEL: vreduce_smin_v8i16:
3661; CHECK:       # %bb.0:
3662; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
3663; CHECK-NEXT:    vle16.v v8, (a0)
3664; CHECK-NEXT:    vredmin.vs v8, v8, v8
3665; CHECK-NEXT:    vmv.x.s a0, v8
3666; CHECK-NEXT:    ret
3667  %v = load <8 x i16>, ptr %x
3668  %red = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
3669  ret i16 %red
3670}
3671
3672declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
3673
3674define i16 @vreduce_smin_v16i16(ptr %x) {
3675; CHECK-LABEL: vreduce_smin_v16i16:
3676; CHECK:       # %bb.0:
3677; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
3678; CHECK-NEXT:    vle16.v v8, (a0)
3679; CHECK-NEXT:    vredmin.vs v8, v8, v8
3680; CHECK-NEXT:    vmv.x.s a0, v8
3681; CHECK-NEXT:    ret
3682  %v = load <16 x i16>, ptr %x
3683  %red = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %v)
3684  ret i16 %red
3685}
3686
3687declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
3688
3689define i16 @vreduce_smin_v32i16(ptr %x) {
3690; CHECK-LABEL: vreduce_smin_v32i16:
3691; CHECK:       # %bb.0:
3692; CHECK-NEXT:    li a1, 32
3693; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
3694; CHECK-NEXT:    vle16.v v8, (a0)
3695; CHECK-NEXT:    vredmin.vs v8, v8, v8
3696; CHECK-NEXT:    vmv.x.s a0, v8
3697; CHECK-NEXT:    ret
3698  %v = load <32 x i16>, ptr %x
3699  %red = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %v)
3700  ret i16 %red
3701}
3702
3703declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
3704
3705define i16 @vreduce_smin_v64i16(ptr %x) {
3706; CHECK-LABEL: vreduce_smin_v64i16:
3707; CHECK:       # %bb.0:
3708; CHECK-NEXT:    li a1, 64
3709; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
3710; CHECK-NEXT:    vle16.v v8, (a0)
3711; CHECK-NEXT:    vredmin.vs v8, v8, v8
3712; CHECK-NEXT:    vmv.x.s a0, v8
3713; CHECK-NEXT:    ret
3714  %v = load <64 x i16>, ptr %x
3715  %red = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %v)
3716  ret i16 %red
3717}
3718
3719declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)
3720
3721define i16 @vreduce_smin_v128i16(ptr %x) {
3722; CHECK-LABEL: vreduce_smin_v128i16:
3723; CHECK:       # %bb.0:
3724; CHECK-NEXT:    li a1, 64
3725; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
3726; CHECK-NEXT:    vle16.v v8, (a0)
3727; CHECK-NEXT:    addi a0, a0, 128
3728; CHECK-NEXT:    vle16.v v16, (a0)
3729; CHECK-NEXT:    vmin.vv v8, v8, v16
3730; CHECK-NEXT:    vredmin.vs v8, v8, v8
3731; CHECK-NEXT:    vmv.x.s a0, v8
3732; CHECK-NEXT:    ret
3733  %v = load <128 x i16>, ptr %x
3734  %red = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %v)
3735  ret i16 %red
3736}
3737
3738declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32>)
3739
3740define i32 @vreduce_smin_v1i32(<1 x i32> %v) {
3741; CHECK-LABEL: vreduce_smin_v1i32:
3742; CHECK:       # %bb.0:
3743; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
3744; CHECK-NEXT:    vmv.x.s a0, v8
3745; CHECK-NEXT:    ret
3746  %red = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %v)
3747  ret i32 %red
3748}
3749
3750declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
3751
3752define i32 @vreduce_smin_v2i32(ptr %x) {
3753; CHECK-LABEL: vreduce_smin_v2i32:
3754; CHECK:       # %bb.0:
3755; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
3756; CHECK-NEXT:    vle32.v v8, (a0)
3757; CHECK-NEXT:    vredmin.vs v8, v8, v8
3758; CHECK-NEXT:    vmv.x.s a0, v8
3759; CHECK-NEXT:    ret
3760  %v = load <2 x i32>, ptr %x
3761  %red = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v)
3762  ret i32 %red
3763}
3764
3765declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
3766
3767define i32 @vreduce_smin_v4i32(ptr %x) {
3768; CHECK-LABEL: vreduce_smin_v4i32:
3769; CHECK:       # %bb.0:
3770; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
3771; CHECK-NEXT:    vle32.v v8, (a0)
3772; CHECK-NEXT:    vredmin.vs v8, v8, v8
3773; CHECK-NEXT:    vmv.x.s a0, v8
3774; CHECK-NEXT:    ret
3775  %v = load <4 x i32>, ptr %x
3776  %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v)
3777  ret i32 %red
3778}
3779
3780declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
3781
3782define i32 @vreduce_smin_v8i32(ptr %x) {
3783; CHECK-LABEL: vreduce_smin_v8i32:
3784; CHECK:       # %bb.0:
3785; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
3786; CHECK-NEXT:    vle32.v v8, (a0)
3787; CHECK-NEXT:    vredmin.vs v8, v8, v8
3788; CHECK-NEXT:    vmv.x.s a0, v8
3789; CHECK-NEXT:    ret
3790  %v = load <8 x i32>, ptr %x
3791  %red = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v)
3792  ret i32 %red
3793}
3794
3795declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
3796
3797define i32 @vreduce_smin_v16i32(ptr %x) {
3798; CHECK-LABEL: vreduce_smin_v16i32:
3799; CHECK:       # %bb.0:
3800; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
3801; CHECK-NEXT:    vle32.v v8, (a0)
3802; CHECK-NEXT:    vredmin.vs v8, v8, v8
3803; CHECK-NEXT:    vmv.x.s a0, v8
3804; CHECK-NEXT:    ret
3805  %v = load <16 x i32>, ptr %x
3806  %red = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %v)
3807  ret i32 %red
3808}
3809
3810declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
3811
3812define i32 @vreduce_smin_v32i32(ptr %x) {
3813; CHECK-LABEL: vreduce_smin_v32i32:
3814; CHECK:       # %bb.0:
3815; CHECK-NEXT:    li a1, 32
3816; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
3817; CHECK-NEXT:    vle32.v v8, (a0)
3818; CHECK-NEXT:    vredmin.vs v8, v8, v8
3819; CHECK-NEXT:    vmv.x.s a0, v8
3820; CHECK-NEXT:    ret
3821  %v = load <32 x i32>, ptr %x
3822  %red = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %v)
3823  ret i32 %red
3824}
3825
3826declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)
3827
3828define i32 @vreduce_smin_v64i32(ptr %x) {
3829; CHECK-LABEL: vreduce_smin_v64i32:
3830; CHECK:       # %bb.0:
3831; CHECK-NEXT:    li a1, 32
3832; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
3833; CHECK-NEXT:    vle32.v v8, (a0)
3834; CHECK-NEXT:    addi a0, a0, 128
3835; CHECK-NEXT:    vle32.v v16, (a0)
3836; CHECK-NEXT:    vmin.vv v8, v8, v16
3837; CHECK-NEXT:    vredmin.vs v8, v8, v8
3838; CHECK-NEXT:    vmv.x.s a0, v8
3839; CHECK-NEXT:    ret
3840  %v = load <64 x i32>, ptr %x
3841  %red = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %v)
3842  ret i32 %red
3843}
3844
3845declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
3846
3847define i64 @vreduce_smin_v1i64(<1 x i64> %v) {
3848; RV32-LABEL: vreduce_smin_v1i64:
3849; RV32:       # %bb.0:
3850; RV32-NEXT:    li a0, 32
3851; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3852; RV32-NEXT:    vsrl.vx v9, v8, a0
3853; RV32-NEXT:    vmv.x.s a1, v9
3854; RV32-NEXT:    vmv.x.s a0, v8
3855; RV32-NEXT:    ret
3856;
3857; RV64-LABEL: vreduce_smin_v1i64:
3858; RV64:       # %bb.0:
3859; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3860; RV64-NEXT:    vmv.x.s a0, v8
3861; RV64-NEXT:    ret
3862  %red = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %v)
3863  ret i64 %red
3864}
3865
3866declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
3867
3868define i64 @vreduce_smin_v2i64(ptr %x) {
3869; RV32-LABEL: vreduce_smin_v2i64:
3870; RV32:       # %bb.0:
3871; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
3872; RV32-NEXT:    vle64.v v8, (a0)
3873; RV32-NEXT:    li a0, 32
3874; RV32-NEXT:    vredmin.vs v8, v8, v8
3875; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3876; RV32-NEXT:    vsrl.vx v9, v8, a0
3877; RV32-NEXT:    vmv.x.s a1, v9
3878; RV32-NEXT:    vmv.x.s a0, v8
3879; RV32-NEXT:    ret
3880;
3881; RV64-LABEL: vreduce_smin_v2i64:
3882; RV64:       # %bb.0:
3883; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
3884; RV64-NEXT:    vle64.v v8, (a0)
3885; RV64-NEXT:    vredmin.vs v8, v8, v8
3886; RV64-NEXT:    vmv.x.s a0, v8
3887; RV64-NEXT:    ret
3888  %v = load <2 x i64>, ptr %x
3889  %red = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v)
3890  ret i64 %red
3891}
3892
3893declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
3894
3895define i64 @vreduce_smin_v4i64(ptr %x) {
3896; RV32-LABEL: vreduce_smin_v4i64:
3897; RV32:       # %bb.0:
3898; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
3899; RV32-NEXT:    vle64.v v8, (a0)
3900; RV32-NEXT:    li a1, 32
3901; RV32-NEXT:    vredmin.vs v8, v8, v8
3902; RV32-NEXT:    vmv.x.s a0, v8
3903; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3904; RV32-NEXT:    vsrl.vx v8, v8, a1
3905; RV32-NEXT:    vmv.x.s a1, v8
3906; RV32-NEXT:    ret
3907;
3908; RV64-LABEL: vreduce_smin_v4i64:
3909; RV64:       # %bb.0:
3910; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
3911; RV64-NEXT:    vle64.v v8, (a0)
3912; RV64-NEXT:    vredmin.vs v8, v8, v8
3913; RV64-NEXT:    vmv.x.s a0, v8
3914; RV64-NEXT:    ret
3915  %v = load <4 x i64>, ptr %x
3916  %red = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
3917  ret i64 %red
3918}
3919
3920declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
3921
3922define i64 @vreduce_smin_v8i64(ptr %x) {
3923; RV32-LABEL: vreduce_smin_v8i64:
3924; RV32:       # %bb.0:
3925; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
3926; RV32-NEXT:    vle64.v v8, (a0)
3927; RV32-NEXT:    li a1, 32
3928; RV32-NEXT:    vredmin.vs v8, v8, v8
3929; RV32-NEXT:    vmv.x.s a0, v8
3930; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3931; RV32-NEXT:    vsrl.vx v8, v8, a1
3932; RV32-NEXT:    vmv.x.s a1, v8
3933; RV32-NEXT:    ret
3934;
3935; RV64-LABEL: vreduce_smin_v8i64:
3936; RV64:       # %bb.0:
3937; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
3938; RV64-NEXT:    vle64.v v8, (a0)
3939; RV64-NEXT:    vredmin.vs v8, v8, v8
3940; RV64-NEXT:    vmv.x.s a0, v8
3941; RV64-NEXT:    ret
3942  %v = load <8 x i64>, ptr %x
3943  %red = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %v)
3944  ret i64 %red
3945}
3946
3947declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
3948
3949define i64 @vreduce_smin_v16i64(ptr %x) {
3950; RV32-LABEL: vreduce_smin_v16i64:
3951; RV32:       # %bb.0:
3952; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3953; RV32-NEXT:    vle64.v v8, (a0)
3954; RV32-NEXT:    li a1, 32
3955; RV32-NEXT:    vredmin.vs v8, v8, v8
3956; RV32-NEXT:    vmv.x.s a0, v8
3957; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3958; RV32-NEXT:    vsrl.vx v8, v8, a1
3959; RV32-NEXT:    vmv.x.s a1, v8
3960; RV32-NEXT:    ret
3961;
3962; RV64-LABEL: vreduce_smin_v16i64:
3963; RV64:       # %bb.0:
3964; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3965; RV64-NEXT:    vle64.v v8, (a0)
3966; RV64-NEXT:    vredmin.vs v8, v8, v8
3967; RV64-NEXT:    vmv.x.s a0, v8
3968; RV64-NEXT:    ret
3969  %v = load <16 x i64>, ptr %x
3970  %red = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %v)
3971  ret i64 %red
3972}
3973
3974declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)
3975
3976define i64 @vreduce_smin_v32i64(ptr %x) {
3977; RV32-LABEL: vreduce_smin_v32i64:
3978; RV32:       # %bb.0:
3979; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3980; RV32-NEXT:    vle64.v v8, (a0)
3981; RV32-NEXT:    addi a0, a0, 128
3982; RV32-NEXT:    vle64.v v16, (a0)
3983; RV32-NEXT:    li a1, 32
3984; RV32-NEXT:    vmin.vv v8, v8, v16
3985; RV32-NEXT:    vredmin.vs v8, v8, v8
3986; RV32-NEXT:    vmv.x.s a0, v8
3987; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
3988; RV32-NEXT:    vsrl.vx v8, v8, a1
3989; RV32-NEXT:    vmv.x.s a1, v8
3990; RV32-NEXT:    ret
3991;
3992; RV64-LABEL: vreduce_smin_v32i64:
3993; RV64:       # %bb.0:
3994; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
3995; RV64-NEXT:    vle64.v v8, (a0)
3996; RV64-NEXT:    addi a0, a0, 128
3997; RV64-NEXT:    vle64.v v16, (a0)
3998; RV64-NEXT:    vmin.vv v8, v8, v16
3999; RV64-NEXT:    vredmin.vs v8, v8, v8
4000; RV64-NEXT:    vmv.x.s a0, v8
4001; RV64-NEXT:    ret
4002  %v = load <32 x i64>, ptr %x
4003  %red = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %v)
4004  ret i64 %red
4005}
4006
4007declare i64 @llvm.vector.reduce.smin.v64i64(<64 x i64>)
4008
4009define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
4010; RV32-LABEL: vreduce_smin_v64i64:
4011; RV32:       # %bb.0:
4012; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4013; RV32-NEXT:    vle64.v v8, (a0)
4014; RV32-NEXT:    addi a1, a0, 384
4015; RV32-NEXT:    vle64.v v16, (a1)
4016; RV32-NEXT:    addi a1, a0, 256
4017; RV32-NEXT:    addi a0, a0, 128
4018; RV32-NEXT:    vle64.v v0, (a0)
4019; RV32-NEXT:    vle64.v v24, (a1)
4020; RV32-NEXT:    li a1, 32
4021; RV32-NEXT:    vmin.vv v16, v0, v16
4022; RV32-NEXT:    vmin.vv v8, v8, v24
4023; RV32-NEXT:    vmin.vv v8, v8, v16
4024; RV32-NEXT:    vredmin.vs v8, v8, v8
4025; RV32-NEXT:    vmv.x.s a0, v8
4026; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4027; RV32-NEXT:    vsrl.vx v8, v8, a1
4028; RV32-NEXT:    vmv.x.s a1, v8
4029; RV32-NEXT:    ret
4030;
4031; RV64-LABEL: vreduce_smin_v64i64:
4032; RV64:       # %bb.0:
4033; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4034; RV64-NEXT:    vle64.v v8, (a0)
4035; RV64-NEXT:    addi a1, a0, 384
4036; RV64-NEXT:    vle64.v v16, (a1)
4037; RV64-NEXT:    addi a1, a0, 256
4038; RV64-NEXT:    addi a0, a0, 128
4039; RV64-NEXT:    vle64.v v24, (a0)
4040; RV64-NEXT:    vle64.v v0, (a1)
4041; RV64-NEXT:    vmin.vv v16, v24, v16
4042; RV64-NEXT:    vmin.vv v8, v8, v0
4043; RV64-NEXT:    vmin.vv v8, v8, v16
4044; RV64-NEXT:    vredmin.vs v8, v8, v8
4045; RV64-NEXT:    vmv.x.s a0, v8
4046; RV64-NEXT:    ret
4047  %v = load <64 x i64>, ptr %x
4048  %red = call i64 @llvm.vector.reduce.smin.v64i64(<64 x i64> %v)
4049  ret i64 %red
4050}
4051
4052declare i8 @llvm.vector.reduce.smax.v1i8(<1 x i8>)
4053
4054define i8 @vreduce_smax_v1i8(<1 x i8> %v) {
4055; CHECK-LABEL: vreduce_smax_v1i8:
4056; CHECK:       # %bb.0:
4057; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
4058; CHECK-NEXT:    vmv.x.s a0, v8
4059; CHECK-NEXT:    ret
4060  %red = call i8 @llvm.vector.reduce.smax.v1i8(<1 x i8> %v)
4061  ret i8 %red
4062}
4063
4064declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
4065
4066define i8 @vreduce_smax_v2i8(ptr %x) {
4067; CHECK-LABEL: vreduce_smax_v2i8:
4068; CHECK:       # %bb.0:
4069; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
4070; CHECK-NEXT:    vle8.v v8, (a0)
4071; CHECK-NEXT:    vredmax.vs v8, v8, v8
4072; CHECK-NEXT:    vmv.x.s a0, v8
4073; CHECK-NEXT:    ret
4074  %v = load <2 x i8>, ptr %x
4075  %red = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %v)
4076  ret i8 %red
4077}
4078
4079declare i8 @llvm.vector.reduce.smax.v3i8(<3 x i8>)
4080
4081define i8 @vreduce_smax_v3i8(ptr %x) {
4082; CHECK-LABEL: vreduce_smax_v3i8:
4083; CHECK:       # %bb.0:
4084; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
4085; CHECK-NEXT:    vle8.v v8, (a0)
4086; CHECK-NEXT:    li a0, -128
4087; CHECK-NEXT:    vmv.s.x v9, a0
4088; CHECK-NEXT:    vredmax.vs v8, v8, v9
4089; CHECK-NEXT:    vmv.x.s a0, v8
4090; CHECK-NEXT:    ret
4091  %v = load <3 x i8>, ptr %x
4092  %red = call i8 @llvm.vector.reduce.smax.v3i8(<3 x i8> %v)
4093  ret i8 %red
4094}
4095
4096declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
4097
4098define i8 @vreduce_smax_v4i8(ptr %x) {
4099; CHECK-LABEL: vreduce_smax_v4i8:
4100; CHECK:       # %bb.0:
4101; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
4102; CHECK-NEXT:    vle8.v v8, (a0)
4103; CHECK-NEXT:    vredmax.vs v8, v8, v8
4104; CHECK-NEXT:    vmv.x.s a0, v8
4105; CHECK-NEXT:    ret
4106  %v = load <4 x i8>, ptr %x
4107  %red = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v)
4108  ret i8 %red
4109}
4110
4111declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
4112
4113define i8 @vreduce_smax_v8i8(ptr %x) {
4114; CHECK-LABEL: vreduce_smax_v8i8:
4115; CHECK:       # %bb.0:
4116; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
4117; CHECK-NEXT:    vle8.v v8, (a0)
4118; CHECK-NEXT:    vredmax.vs v8, v8, v8
4119; CHECK-NEXT:    vmv.x.s a0, v8
4120; CHECK-NEXT:    ret
4121  %v = load <8 x i8>, ptr %x
4122  %red = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
4123  ret i8 %red
4124}
4125
4126declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
4127
4128define i8 @vreduce_smax_v16i8(ptr %x) {
4129; CHECK-LABEL: vreduce_smax_v16i8:
4130; CHECK:       # %bb.0:
4131; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
4132; CHECK-NEXT:    vle8.v v8, (a0)
4133; CHECK-NEXT:    vredmax.vs v8, v8, v8
4134; CHECK-NEXT:    vmv.x.s a0, v8
4135; CHECK-NEXT:    ret
4136  %v = load <16 x i8>, ptr %x
4137  %red = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
4138  ret i8 %red
4139}
4140
4141declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
4142
4143define i8 @vreduce_smax_v32i8(ptr %x) {
4144; CHECK-LABEL: vreduce_smax_v32i8:
4145; CHECK:       # %bb.0:
4146; CHECK-NEXT:    li a1, 32
4147; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
4148; CHECK-NEXT:    vle8.v v8, (a0)
4149; CHECK-NEXT:    vredmax.vs v8, v8, v8
4150; CHECK-NEXT:    vmv.x.s a0, v8
4151; CHECK-NEXT:    ret
4152  %v = load <32 x i8>, ptr %x
4153  %red = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %v)
4154  ret i8 %red
4155}
4156
4157declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
4158
4159define i8 @vreduce_smax_v64i8(ptr %x) {
4160; CHECK-LABEL: vreduce_smax_v64i8:
4161; CHECK:       # %bb.0:
4162; CHECK-NEXT:    li a1, 64
4163; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
4164; CHECK-NEXT:    vle8.v v8, (a0)
4165; CHECK-NEXT:    vredmax.vs v8, v8, v8
4166; CHECK-NEXT:    vmv.x.s a0, v8
4167; CHECK-NEXT:    ret
4168  %v = load <64 x i8>, ptr %x
4169  %red = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %v)
4170  ret i8 %red
4171}
4172
4173declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
4174
4175define i8 @vreduce_smax_v128i8(ptr %x) {
4176; CHECK-LABEL: vreduce_smax_v128i8:
4177; CHECK:       # %bb.0:
4178; CHECK-NEXT:    li a1, 128
4179; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
4180; CHECK-NEXT:    vle8.v v8, (a0)
4181; CHECK-NEXT:    vredmax.vs v8, v8, v8
4182; CHECK-NEXT:    vmv.x.s a0, v8
4183; CHECK-NEXT:    ret
4184  %v = load <128 x i8>, ptr %x
4185  %red = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %v)
4186  ret i8 %red
4187}
4188
4189declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)
4190
4191define i8 @vreduce_smax_v256i8(ptr %x) {
4192; CHECK-LABEL: vreduce_smax_v256i8:
4193; CHECK:       # %bb.0:
4194; CHECK-NEXT:    li a1, 128
4195; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
4196; CHECK-NEXT:    vle8.v v8, (a0)
4197; CHECK-NEXT:    addi a0, a0, 128
4198; CHECK-NEXT:    vle8.v v16, (a0)
4199; CHECK-NEXT:    vmax.vv v8, v8, v16
4200; CHECK-NEXT:    vredmax.vs v8, v8, v8
4201; CHECK-NEXT:    vmv.x.s a0, v8
4202; CHECK-NEXT:    ret
4203  %v = load <256 x i8>, ptr %x
4204  %red = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %v)
4205  ret i8 %red
4206}
4207
4208declare i16 @llvm.vector.reduce.smax.v1i16(<1 x i16>)
4209
4210define i16 @vreduce_smax_v1i16(<1 x i16> %v) {
4211; CHECK-LABEL: vreduce_smax_v1i16:
4212; CHECK:       # %bb.0:
4213; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
4214; CHECK-NEXT:    vmv.x.s a0, v8
4215; CHECK-NEXT:    ret
4216  %red = call i16 @llvm.vector.reduce.smax.v1i16(<1 x i16> %v)
4217  ret i16 %red
4218}
4219
4220declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
4221
4222define i16 @vreduce_smax_v2i16(ptr %x) {
4223; CHECK-LABEL: vreduce_smax_v2i16:
4224; CHECK:       # %bb.0:
4225; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
4226; CHECK-NEXT:    vle16.v v8, (a0)
4227; CHECK-NEXT:    vredmax.vs v8, v8, v8
4228; CHECK-NEXT:    vmv.x.s a0, v8
4229; CHECK-NEXT:    ret
4230  %v = load <2 x i16>, ptr %x
4231  %red = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %v)
4232  ret i16 %red
4233}
4234
4235declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
4236
4237define i16 @vreduce_smax_v4i16(ptr %x) {
4238; CHECK-LABEL: vreduce_smax_v4i16:
4239; CHECK:       # %bb.0:
4240; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
4241; CHECK-NEXT:    vle16.v v8, (a0)
4242; CHECK-NEXT:    vredmax.vs v8, v8, v8
4243; CHECK-NEXT:    vmv.x.s a0, v8
4244; CHECK-NEXT:    ret
4245  %v = load <4 x i16>, ptr %x
4246  %red = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
4247  ret i16 %red
4248}
4249
4250declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
4251
4252define i16 @vreduce_smax_v8i16(ptr %x) {
4253; CHECK-LABEL: vreduce_smax_v8i16:
4254; CHECK:       # %bb.0:
4255; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
4256; CHECK-NEXT:    vle16.v v8, (a0)
4257; CHECK-NEXT:    vredmax.vs v8, v8, v8
4258; CHECK-NEXT:    vmv.x.s a0, v8
4259; CHECK-NEXT:    ret
4260  %v = load <8 x i16>, ptr %x
4261  %red = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
4262  ret i16 %red
4263}
4264
4265declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
4266
4267define i16 @vreduce_smax_v16i16(ptr %x) {
4268; CHECK-LABEL: vreduce_smax_v16i16:
4269; CHECK:       # %bb.0:
4270; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
4271; CHECK-NEXT:    vle16.v v8, (a0)
4272; CHECK-NEXT:    vredmax.vs v8, v8, v8
4273; CHECK-NEXT:    vmv.x.s a0, v8
4274; CHECK-NEXT:    ret
4275  %v = load <16 x i16>, ptr %x
4276  %red = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %v)
4277  ret i16 %red
4278}
4279
4280declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
4281
4282define i16 @vreduce_smax_v32i16(ptr %x) {
4283; CHECK-LABEL: vreduce_smax_v32i16:
4284; CHECK:       # %bb.0:
4285; CHECK-NEXT:    li a1, 32
4286; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
4287; CHECK-NEXT:    vle16.v v8, (a0)
4288; CHECK-NEXT:    vredmax.vs v8, v8, v8
4289; CHECK-NEXT:    vmv.x.s a0, v8
4290; CHECK-NEXT:    ret
4291  %v = load <32 x i16>, ptr %x
4292  %red = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %v)
4293  ret i16 %red
4294}
4295
4296declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
4297
4298define i16 @vreduce_smax_v64i16(ptr %x) {
4299; CHECK-LABEL: vreduce_smax_v64i16:
4300; CHECK:       # %bb.0:
4301; CHECK-NEXT:    li a1, 64
4302; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
4303; CHECK-NEXT:    vle16.v v8, (a0)
4304; CHECK-NEXT:    vredmax.vs v8, v8, v8
4305; CHECK-NEXT:    vmv.x.s a0, v8
4306; CHECK-NEXT:    ret
4307  %v = load <64 x i16>, ptr %x
4308  %red = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %v)
4309  ret i16 %red
4310}
4311
4312declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)
4313
4314define i16 @vreduce_smax_v128i16(ptr %x) {
4315; CHECK-LABEL: vreduce_smax_v128i16:
4316; CHECK:       # %bb.0:
4317; CHECK-NEXT:    li a1, 64
4318; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
4319; CHECK-NEXT:    vle16.v v8, (a0)
4320; CHECK-NEXT:    addi a0, a0, 128
4321; CHECK-NEXT:    vle16.v v16, (a0)
4322; CHECK-NEXT:    vmax.vv v8, v8, v16
4323; CHECK-NEXT:    vredmax.vs v8, v8, v8
4324; CHECK-NEXT:    vmv.x.s a0, v8
4325; CHECK-NEXT:    ret
4326  %v = load <128 x i16>, ptr %x
4327  %red = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %v)
4328  ret i16 %red
4329}
4330
4331declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32>)
4332
4333define i32 @vreduce_smax_v1i32(<1 x i32> %v) {
4334; CHECK-LABEL: vreduce_smax_v1i32:
4335; CHECK:       # %bb.0:
4336; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
4337; CHECK-NEXT:    vmv.x.s a0, v8
4338; CHECK-NEXT:    ret
4339  %red = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %v)
4340  ret i32 %red
4341}
4342
4343declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
4344
4345define i32 @vreduce_smax_v2i32(ptr %x) {
4346; CHECK-LABEL: vreduce_smax_v2i32:
4347; CHECK:       # %bb.0:
4348; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
4349; CHECK-NEXT:    vle32.v v8, (a0)
4350; CHECK-NEXT:    vredmax.vs v8, v8, v8
4351; CHECK-NEXT:    vmv.x.s a0, v8
4352; CHECK-NEXT:    ret
4353  %v = load <2 x i32>, ptr %x
4354  %red = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v)
4355  ret i32 %red
4356}
4357
4358declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
4359
4360define i32 @vreduce_smax_v4i32(ptr %x) {
4361; CHECK-LABEL: vreduce_smax_v4i32:
4362; CHECK:       # %bb.0:
4363; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
4364; CHECK-NEXT:    vle32.v v8, (a0)
4365; CHECK-NEXT:    vredmax.vs v8, v8, v8
4366; CHECK-NEXT:    vmv.x.s a0, v8
4367; CHECK-NEXT:    ret
4368  %v = load <4 x i32>, ptr %x
4369  %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
4370  ret i32 %red
4371}
4372
4373declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
4374
4375define i32 @vreduce_smax_v8i32(ptr %x) {
4376; CHECK-LABEL: vreduce_smax_v8i32:
4377; CHECK:       # %bb.0:
4378; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
4379; CHECK-NEXT:    vle32.v v8, (a0)
4380; CHECK-NEXT:    vredmax.vs v8, v8, v8
4381; CHECK-NEXT:    vmv.x.s a0, v8
4382; CHECK-NEXT:    ret
4383  %v = load <8 x i32>, ptr %x
4384  %red = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v)
4385  ret i32 %red
4386}
4387
4388declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
4389
4390define i32 @vreduce_smax_v16i32(ptr %x) {
4391; CHECK-LABEL: vreduce_smax_v16i32:
4392; CHECK:       # %bb.0:
4393; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
4394; CHECK-NEXT:    vle32.v v8, (a0)
4395; CHECK-NEXT:    vredmax.vs v8, v8, v8
4396; CHECK-NEXT:    vmv.x.s a0, v8
4397; CHECK-NEXT:    ret
4398  %v = load <16 x i32>, ptr %x
4399  %red = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %v)
4400  ret i32 %red
4401}
4402
4403declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
4404
4405define i32 @vreduce_smax_v32i32(ptr %x) {
4406; CHECK-LABEL: vreduce_smax_v32i32:
4407; CHECK:       # %bb.0:
4408; CHECK-NEXT:    li a1, 32
4409; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
4410; CHECK-NEXT:    vle32.v v8, (a0)
4411; CHECK-NEXT:    vredmax.vs v8, v8, v8
4412; CHECK-NEXT:    vmv.x.s a0, v8
4413; CHECK-NEXT:    ret
4414  %v = load <32 x i32>, ptr %x
4415  %red = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %v)
4416  ret i32 %red
4417}
4418
4419declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)
4420
4421define i32 @vreduce_smax_v64i32(ptr %x) {
4422; CHECK-LABEL: vreduce_smax_v64i32:
4423; CHECK:       # %bb.0:
4424; CHECK-NEXT:    li a1, 32
4425; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
4426; CHECK-NEXT:    vle32.v v8, (a0)
4427; CHECK-NEXT:    addi a0, a0, 128
4428; CHECK-NEXT:    vle32.v v16, (a0)
4429; CHECK-NEXT:    vmax.vv v8, v8, v16
4430; CHECK-NEXT:    vredmax.vs v8, v8, v8
4431; CHECK-NEXT:    vmv.x.s a0, v8
4432; CHECK-NEXT:    ret
4433  %v = load <64 x i32>, ptr %x
4434  %red = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %v)
4435  ret i32 %red
4436}
4437
4438declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
4439
4440define i64 @vreduce_smax_v1i64(<1 x i64> %v) {
4441; RV32-LABEL: vreduce_smax_v1i64:
4442; RV32:       # %bb.0:
4443; RV32-NEXT:    li a0, 32
4444; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4445; RV32-NEXT:    vsrl.vx v9, v8, a0
4446; RV32-NEXT:    vmv.x.s a1, v9
4447; RV32-NEXT:    vmv.x.s a0, v8
4448; RV32-NEXT:    ret
4449;
4450; RV64-LABEL: vreduce_smax_v1i64:
4451; RV64:       # %bb.0:
4452; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4453; RV64-NEXT:    vmv.x.s a0, v8
4454; RV64-NEXT:    ret
4455  %red = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %v)
4456  ret i64 %red
4457}
4458
4459declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
4460
4461define i64 @vreduce_smax_v2i64(ptr %x) {
4462; RV32-LABEL: vreduce_smax_v2i64:
4463; RV32:       # %bb.0:
4464; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
4465; RV32-NEXT:    vle64.v v8, (a0)
4466; RV32-NEXT:    li a0, 32
4467; RV32-NEXT:    vredmax.vs v8, v8, v8
4468; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4469; RV32-NEXT:    vsrl.vx v9, v8, a0
4470; RV32-NEXT:    vmv.x.s a1, v9
4471; RV32-NEXT:    vmv.x.s a0, v8
4472; RV32-NEXT:    ret
4473;
4474; RV64-LABEL: vreduce_smax_v2i64:
4475; RV64:       # %bb.0:
4476; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
4477; RV64-NEXT:    vle64.v v8, (a0)
4478; RV64-NEXT:    vredmax.vs v8, v8, v8
4479; RV64-NEXT:    vmv.x.s a0, v8
4480; RV64-NEXT:    ret
4481  %v = load <2 x i64>, ptr %x
4482  %red = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v)
4483  ret i64 %red
4484}
4485
4486declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
4487
4488define i64 @vreduce_smax_v4i64(ptr %x) {
4489; RV32-LABEL: vreduce_smax_v4i64:
4490; RV32:       # %bb.0:
4491; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
4492; RV32-NEXT:    vle64.v v8, (a0)
4493; RV32-NEXT:    li a1, 32
4494; RV32-NEXT:    vredmax.vs v8, v8, v8
4495; RV32-NEXT:    vmv.x.s a0, v8
4496; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4497; RV32-NEXT:    vsrl.vx v8, v8, a1
4498; RV32-NEXT:    vmv.x.s a1, v8
4499; RV32-NEXT:    ret
4500;
4501; RV64-LABEL: vreduce_smax_v4i64:
4502; RV64:       # %bb.0:
4503; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
4504; RV64-NEXT:    vle64.v v8, (a0)
4505; RV64-NEXT:    vredmax.vs v8, v8, v8
4506; RV64-NEXT:    vmv.x.s a0, v8
4507; RV64-NEXT:    ret
4508  %v = load <4 x i64>, ptr %x
4509  %red = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
4510  ret i64 %red
4511}
4512
4513declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
4514
4515define i64 @vreduce_smax_v8i64(ptr %x) {
4516; RV32-LABEL: vreduce_smax_v8i64:
4517; RV32:       # %bb.0:
4518; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
4519; RV32-NEXT:    vle64.v v8, (a0)
4520; RV32-NEXT:    li a1, 32
4521; RV32-NEXT:    vredmax.vs v8, v8, v8
4522; RV32-NEXT:    vmv.x.s a0, v8
4523; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4524; RV32-NEXT:    vsrl.vx v8, v8, a1
4525; RV32-NEXT:    vmv.x.s a1, v8
4526; RV32-NEXT:    ret
4527;
4528; RV64-LABEL: vreduce_smax_v8i64:
4529; RV64:       # %bb.0:
4530; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
4531; RV64-NEXT:    vle64.v v8, (a0)
4532; RV64-NEXT:    vredmax.vs v8, v8, v8
4533; RV64-NEXT:    vmv.x.s a0, v8
4534; RV64-NEXT:    ret
4535  %v = load <8 x i64>, ptr %x
4536  %red = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %v)
4537  ret i64 %red
4538}
4539
4540declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
4541
4542define i64 @vreduce_smax_v16i64(ptr %x) {
4543; RV32-LABEL: vreduce_smax_v16i64:
4544; RV32:       # %bb.0:
4545; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4546; RV32-NEXT:    vle64.v v8, (a0)
4547; RV32-NEXT:    li a1, 32
4548; RV32-NEXT:    vredmax.vs v8, v8, v8
4549; RV32-NEXT:    vmv.x.s a0, v8
4550; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4551; RV32-NEXT:    vsrl.vx v8, v8, a1
4552; RV32-NEXT:    vmv.x.s a1, v8
4553; RV32-NEXT:    ret
4554;
4555; RV64-LABEL: vreduce_smax_v16i64:
4556; RV64:       # %bb.0:
4557; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4558; RV64-NEXT:    vle64.v v8, (a0)
4559; RV64-NEXT:    vredmax.vs v8, v8, v8
4560; RV64-NEXT:    vmv.x.s a0, v8
4561; RV64-NEXT:    ret
4562  %v = load <16 x i64>, ptr %x
4563  %red = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %v)
4564  ret i64 %red
4565}
4566
4567declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)
4568
4569define i64 @vreduce_smax_v32i64(ptr %x) {
4570; RV32-LABEL: vreduce_smax_v32i64:
4571; RV32:       # %bb.0:
4572; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4573; RV32-NEXT:    vle64.v v8, (a0)
4574; RV32-NEXT:    addi a0, a0, 128
4575; RV32-NEXT:    vle64.v v16, (a0)
4576; RV32-NEXT:    li a1, 32
4577; RV32-NEXT:    vmax.vv v8, v8, v16
4578; RV32-NEXT:    vredmax.vs v8, v8, v8
4579; RV32-NEXT:    vmv.x.s a0, v8
4580; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4581; RV32-NEXT:    vsrl.vx v8, v8, a1
4582; RV32-NEXT:    vmv.x.s a1, v8
4583; RV32-NEXT:    ret
4584;
4585; RV64-LABEL: vreduce_smax_v32i64:
4586; RV64:       # %bb.0:
4587; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4588; RV64-NEXT:    vle64.v v8, (a0)
4589; RV64-NEXT:    addi a0, a0, 128
4590; RV64-NEXT:    vle64.v v16, (a0)
4591; RV64-NEXT:    vmax.vv v8, v8, v16
4592; RV64-NEXT:    vredmax.vs v8, v8, v8
4593; RV64-NEXT:    vmv.x.s a0, v8
4594; RV64-NEXT:    ret
4595  %v = load <32 x i64>, ptr %x
4596  %red = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %v)
4597  ret i64 %red
4598}
4599
4600declare i64 @llvm.vector.reduce.smax.v64i64(<64 x i64>)
4601
4602define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
4603; RV32-LABEL: vreduce_smax_v64i64:
4604; RV32:       # %bb.0:
4605; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4606; RV32-NEXT:    vle64.v v8, (a0)
4607; RV32-NEXT:    addi a1, a0, 384
4608; RV32-NEXT:    vle64.v v16, (a1)
4609; RV32-NEXT:    addi a1, a0, 256
4610; RV32-NEXT:    addi a0, a0, 128
4611; RV32-NEXT:    vle64.v v0, (a0)
4612; RV32-NEXT:    vle64.v v24, (a1)
4613; RV32-NEXT:    li a1, 32
4614; RV32-NEXT:    vmax.vv v16, v0, v16
4615; RV32-NEXT:    vmax.vv v8, v8, v24
4616; RV32-NEXT:    vmax.vv v8, v8, v16
4617; RV32-NEXT:    vredmax.vs v8, v8, v8
4618; RV32-NEXT:    vmv.x.s a0, v8
4619; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
4620; RV32-NEXT:    vsrl.vx v8, v8, a1
4621; RV32-NEXT:    vmv.x.s a1, v8
4622; RV32-NEXT:    ret
4623;
4624; RV64-LABEL: vreduce_smax_v64i64:
4625; RV64:       # %bb.0:
4626; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
4627; RV64-NEXT:    vle64.v v8, (a0)
4628; RV64-NEXT:    addi a1, a0, 384
4629; RV64-NEXT:    vle64.v v16, (a1)
4630; RV64-NEXT:    addi a1, a0, 256
4631; RV64-NEXT:    addi a0, a0, 128
4632; RV64-NEXT:    vle64.v v24, (a0)
4633; RV64-NEXT:    vle64.v v0, (a1)
4634; RV64-NEXT:    vmax.vv v16, v24, v16
4635; RV64-NEXT:    vmax.vv v8, v8, v0
4636; RV64-NEXT:    vmax.vv v8, v8, v16
4637; RV64-NEXT:    vredmax.vs v8, v8, v8
4638; RV64-NEXT:    vmv.x.s a0, v8
4639; RV64-NEXT:    ret
4640  %v = load <64 x i64>, ptr %x
4641  %red = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> %v)
4642  ret i64 %red
4643}
4644
4645declare i8 @llvm.vector.reduce.umin.v1i8(<1 x i8>)
4646
4647define i8 @vreduce_umin_v1i8(<1 x i8> %v) {
4648; CHECK-LABEL: vreduce_umin_v1i8:
4649; CHECK:       # %bb.0:
4650; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
4651; CHECK-NEXT:    vmv.x.s a0, v8
4652; CHECK-NEXT:    ret
4653  %red = call i8 @llvm.vector.reduce.umin.v1i8(<1 x i8> %v)
4654  ret i8 %red
4655}
4656
4657declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>)
4658
4659define i8 @vreduce_umin_v2i8(ptr %x) {
4660; CHECK-LABEL: vreduce_umin_v2i8:
4661; CHECK:       # %bb.0:
4662; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
4663; CHECK-NEXT:    vle8.v v8, (a0)
4664; CHECK-NEXT:    vredminu.vs v8, v8, v8
4665; CHECK-NEXT:    vmv.x.s a0, v8
4666; CHECK-NEXT:    ret
4667  %v = load <2 x i8>, ptr %x
4668  %red = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %v)
4669  ret i8 %red
4670}
4671
4672declare i8 @llvm.vector.reduce.umin.v3i8(<3 x i8>)
4673
4674define i8 @vreduce_umin_v3i8(ptr %x) {
4675; CHECK-LABEL: vreduce_umin_v3i8:
4676; CHECK:       # %bb.0:
4677; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
4678; CHECK-NEXT:    vle8.v v8, (a0)
4679; CHECK-NEXT:    li a0, -1
4680; CHECK-NEXT:    vmv.s.x v9, a0
4681; CHECK-NEXT:    vredminu.vs v8, v8, v9
4682; CHECK-NEXT:    vmv.x.s a0, v8
4683; CHECK-NEXT:    ret
4684  %v = load <3 x i8>, ptr %x
4685  %red = call i8 @llvm.vector.reduce.umin.v3i8(<3 x i8> %v)
4686  ret i8 %red
4687}
4688
4689declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>)
4690
4691define i8 @vreduce_umin_v4i8(ptr %x) {
4692; CHECK-LABEL: vreduce_umin_v4i8:
4693; CHECK:       # %bb.0:
4694; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
4695; CHECK-NEXT:    vle8.v v8, (a0)
4696; CHECK-NEXT:    vredminu.vs v8, v8, v8
4697; CHECK-NEXT:    vmv.x.s a0, v8
4698; CHECK-NEXT:    ret
4699  %v = load <4 x i8>, ptr %x
4700  %red = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %v)
4701  ret i8 %red
4702}
4703
4704declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
4705
4706define i8 @vreduce_umin_v8i8(ptr %x) {
4707; CHECK-LABEL: vreduce_umin_v8i8:
4708; CHECK:       # %bb.0:
4709; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
4710; CHECK-NEXT:    vle8.v v8, (a0)
4711; CHECK-NEXT:    vredminu.vs v8, v8, v8
4712; CHECK-NEXT:    vmv.x.s a0, v8
4713; CHECK-NEXT:    ret
4714  %v = load <8 x i8>, ptr %x
4715  %red = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
4716  ret i8 %red
4717}
4718
4719declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
4720
4721define i8 @vreduce_umin_v16i8(ptr %x) {
4722; CHECK-LABEL: vreduce_umin_v16i8:
4723; CHECK:       # %bb.0:
4724; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
4725; CHECK-NEXT:    vle8.v v8, (a0)
4726; CHECK-NEXT:    vredminu.vs v8, v8, v8
4727; CHECK-NEXT:    vmv.x.s a0, v8
4728; CHECK-NEXT:    ret
4729  %v = load <16 x i8>, ptr %x
4730  %red = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
4731  ret i8 %red
4732}
4733
4734declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
4735
4736define i8 @vreduce_umin_v32i8(ptr %x) {
4737; CHECK-LABEL: vreduce_umin_v32i8:
4738; CHECK:       # %bb.0:
4739; CHECK-NEXT:    li a1, 32
4740; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
4741; CHECK-NEXT:    vle8.v v8, (a0)
4742; CHECK-NEXT:    vredminu.vs v8, v8, v8
4743; CHECK-NEXT:    vmv.x.s a0, v8
4744; CHECK-NEXT:    ret
4745  %v = load <32 x i8>, ptr %x
4746  %red = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %v)
4747  ret i8 %red
4748}
4749
4750declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
4751
4752define i8 @vreduce_umin_v64i8(ptr %x) {
4753; CHECK-LABEL: vreduce_umin_v64i8:
4754; CHECK:       # %bb.0:
4755; CHECK-NEXT:    li a1, 64
4756; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
4757; CHECK-NEXT:    vle8.v v8, (a0)
4758; CHECK-NEXT:    vredminu.vs v8, v8, v8
4759; CHECK-NEXT:    vmv.x.s a0, v8
4760; CHECK-NEXT:    ret
4761  %v = load <64 x i8>, ptr %x
4762  %red = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %v)
4763  ret i8 %red
4764}
4765
4766declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
4767
4768define i8 @vreduce_umin_v128i8(ptr %x) {
4769; CHECK-LABEL: vreduce_umin_v128i8:
4770; CHECK:       # %bb.0:
4771; CHECK-NEXT:    li a1, 128
4772; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
4773; CHECK-NEXT:    vle8.v v8, (a0)
4774; CHECK-NEXT:    vredminu.vs v8, v8, v8
4775; CHECK-NEXT:    vmv.x.s a0, v8
4776; CHECK-NEXT:    ret
4777  %v = load <128 x i8>, ptr %x
4778  %red = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %v)
4779  ret i8 %red
4780}
4781
4782declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)
4783
4784define i8 @vreduce_umin_v256i8(ptr %x) {
4785; CHECK-LABEL: vreduce_umin_v256i8:
4786; CHECK:       # %bb.0:
4787; CHECK-NEXT:    li a1, 128
4788; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
4789; CHECK-NEXT:    vle8.v v8, (a0)
4790; CHECK-NEXT:    addi a0, a0, 128
4791; CHECK-NEXT:    vle8.v v16, (a0)
4792; CHECK-NEXT:    vminu.vv v8, v8, v16
4793; CHECK-NEXT:    vredminu.vs v8, v8, v8
4794; CHECK-NEXT:    vmv.x.s a0, v8
4795; CHECK-NEXT:    ret
4796  %v = load <256 x i8>, ptr %x
4797  %red = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %v)
4798  ret i8 %red
4799}
4800
4801declare i16 @llvm.vector.reduce.umin.v1i16(<1 x i16>)
4802
4803define i16 @vreduce_umin_v1i16(<1 x i16> %v) {
4804; CHECK-LABEL: vreduce_umin_v1i16:
4805; CHECK:       # %bb.0:
4806; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
4807; CHECK-NEXT:    vmv.x.s a0, v8
4808; CHECK-NEXT:    ret
4809  %red = call i16 @llvm.vector.reduce.umin.v1i16(<1 x i16> %v)
4810  ret i16 %red
4811}
4812
4813declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>)
4814
4815define i16 @vreduce_umin_v2i16(ptr %x) {
4816; CHECK-LABEL: vreduce_umin_v2i16:
4817; CHECK:       # %bb.0:
4818; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
4819; CHECK-NEXT:    vle16.v v8, (a0)
4820; CHECK-NEXT:    vredminu.vs v8, v8, v8
4821; CHECK-NEXT:    vmv.x.s a0, v8
4822; CHECK-NEXT:    ret
4823  %v = load <2 x i16>, ptr %x
4824  %red = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %v)
4825  ret i16 %red
4826}
4827
4828declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
4829
4830define i16 @vreduce_umin_v4i16(ptr %x) {
4831; CHECK-LABEL: vreduce_umin_v4i16:
4832; CHECK:       # %bb.0:
4833; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
4834; CHECK-NEXT:    vle16.v v8, (a0)
4835; CHECK-NEXT:    vredminu.vs v8, v8, v8
4836; CHECK-NEXT:    vmv.x.s a0, v8
4837; CHECK-NEXT:    ret
4838  %v = load <4 x i16>, ptr %x
4839  %red = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
4840  ret i16 %red
4841}
4842
4843declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
4844
4845define i16 @vreduce_umin_v8i16(ptr %x) {
4846; CHECK-LABEL: vreduce_umin_v8i16:
4847; CHECK:       # %bb.0:
4848; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
4849; CHECK-NEXT:    vle16.v v8, (a0)
4850; CHECK-NEXT:    vredminu.vs v8, v8, v8
4851; CHECK-NEXT:    vmv.x.s a0, v8
4852; CHECK-NEXT:    ret
4853  %v = load <8 x i16>, ptr %x
4854  %red = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
4855  ret i16 %red
4856}
4857
4858declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
4859
4860define i16 @vreduce_umin_v16i16(ptr %x) {
4861; CHECK-LABEL: vreduce_umin_v16i16:
4862; CHECK:       # %bb.0:
4863; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
4864; CHECK-NEXT:    vle16.v v8, (a0)
4865; CHECK-NEXT:    vredminu.vs v8, v8, v8
4866; CHECK-NEXT:    vmv.x.s a0, v8
4867; CHECK-NEXT:    ret
4868  %v = load <16 x i16>, ptr %x
4869  %red = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %v)
4870  ret i16 %red
4871}
4872
4873declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
4874
4875define i16 @vreduce_umin_v32i16(ptr %x) {
4876; CHECK-LABEL: vreduce_umin_v32i16:
4877; CHECK:       # %bb.0:
4878; CHECK-NEXT:    li a1, 32
4879; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
4880; CHECK-NEXT:    vle16.v v8, (a0)
4881; CHECK-NEXT:    vredminu.vs v8, v8, v8
4882; CHECK-NEXT:    vmv.x.s a0, v8
4883; CHECK-NEXT:    ret
4884  %v = load <32 x i16>, ptr %x
4885  %red = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %v)
4886  ret i16 %red
4887}
4888
4889declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
4890
4891define i16 @vreduce_umin_v64i16(ptr %x) {
4892; CHECK-LABEL: vreduce_umin_v64i16:
4893; CHECK:       # %bb.0:
4894; CHECK-NEXT:    li a1, 64
4895; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
4896; CHECK-NEXT:    vle16.v v8, (a0)
4897; CHECK-NEXT:    vredminu.vs v8, v8, v8
4898; CHECK-NEXT:    vmv.x.s a0, v8
4899; CHECK-NEXT:    ret
4900  %v = load <64 x i16>, ptr %x
4901  %red = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %v)
4902  ret i16 %red
4903}
4904
4905declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)
4906
4907define i16 @vreduce_umin_v128i16(ptr %x) {
4908; CHECK-LABEL: vreduce_umin_v128i16:
4909; CHECK:       # %bb.0:
4910; CHECK-NEXT:    li a1, 64
4911; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
4912; CHECK-NEXT:    vle16.v v8, (a0)
4913; CHECK-NEXT:    addi a0, a0, 128
4914; CHECK-NEXT:    vle16.v v16, (a0)
4915; CHECK-NEXT:    vminu.vv v8, v8, v16
4916; CHECK-NEXT:    vredminu.vs v8, v8, v8
4917; CHECK-NEXT:    vmv.x.s a0, v8
4918; CHECK-NEXT:    ret
4919  %v = load <128 x i16>, ptr %x
4920  %red = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %v)
4921  ret i16 %red
4922}
4923
4924declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32>)
4925
4926define i32 @vreduce_umin_v1i32(<1 x i32> %v) {
4927; CHECK-LABEL: vreduce_umin_v1i32:
4928; CHECK:       # %bb.0:
4929; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
4930; CHECK-NEXT:    vmv.x.s a0, v8
4931; CHECK-NEXT:    ret
4932  %red = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %v)
4933  ret i32 %red
4934}
4935
4936declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
4937
4938define i32 @vreduce_umin_v2i32(ptr %x) {
4939; CHECK-LABEL: vreduce_umin_v2i32:
4940; CHECK:       # %bb.0:
4941; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
4942; CHECK-NEXT:    vle32.v v8, (a0)
4943; CHECK-NEXT:    vredminu.vs v8, v8, v8
4944; CHECK-NEXT:    vmv.x.s a0, v8
4945; CHECK-NEXT:    ret
4946  %v = load <2 x i32>, ptr %x
4947  %red = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v)
4948  ret i32 %red
4949}
4950
4951declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
4952
4953define i32 @vreduce_umin_v4i32(ptr %x) {
4954; CHECK-LABEL: vreduce_umin_v4i32:
4955; CHECK:       # %bb.0:
4956; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
4957; CHECK-NEXT:    vle32.v v8, (a0)
4958; CHECK-NEXT:    vredminu.vs v8, v8, v8
4959; CHECK-NEXT:    vmv.x.s a0, v8
4960; CHECK-NEXT:    ret
4961  %v = load <4 x i32>, ptr %x
4962  %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v)
4963  ret i32 %red
4964}
4965
4966declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
4967
4968define i32 @vreduce_umin_v8i32(ptr %x) {
4969; CHECK-LABEL: vreduce_umin_v8i32:
4970; CHECK:       # %bb.0:
4971; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
4972; CHECK-NEXT:    vle32.v v8, (a0)
4973; CHECK-NEXT:    vredminu.vs v8, v8, v8
4974; CHECK-NEXT:    vmv.x.s a0, v8
4975; CHECK-NEXT:    ret
4976  %v = load <8 x i32>, ptr %x
4977  %red = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v)
4978  ret i32 %red
4979}
4980
4981declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
4982
4983define i32 @vreduce_umin_v16i32(ptr %x) {
4984; CHECK-LABEL: vreduce_umin_v16i32:
4985; CHECK:       # %bb.0:
4986; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
4987; CHECK-NEXT:    vle32.v v8, (a0)
4988; CHECK-NEXT:    vredminu.vs v8, v8, v8
4989; CHECK-NEXT:    vmv.x.s a0, v8
4990; CHECK-NEXT:    ret
4991  %v = load <16 x i32>, ptr %x
4992  %red = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %v)
4993  ret i32 %red
4994}
4995
4996declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
4997
4998define i32 @vreduce_umin_v32i32(ptr %x) {
4999; CHECK-LABEL: vreduce_umin_v32i32:
5000; CHECK:       # %bb.0:
5001; CHECK-NEXT:    li a1, 32
5002; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
5003; CHECK-NEXT:    vle32.v v8, (a0)
5004; CHECK-NEXT:    vredminu.vs v8, v8, v8
5005; CHECK-NEXT:    vmv.x.s a0, v8
5006; CHECK-NEXT:    ret
5007  %v = load <32 x i32>, ptr %x
5008  %red = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %v)
5009  ret i32 %red
5010}
5011
5012declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)
5013
5014define i32 @vreduce_umin_v64i32(ptr %x) {
5015; CHECK-LABEL: vreduce_umin_v64i32:
5016; CHECK:       # %bb.0:
5017; CHECK-NEXT:    li a1, 32
5018; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
5019; CHECK-NEXT:    vle32.v v8, (a0)
5020; CHECK-NEXT:    addi a0, a0, 128
5021; CHECK-NEXT:    vle32.v v16, (a0)
5022; CHECK-NEXT:    vminu.vv v8, v8, v16
5023; CHECK-NEXT:    vredminu.vs v8, v8, v8
5024; CHECK-NEXT:    vmv.x.s a0, v8
5025; CHECK-NEXT:    ret
5026  %v = load <64 x i32>, ptr %x
5027  %red = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %v)
5028  ret i32 %red
5029}
5030
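; On RV32 an i64 reduction result is returned in the a0/a1 register pair, so the
; RV32 checks for the i64 cases below read the low half with vmv.x.s and recover
; the high half by shifting the scalar element right by 32 (vsrl.vx) before a
; second vmv.x.s.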
5031declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
5032
5033define i64 @vreduce_umin_v1i64(<1 x i64> %v) {
5034; RV32-LABEL: vreduce_umin_v1i64:
5035; RV32:       # %bb.0:
5036; RV32-NEXT:    li a0, 32
5037; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5038; RV32-NEXT:    vsrl.vx v9, v8, a0
5039; RV32-NEXT:    vmv.x.s a1, v9
5040; RV32-NEXT:    vmv.x.s a0, v8
5041; RV32-NEXT:    ret
5042;
5043; RV64-LABEL: vreduce_umin_v1i64:
5044; RV64:       # %bb.0:
5045; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5046; RV64-NEXT:    vmv.x.s a0, v8
5047; RV64-NEXT:    ret
5048  %red = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %v)
5049  ret i64 %red
5050}
5051
5052declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
5053
5054define i64 @vreduce_umin_v2i64(ptr %x) {
5055; RV32-LABEL: vreduce_umin_v2i64:
5056; RV32:       # %bb.0:
5057; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
5058; RV32-NEXT:    vle64.v v8, (a0)
5059; RV32-NEXT:    li a0, 32
5060; RV32-NEXT:    vredminu.vs v8, v8, v8
5061; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5062; RV32-NEXT:    vsrl.vx v9, v8, a0
5063; RV32-NEXT:    vmv.x.s a1, v9
5064; RV32-NEXT:    vmv.x.s a0, v8
5065; RV32-NEXT:    ret
5066;
5067; RV64-LABEL: vreduce_umin_v2i64:
5068; RV64:       # %bb.0:
5069; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
5070; RV64-NEXT:    vle64.v v8, (a0)
5071; RV64-NEXT:    vredminu.vs v8, v8, v8
5072; RV64-NEXT:    vmv.x.s a0, v8
5073; RV64-NEXT:    ret
5074  %v = load <2 x i64>, ptr %x
5075  %red = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v)
5076  ret i64 %red
5077}
5078
5079declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
5080
5081define i64 @vreduce_umin_v4i64(ptr %x) {
5082; RV32-LABEL: vreduce_umin_v4i64:
5083; RV32:       # %bb.0:
5084; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
5085; RV32-NEXT:    vle64.v v8, (a0)
5086; RV32-NEXT:    li a1, 32
5087; RV32-NEXT:    vredminu.vs v8, v8, v8
5088; RV32-NEXT:    vmv.x.s a0, v8
5089; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5090; RV32-NEXT:    vsrl.vx v8, v8, a1
5091; RV32-NEXT:    vmv.x.s a1, v8
5092; RV32-NEXT:    ret
5093;
5094; RV64-LABEL: vreduce_umin_v4i64:
5095; RV64:       # %bb.0:
5096; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
5097; RV64-NEXT:    vle64.v v8, (a0)
5098; RV64-NEXT:    vredminu.vs v8, v8, v8
5099; RV64-NEXT:    vmv.x.s a0, v8
5100; RV64-NEXT:    ret
5101  %v = load <4 x i64>, ptr %x
5102  %red = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
5103  ret i64 %red
5104}
5105
5106declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
5107
5108define i64 @vreduce_umin_v8i64(ptr %x) {
5109; RV32-LABEL: vreduce_umin_v8i64:
5110; RV32:       # %bb.0:
5111; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
5112; RV32-NEXT:    vle64.v v8, (a0)
5113; RV32-NEXT:    li a1, 32
5114; RV32-NEXT:    vredminu.vs v8, v8, v8
5115; RV32-NEXT:    vmv.x.s a0, v8
5116; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5117; RV32-NEXT:    vsrl.vx v8, v8, a1
5118; RV32-NEXT:    vmv.x.s a1, v8
5119; RV32-NEXT:    ret
5120;
5121; RV64-LABEL: vreduce_umin_v8i64:
5122; RV64:       # %bb.0:
5123; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
5124; RV64-NEXT:    vle64.v v8, (a0)
5125; RV64-NEXT:    vredminu.vs v8, v8, v8
5126; RV64-NEXT:    vmv.x.s a0, v8
5127; RV64-NEXT:    ret
5128  %v = load <8 x i64>, ptr %x
5129  %red = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %v)
5130  ret i64 %red
5131}
5132
5133declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
5134
5135define i64 @vreduce_umin_v16i64(ptr %x) {
5136; RV32-LABEL: vreduce_umin_v16i64:
5137; RV32:       # %bb.0:
5138; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5139; RV32-NEXT:    vle64.v v8, (a0)
5140; RV32-NEXT:    li a1, 32
5141; RV32-NEXT:    vredminu.vs v8, v8, v8
5142; RV32-NEXT:    vmv.x.s a0, v8
5143; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5144; RV32-NEXT:    vsrl.vx v8, v8, a1
5145; RV32-NEXT:    vmv.x.s a1, v8
5146; RV32-NEXT:    ret
5147;
5148; RV64-LABEL: vreduce_umin_v16i64:
5149; RV64:       # %bb.0:
5150; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5151; RV64-NEXT:    vle64.v v8, (a0)
5152; RV64-NEXT:    vredminu.vs v8, v8, v8
5153; RV64-NEXT:    vmv.x.s a0, v8
5154; RV64-NEXT:    ret
5155  %v = load <16 x i64>, ptr %x
5156  %red = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %v)
5157  ret i64 %red
5158}
5159
5160declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)
5161
5162define i64 @vreduce_umin_v32i64(ptr %x) {
5163; RV32-LABEL: vreduce_umin_v32i64:
5164; RV32:       # %bb.0:
5165; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5166; RV32-NEXT:    vle64.v v8, (a0)
5167; RV32-NEXT:    addi a0, a0, 128
5168; RV32-NEXT:    vle64.v v16, (a0)
5169; RV32-NEXT:    li a1, 32
5170; RV32-NEXT:    vminu.vv v8, v8, v16
5171; RV32-NEXT:    vredminu.vs v8, v8, v8
5172; RV32-NEXT:    vmv.x.s a0, v8
5173; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5174; RV32-NEXT:    vsrl.vx v8, v8, a1
5175; RV32-NEXT:    vmv.x.s a1, v8
5176; RV32-NEXT:    ret
5177;
5178; RV64-LABEL: vreduce_umin_v32i64:
5179; RV64:       # %bb.0:
5180; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5181; RV64-NEXT:    vle64.v v8, (a0)
5182; RV64-NEXT:    addi a0, a0, 128
5183; RV64-NEXT:    vle64.v v16, (a0)
5184; RV64-NEXT:    vminu.vv v8, v8, v16
5185; RV64-NEXT:    vredminu.vs v8, v8, v8
5186; RV64-NEXT:    vmv.x.s a0, v8
5187; RV64-NEXT:    ret
5188  %v = load <32 x i64>, ptr %x
5189  %red = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %v)
5190  ret i64 %red
5191}
5192
5193declare i64 @llvm.vector.reduce.umin.v64i64(<64 x i64>)
5194
5195define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
5196; RV32-LABEL: vreduce_umin_v64i64:
5197; RV32:       # %bb.0:
5198; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5199; RV32-NEXT:    vle64.v v8, (a0)
5200; RV32-NEXT:    addi a1, a0, 384
5201; RV32-NEXT:    vle64.v v16, (a1)
5202; RV32-NEXT:    addi a1, a0, 256
5203; RV32-NEXT:    addi a0, a0, 128
5204; RV32-NEXT:    vle64.v v0, (a0)
5205; RV32-NEXT:    vle64.v v24, (a1)
5206; RV32-NEXT:    li a1, 32
5207; RV32-NEXT:    vminu.vv v16, v0, v16
5208; RV32-NEXT:    vminu.vv v8, v8, v24
5209; RV32-NEXT:    vminu.vv v8, v8, v16
5210; RV32-NEXT:    vredminu.vs v8, v8, v8
5211; RV32-NEXT:    vmv.x.s a0, v8
5212; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5213; RV32-NEXT:    vsrl.vx v8, v8, a1
5214; RV32-NEXT:    vmv.x.s a1, v8
5215; RV32-NEXT:    ret
5216;
5217; RV64-LABEL: vreduce_umin_v64i64:
5218; RV64:       # %bb.0:
5219; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5220; RV64-NEXT:    vle64.v v8, (a0)
5221; RV64-NEXT:    addi a1, a0, 384
5222; RV64-NEXT:    vle64.v v16, (a1)
5223; RV64-NEXT:    addi a1, a0, 256
5224; RV64-NEXT:    addi a0, a0, 128
5225; RV64-NEXT:    vle64.v v24, (a0)
5226; RV64-NEXT:    vle64.v v0, (a1)
5227; RV64-NEXT:    vminu.vv v16, v24, v16
5228; RV64-NEXT:    vminu.vv v8, v8, v0
5229; RV64-NEXT:    vminu.vv v8, v8, v16
5230; RV64-NEXT:    vredminu.vs v8, v8, v8
5231; RV64-NEXT:    vmv.x.s a0, v8
5232; RV64-NEXT:    ret
5233  %v = load <64 x i64>, ptr %x
5234  %red = call i64 @llvm.vector.reduce.umin.v64i64(<64 x i64> %v)
5235  ret i64 %red
5236}
5237
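; The unsigned max reductions mirror the umin tests above: vredmaxu.vs is seeded
; with the source vector itself (element 0 is folded in twice, which is harmless
; for max), while the non-power-of-two <3 x i8> case seeds the accumulator with
; zero, the identity value for umax.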
5238declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8>)
5239
5240define i8 @vreduce_umax_v1i8(<1 x i8> %v) {
5241; CHECK-LABEL: vreduce_umax_v1i8:
5242; CHECK:       # %bb.0:
5243; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
5244; CHECK-NEXT:    vmv.x.s a0, v8
5245; CHECK-NEXT:    ret
5246  %red = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %v)
5247  ret i8 %red
5248}
5249
5250declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>)
5251
5252define i8 @vreduce_umax_v2i8(ptr %x) {
5253; CHECK-LABEL: vreduce_umax_v2i8:
5254; CHECK:       # %bb.0:
5255; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
5256; CHECK-NEXT:    vle8.v v8, (a0)
5257; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5258; CHECK-NEXT:    vmv.x.s a0, v8
5259; CHECK-NEXT:    ret
5260  %v = load <2 x i8>, ptr %x
5261  %red = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %v)
5262  ret i8 %red
5263}
5264
5265declare i8 @llvm.vector.reduce.umax.v3i8(<3 x i8>)
5266
5267define i8 @vreduce_umax_v3i8(ptr %x) {
5268; CHECK-LABEL: vreduce_umax_v3i8:
5269; CHECK:       # %bb.0:
5270; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
5271; CHECK-NEXT:    vle8.v v8, (a0)
5272; CHECK-NEXT:    vmv.s.x v9, zero
5273; CHECK-NEXT:    vredmaxu.vs v8, v8, v9
5274; CHECK-NEXT:    vmv.x.s a0, v8
5275; CHECK-NEXT:    ret
5276  %v = load <3 x i8>, ptr %x
5277  %red = call i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %v)
5278  ret i8 %red
5279}
5280
5281declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
5282
5283define i8 @vreduce_umax_v4i8(ptr %x) {
5284; CHECK-LABEL: vreduce_umax_v4i8:
5285; CHECK:       # %bb.0:
5286; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
5287; CHECK-NEXT:    vle8.v v8, (a0)
5288; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5289; CHECK-NEXT:    vmv.x.s a0, v8
5290; CHECK-NEXT:    ret
5291  %v = load <4 x i8>, ptr %x
5292  %red = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %v)
5293  ret i8 %red
5294}
5295
5296declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
5297
5298define i8 @vreduce_umax_v8i8(ptr %x) {
5299; CHECK-LABEL: vreduce_umax_v8i8:
5300; CHECK:       # %bb.0:
5301; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
5302; CHECK-NEXT:    vle8.v v8, (a0)
5303; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5304; CHECK-NEXT:    vmv.x.s a0, v8
5305; CHECK-NEXT:    ret
5306  %v = load <8 x i8>, ptr %x
5307  %red = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
5308  ret i8 %red
5309}
5310
5311declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
5312
5313define i8 @vreduce_umax_v16i8(ptr %x) {
5314; CHECK-LABEL: vreduce_umax_v16i8:
5315; CHECK:       # %bb.0:
5316; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
5317; CHECK-NEXT:    vle8.v v8, (a0)
5318; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5319; CHECK-NEXT:    vmv.x.s a0, v8
5320; CHECK-NEXT:    ret
5321  %v = load <16 x i8>, ptr %x
5322  %red = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
5323  ret i8 %red
5324}
5325
5326declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
5327
5328define i8 @vreduce_umax_v32i8(ptr %x) {
5329; CHECK-LABEL: vreduce_umax_v32i8:
5330; CHECK:       # %bb.0:
5331; CHECK-NEXT:    li a1, 32
5332; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
5333; CHECK-NEXT:    vle8.v v8, (a0)
5334; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5335; CHECK-NEXT:    vmv.x.s a0, v8
5336; CHECK-NEXT:    ret
5337  %v = load <32 x i8>, ptr %x
5338  %red = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %v)
5339  ret i8 %red
5340}
5341
5342declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
5343
5344define i8 @vreduce_umax_v64i8(ptr %x) {
5345; CHECK-LABEL: vreduce_umax_v64i8:
5346; CHECK:       # %bb.0:
5347; CHECK-NEXT:    li a1, 64
5348; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
5349; CHECK-NEXT:    vle8.v v8, (a0)
5350; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5351; CHECK-NEXT:    vmv.x.s a0, v8
5352; CHECK-NEXT:    ret
5353  %v = load <64 x i8>, ptr %x
5354  %red = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %v)
5355  ret i8 %red
5356}
5357
5358declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
5359
5360define i8 @vreduce_umax_v128i8(ptr %x) {
5361; CHECK-LABEL: vreduce_umax_v128i8:
5362; CHECK:       # %bb.0:
5363; CHECK-NEXT:    li a1, 128
5364; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
5365; CHECK-NEXT:    vle8.v v8, (a0)
5366; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5367; CHECK-NEXT:    vmv.x.s a0, v8
5368; CHECK-NEXT:    ret
5369  %v = load <128 x i8>, ptr %x
5370  %red = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %v)
5371  ret i8 %red
5372}
5373
5374declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)
5375
5376define i8 @vreduce_umax_v256i8(ptr %x) {
5377; CHECK-LABEL: vreduce_umax_v256i8:
5378; CHECK:       # %bb.0:
5379; CHECK-NEXT:    li a1, 128
5380; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
5381; CHECK-NEXT:    vle8.v v8, (a0)
5382; CHECK-NEXT:    addi a0, a0, 128
5383; CHECK-NEXT:    vle8.v v16, (a0)
5384; CHECK-NEXT:    vmaxu.vv v8, v8, v16
5385; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5386; CHECK-NEXT:    vmv.x.s a0, v8
5387; CHECK-NEXT:    ret
5388  %v = load <256 x i8>, ptr %x
5389  %red = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %v)
5390  ret i8 %red
5391}
5392
5393declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16>)
5394
5395define i16 @vreduce_umax_v1i16(<1 x i16> %v) {
5396; CHECK-LABEL: vreduce_umax_v1i16:
5397; CHECK:       # %bb.0:
5398; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
5399; CHECK-NEXT:    vmv.x.s a0, v8
5400; CHECK-NEXT:    ret
5401  %red = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %v)
5402  ret i16 %red
5403}
5404
5405declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>)
5406
5407define i16 @vreduce_umax_v2i16(ptr %x) {
5408; CHECK-LABEL: vreduce_umax_v2i16:
5409; CHECK:       # %bb.0:
5410; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
5411; CHECK-NEXT:    vle16.v v8, (a0)
5412; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5413; CHECK-NEXT:    vmv.x.s a0, v8
5414; CHECK-NEXT:    ret
5415  %v = load <2 x i16>, ptr %x
5416  %red = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %v)
5417  ret i16 %red
5418}
5419
5420declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
5421
5422define i16 @vreduce_umax_v4i16(ptr %x) {
5423; CHECK-LABEL: vreduce_umax_v4i16:
5424; CHECK:       # %bb.0:
5425; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
5426; CHECK-NEXT:    vle16.v v8, (a0)
5427; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5428; CHECK-NEXT:    vmv.x.s a0, v8
5429; CHECK-NEXT:    ret
5430  %v = load <4 x i16>, ptr %x
5431  %red = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
5432  ret i16 %red
5433}
5434
5435declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
5436
5437define i16 @vreduce_umax_v8i16(ptr %x) {
5438; CHECK-LABEL: vreduce_umax_v8i16:
5439; CHECK:       # %bb.0:
5440; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
5441; CHECK-NEXT:    vle16.v v8, (a0)
5442; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5443; CHECK-NEXT:    vmv.x.s a0, v8
5444; CHECK-NEXT:    ret
5445  %v = load <8 x i16>, ptr %x
5446  %red = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
5447  ret i16 %red
5448}
5449
5450declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
5451
5452define i16 @vreduce_umax_v16i16(ptr %x) {
5453; CHECK-LABEL: vreduce_umax_v16i16:
5454; CHECK:       # %bb.0:
5455; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
5456; CHECK-NEXT:    vle16.v v8, (a0)
5457; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5458; CHECK-NEXT:    vmv.x.s a0, v8
5459; CHECK-NEXT:    ret
5460  %v = load <16 x i16>, ptr %x
5461  %red = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %v)
5462  ret i16 %red
5463}
5464
5465declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
5466
5467define i16 @vreduce_umax_v32i16(ptr %x) {
5468; CHECK-LABEL: vreduce_umax_v32i16:
5469; CHECK:       # %bb.0:
5470; CHECK-NEXT:    li a1, 32
5471; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
5472; CHECK-NEXT:    vle16.v v8, (a0)
5473; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5474; CHECK-NEXT:    vmv.x.s a0, v8
5475; CHECK-NEXT:    ret
5476  %v = load <32 x i16>, ptr %x
5477  %red = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %v)
5478  ret i16 %red
5479}
5480
5481declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
5482
5483define i16 @vreduce_umax_v64i16(ptr %x) {
5484; CHECK-LABEL: vreduce_umax_v64i16:
5485; CHECK:       # %bb.0:
5486; CHECK-NEXT:    li a1, 64
5487; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
5488; CHECK-NEXT:    vle16.v v8, (a0)
5489; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5490; CHECK-NEXT:    vmv.x.s a0, v8
5491; CHECK-NEXT:    ret
5492  %v = load <64 x i16>, ptr %x
5493  %red = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %v)
5494  ret i16 %red
5495}
5496
5497declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)
5498
5499define i16 @vreduce_umax_v128i16(ptr %x) {
5500; CHECK-LABEL: vreduce_umax_v128i16:
5501; CHECK:       # %bb.0:
5502; CHECK-NEXT:    li a1, 64
5503; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
5504; CHECK-NEXT:    vle16.v v8, (a0)
5505; CHECK-NEXT:    addi a0, a0, 128
5506; CHECK-NEXT:    vle16.v v16, (a0)
5507; CHECK-NEXT:    vmaxu.vv v8, v8, v16
5508; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5509; CHECK-NEXT:    vmv.x.s a0, v8
5510; CHECK-NEXT:    ret
5511  %v = load <128 x i16>, ptr %x
5512  %red = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %v)
5513  ret i16 %red
5514}
5515
5516declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32>)
5517
5518define i32 @vreduce_umax_v1i32(<1 x i32> %v) {
5519; CHECK-LABEL: vreduce_umax_v1i32:
5520; CHECK:       # %bb.0:
5521; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
5522; CHECK-NEXT:    vmv.x.s a0, v8
5523; CHECK-NEXT:    ret
5524  %red = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %v)
5525  ret i32 %red
5526}
5527
5528declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
5529
5530define i32 @vreduce_umax_v2i32(ptr %x) {
5531; CHECK-LABEL: vreduce_umax_v2i32:
5532; CHECK:       # %bb.0:
5533; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
5534; CHECK-NEXT:    vle32.v v8, (a0)
5535; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5536; CHECK-NEXT:    vmv.x.s a0, v8
5537; CHECK-NEXT:    ret
5538  %v = load <2 x i32>, ptr %x
5539  %red = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v)
5540  ret i32 %red
5541}
5542
5543declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
5544
5545define i32 @vreduce_umax_v4i32(ptr %x) {
5546; CHECK-LABEL: vreduce_umax_v4i32:
5547; CHECK:       # %bb.0:
5548; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
5549; CHECK-NEXT:    vle32.v v8, (a0)
5550; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5551; CHECK-NEXT:    vmv.x.s a0, v8
5552; CHECK-NEXT:    ret
5553  %v = load <4 x i32>, ptr %x
5554  %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
5555  ret i32 %red
5556}
5557
5558declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
5559
5560define i32 @vreduce_umax_v8i32(ptr %x) {
5561; CHECK-LABEL: vreduce_umax_v8i32:
5562; CHECK:       # %bb.0:
5563; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
5564; CHECK-NEXT:    vle32.v v8, (a0)
5565; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5566; CHECK-NEXT:    vmv.x.s a0, v8
5567; CHECK-NEXT:    ret
5568  %v = load <8 x i32>, ptr %x
5569  %red = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v)
5570  ret i32 %red
5571}
5572
5573declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
5574
5575define i32 @vreduce_umax_v16i32(ptr %x) {
5576; CHECK-LABEL: vreduce_umax_v16i32:
5577; CHECK:       # %bb.0:
5578; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
5579; CHECK-NEXT:    vle32.v v8, (a0)
5580; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5581; CHECK-NEXT:    vmv.x.s a0, v8
5582; CHECK-NEXT:    ret
5583  %v = load <16 x i32>, ptr %x
5584  %red = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %v)
5585  ret i32 %red
5586}
5587
5588declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
5589
5590define i32 @vreduce_umax_v32i32(ptr %x) {
5591; CHECK-LABEL: vreduce_umax_v32i32:
5592; CHECK:       # %bb.0:
5593; CHECK-NEXT:    li a1, 32
5594; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
5595; CHECK-NEXT:    vle32.v v8, (a0)
5596; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5597; CHECK-NEXT:    vmv.x.s a0, v8
5598; CHECK-NEXT:    ret
5599  %v = load <32 x i32>, ptr %x
5600  %red = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %v)
5601  ret i32 %red
5602}
5603
5604declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)
5605
5606define i32 @vreduce_umax_v64i32(ptr %x) {
5607; CHECK-LABEL: vreduce_umax_v64i32:
5608; CHECK:       # %bb.0:
5609; CHECK-NEXT:    li a1, 32
5610; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
5611; CHECK-NEXT:    vle32.v v8, (a0)
5612; CHECK-NEXT:    addi a0, a0, 128
5613; CHECK-NEXT:    vle32.v v16, (a0)
5614; CHECK-NEXT:    vmaxu.vv v8, v8, v16
5615; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
5616; CHECK-NEXT:    vmv.x.s a0, v8
5617; CHECK-NEXT:    ret
5618  %v = load <64 x i32>, ptr %x
5619  %red = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %v)
5620  ret i32 %red
5621}
5622
5623declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
5624
5625define i64 @vreduce_umax_v1i64(<1 x i64> %v) {
5626; RV32-LABEL: vreduce_umax_v1i64:
5627; RV32:       # %bb.0:
5628; RV32-NEXT:    li a0, 32
5629; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5630; RV32-NEXT:    vsrl.vx v9, v8, a0
5631; RV32-NEXT:    vmv.x.s a1, v9
5632; RV32-NEXT:    vmv.x.s a0, v8
5633; RV32-NEXT:    ret
5634;
5635; RV64-LABEL: vreduce_umax_v1i64:
5636; RV64:       # %bb.0:
5637; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5638; RV64-NEXT:    vmv.x.s a0, v8
5639; RV64-NEXT:    ret
5640  %red = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %v)
5641  ret i64 %red
5642}
5643
5644declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
5645
5646define i64 @vreduce_umax_v2i64(ptr %x) {
5647; RV32-LABEL: vreduce_umax_v2i64:
5648; RV32:       # %bb.0:
5649; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
5650; RV32-NEXT:    vle64.v v8, (a0)
5651; RV32-NEXT:    li a0, 32
5652; RV32-NEXT:    vredmaxu.vs v8, v8, v8
5653; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5654; RV32-NEXT:    vsrl.vx v9, v8, a0
5655; RV32-NEXT:    vmv.x.s a1, v9
5656; RV32-NEXT:    vmv.x.s a0, v8
5657; RV32-NEXT:    ret
5658;
5659; RV64-LABEL: vreduce_umax_v2i64:
5660; RV64:       # %bb.0:
5661; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
5662; RV64-NEXT:    vle64.v v8, (a0)
5663; RV64-NEXT:    vredmaxu.vs v8, v8, v8
5664; RV64-NEXT:    vmv.x.s a0, v8
5665; RV64-NEXT:    ret
5666  %v = load <2 x i64>, ptr %x
5667  %red = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v)
5668  ret i64 %red
5669}
5670
5671declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
5672
5673define i64 @vreduce_umax_v4i64(ptr %x) {
5674; RV32-LABEL: vreduce_umax_v4i64:
5675; RV32:       # %bb.0:
5676; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
5677; RV32-NEXT:    vle64.v v8, (a0)
5678; RV32-NEXT:    li a1, 32
5679; RV32-NEXT:    vredmaxu.vs v8, v8, v8
5680; RV32-NEXT:    vmv.x.s a0, v8
5681; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5682; RV32-NEXT:    vsrl.vx v8, v8, a1
5683; RV32-NEXT:    vmv.x.s a1, v8
5684; RV32-NEXT:    ret
5685;
5686; RV64-LABEL: vreduce_umax_v4i64:
5687; RV64:       # %bb.0:
5688; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
5689; RV64-NEXT:    vle64.v v8, (a0)
5690; RV64-NEXT:    vredmaxu.vs v8, v8, v8
5691; RV64-NEXT:    vmv.x.s a0, v8
5692; RV64-NEXT:    ret
5693  %v = load <4 x i64>, ptr %x
5694  %red = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
5695  ret i64 %red
5696}
5697
5698declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
5699
5700define i64 @vreduce_umax_v8i64(ptr %x) {
5701; RV32-LABEL: vreduce_umax_v8i64:
5702; RV32:       # %bb.0:
5703; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
5704; RV32-NEXT:    vle64.v v8, (a0)
5705; RV32-NEXT:    li a1, 32
5706; RV32-NEXT:    vredmaxu.vs v8, v8, v8
5707; RV32-NEXT:    vmv.x.s a0, v8
5708; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5709; RV32-NEXT:    vsrl.vx v8, v8, a1
5710; RV32-NEXT:    vmv.x.s a1, v8
5711; RV32-NEXT:    ret
5712;
5713; RV64-LABEL: vreduce_umax_v8i64:
5714; RV64:       # %bb.0:
5715; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
5716; RV64-NEXT:    vle64.v v8, (a0)
5717; RV64-NEXT:    vredmaxu.vs v8, v8, v8
5718; RV64-NEXT:    vmv.x.s a0, v8
5719; RV64-NEXT:    ret
5720  %v = load <8 x i64>, ptr %x
5721  %red = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %v)
5722  ret i64 %red
5723}
5724
5725declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
5726
5727define i64 @vreduce_umax_v16i64(ptr %x) {
5728; RV32-LABEL: vreduce_umax_v16i64:
5729; RV32:       # %bb.0:
5730; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5731; RV32-NEXT:    vle64.v v8, (a0)
5732; RV32-NEXT:    li a1, 32
5733; RV32-NEXT:    vredmaxu.vs v8, v8, v8
5734; RV32-NEXT:    vmv.x.s a0, v8
5735; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5736; RV32-NEXT:    vsrl.vx v8, v8, a1
5737; RV32-NEXT:    vmv.x.s a1, v8
5738; RV32-NEXT:    ret
5739;
5740; RV64-LABEL: vreduce_umax_v16i64:
5741; RV64:       # %bb.0:
5742; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5743; RV64-NEXT:    vle64.v v8, (a0)
5744; RV64-NEXT:    vredmaxu.vs v8, v8, v8
5745; RV64-NEXT:    vmv.x.s a0, v8
5746; RV64-NEXT:    ret
5747  %v = load <16 x i64>, ptr %x
5748  %red = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %v)
5749  ret i64 %red
5750}
5751
5752declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)
5753
5754define i64 @vreduce_umax_v32i64(ptr %x) {
5755; RV32-LABEL: vreduce_umax_v32i64:
5756; RV32:       # %bb.0:
5757; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5758; RV32-NEXT:    vle64.v v8, (a0)
5759; RV32-NEXT:    addi a0, a0, 128
5760; RV32-NEXT:    vle64.v v16, (a0)
5761; RV32-NEXT:    li a1, 32
5762; RV32-NEXT:    vmaxu.vv v8, v8, v16
5763; RV32-NEXT:    vredmaxu.vs v8, v8, v8
5764; RV32-NEXT:    vmv.x.s a0, v8
5765; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5766; RV32-NEXT:    vsrl.vx v8, v8, a1
5767; RV32-NEXT:    vmv.x.s a1, v8
5768; RV32-NEXT:    ret
5769;
5770; RV64-LABEL: vreduce_umax_v32i64:
5771; RV64:       # %bb.0:
5772; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5773; RV64-NEXT:    vle64.v v8, (a0)
5774; RV64-NEXT:    addi a0, a0, 128
5775; RV64-NEXT:    vle64.v v16, (a0)
5776; RV64-NEXT:    vmaxu.vv v8, v8, v16
5777; RV64-NEXT:    vredmaxu.vs v8, v8, v8
5778; RV64-NEXT:    vmv.x.s a0, v8
5779; RV64-NEXT:    ret
5780  %v = load <32 x i64>, ptr %x
5781  %red = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %v)
5782  ret i64 %red
5783}
5784
5785declare i64 @llvm.vector.reduce.umax.v64i64(<64 x i64>)
5786
5787define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
5788; RV32-LABEL: vreduce_umax_v64i64:
5789; RV32:       # %bb.0:
5790; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5791; RV32-NEXT:    vle64.v v8, (a0)
5792; RV32-NEXT:    addi a1, a0, 384
5793; RV32-NEXT:    vle64.v v16, (a1)
5794; RV32-NEXT:    addi a1, a0, 256
5795; RV32-NEXT:    addi a0, a0, 128
5796; RV32-NEXT:    vle64.v v0, (a0)
5797; RV32-NEXT:    vle64.v v24, (a1)
5798; RV32-NEXT:    li a1, 32
5799; RV32-NEXT:    vmaxu.vv v16, v0, v16
5800; RV32-NEXT:    vmaxu.vv v8, v8, v24
5801; RV32-NEXT:    vmaxu.vv v8, v8, v16
5802; RV32-NEXT:    vredmaxu.vs v8, v8, v8
5803; RV32-NEXT:    vmv.x.s a0, v8
5804; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
5805; RV32-NEXT:    vsrl.vx v8, v8, a1
5806; RV32-NEXT:    vmv.x.s a1, v8
5807; RV32-NEXT:    ret
5808;
5809; RV64-LABEL: vreduce_umax_v64i64:
5810; RV64:       # %bb.0:
5811; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
5812; RV64-NEXT:    vle64.v v8, (a0)
5813; RV64-NEXT:    addi a1, a0, 384
5814; RV64-NEXT:    vle64.v v16, (a1)
5815; RV64-NEXT:    addi a1, a0, 256
5816; RV64-NEXT:    addi a0, a0, 128
5817; RV64-NEXT:    vle64.v v24, (a0)
5818; RV64-NEXT:    vle64.v v0, (a1)
5819; RV64-NEXT:    vmaxu.vv v16, v24, v16
5820; RV64-NEXT:    vmaxu.vv v8, v8, v0
5821; RV64-NEXT:    vmaxu.vv v8, v8, v16
5822; RV64-NEXT:    vredmaxu.vs v8, v8, v8
5823; RV64-NEXT:    vmv.x.s a0, v8
5824; RV64-NEXT:    ret
5825  %v = load <64 x i64>, ptr %x
5826  %red = call i64 @llvm.vector.reduce.umax.v64i64(<64 x i64> %v)
5827  ret i64 %red
5828}
5829
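; There is no reduction-multiply instruction in the vector extension, so the
; multiply reductions below are expected to expand into a logarithmic
; shuffle-and-multiply tree (vslidedown/vrgather followed by vmul) rather than a
; single vred* operation.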
5830declare i8 @llvm.vector.reduce.mul.v1i8(<1 x i8>)
5831
5832define i8 @vreduce_mul_v1i8(<1 x i8> %v) {
5833; CHECK-LABEL: vreduce_mul_v1i8:
5834; CHECK:       # %bb.0:
5835; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
5836; CHECK-NEXT:    vmv.x.s a0, v8
5837; CHECK-NEXT:    ret
5838  %red = call i8 @llvm.vector.reduce.mul.v1i8(<1 x i8> %v)
5839  ret i8 %red
5840}
5841
5842declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
5843
5844define i8 @vreduce_mul_v2i8(ptr %x) {
5845; CHECK-LABEL: vreduce_mul_v2i8:
5846; CHECK:       # %bb.0:
5847; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
5848; CHECK-NEXT:    vle8.v v8, (a0)
5849; CHECK-NEXT:    lbu a0, 1(a0)
5850; CHECK-NEXT:    vmul.vx v8, v8, a0
5851; CHECK-NEXT:    vmv.x.s a0, v8
5852; CHECK-NEXT:    ret
5853  %v = load <2 x i8>, ptr %x
5854  %red = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %v)
5855  ret i8 %red
5856}
5857
5858declare i8 @llvm.vector.reduce.mul.v3i8(<3 x i8>)
5859
5860define i8 @vreduce_mul_v3i8(ptr %x) {
5861; CHECK-LABEL: vreduce_mul_v3i8:
5862; CHECK:       # %bb.0:
5863; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
5864; CHECK-NEXT:    vle8.v v8, (a0)
5865; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
5866; CHECK-NEXT:    vmv.v.i v9, 1
5867; CHECK-NEXT:    vslideup.vi v8, v9, 3
5868; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
5869; CHECK-NEXT:    vslidedown.vi v9, v8, 2
5870; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
5871; CHECK-NEXT:    vmul.vv v8, v8, v9
5872; CHECK-NEXT:    vslidedown.vi v9, v8, 1
5873; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
5874; CHECK-NEXT:    vmul.vv v8, v8, v9
5875; CHECK-NEXT:    vmv.x.s a0, v8
5876; CHECK-NEXT:    ret
5877  %v = load <3 x i8>, ptr %x
5878  %red = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> %v)
5879  ret i8 %red
5880}
5881
5882declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
5883
5884define i8 @vreduce_mul_v4i8(ptr %x) {
5885; CHECK-LABEL: vreduce_mul_v4i8:
5886; CHECK:       # %bb.0:
5887; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
5888; CHECK-NEXT:    vle8.v v8, (a0)
5889; CHECK-NEXT:    vslidedown.vi v9, v8, 2
5890; CHECK-NEXT:    vmul.vv v8, v8, v9
5891; CHECK-NEXT:    vrgather.vi v9, v8, 1
5892; CHECK-NEXT:    vmul.vv v8, v8, v9
5893; CHECK-NEXT:    vmv.x.s a0, v8
5894; CHECK-NEXT:    ret
5895  %v = load <4 x i8>, ptr %x
5896  %red = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %v)
5897  ret i8 %red
5898}
5899
5900declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
5901
5902define i8 @vreduce_mul_v8i8(ptr %x) {
5903; CHECK-LABEL: vreduce_mul_v8i8:
5904; CHECK:       # %bb.0:
5905; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
5906; CHECK-NEXT:    vle8.v v8, (a0)
5907; CHECK-NEXT:    vslidedown.vi v9, v8, 4
5908; CHECK-NEXT:    vmul.vv v8, v8, v9
5909; CHECK-NEXT:    vslidedown.vi v9, v8, 2
5910; CHECK-NEXT:    vmul.vv v8, v8, v9
5911; CHECK-NEXT:    vrgather.vi v9, v8, 1
5912; CHECK-NEXT:    vmul.vv v8, v8, v9
5913; CHECK-NEXT:    vmv.x.s a0, v8
5914; CHECK-NEXT:    ret
5915  %v = load <8 x i8>, ptr %x
5916  %red = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %v)
5917  ret i8 %red
5918}
5919
5920declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
5921
5922define i8 @vreduce_mul_v16i8(ptr %x) {
5923; CHECK-LABEL: vreduce_mul_v16i8:
5924; CHECK:       # %bb.0:
5925; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
5926; CHECK-NEXT:    vle8.v v8, (a0)
5927; CHECK-NEXT:    vslidedown.vi v9, v8, 8
5928; CHECK-NEXT:    vmul.vv v8, v8, v9
5929; CHECK-NEXT:    vslidedown.vi v9, v8, 4
5930; CHECK-NEXT:    vmul.vv v8, v8, v9
5931; CHECK-NEXT:    vslidedown.vi v9, v8, 2
5932; CHECK-NEXT:    vmul.vv v8, v8, v9
5933; CHECK-NEXT:    vrgather.vi v9, v8, 1
5934; CHECK-NEXT:    vmul.vv v8, v8, v9
5935; CHECK-NEXT:    vmv.x.s a0, v8
5936; CHECK-NEXT:    ret
5937  %v = load <16 x i8>, ptr %x
5938  %red = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %v)
5939  ret i8 %red
5940}
5941
5942declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)
5943
5944define i8 @vreduce_mul_v32i8(ptr %x) {
5945; CHECK-LABEL: vreduce_mul_v32i8:
5946; CHECK:       # %bb.0:
5947; CHECK-NEXT:    li a1, 32
5948; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
5949; CHECK-NEXT:    vle8.v v8, (a0)
5950; CHECK-NEXT:    vslidedown.vi v10, v8, 16
5951; CHECK-NEXT:    vmul.vv v8, v8, v10
5952; CHECK-NEXT:    vslidedown.vi v10, v8, 8
5953; CHECK-NEXT:    vmul.vv v8, v8, v10
5954; CHECK-NEXT:    vslidedown.vi v10, v8, 4
5955; CHECK-NEXT:    vmul.vv v8, v8, v10
5956; CHECK-NEXT:    vslidedown.vi v10, v8, 2
5957; CHECK-NEXT:    vmul.vv v8, v8, v10
5958; CHECK-NEXT:    vrgather.vi v10, v8, 1
5959; CHECK-NEXT:    vmul.vv v8, v8, v10
5960; CHECK-NEXT:    vmv.x.s a0, v8
5961; CHECK-NEXT:    ret
5962  %v = load <32 x i8>, ptr %x
5963  %red = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %v)
5964  ret i8 %red
5965}
5966
5967declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)
5968
5969define i8 @vreduce_mul_v64i8(ptr %x) {
5970; CHECK-LABEL: vreduce_mul_v64i8:
5971; CHECK:       # %bb.0:
5972; CHECK-NEXT:    li a1, 64
5973; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
5974; CHECK-NEXT:    vle8.v v8, (a0)
5975; CHECK-NEXT:    li a0, 32
5976; CHECK-NEXT:    vslidedown.vx v12, v8, a0
5977; CHECK-NEXT:    vmul.vv v8, v8, v12
5978; CHECK-NEXT:    vslidedown.vi v12, v8, 16
5979; CHECK-NEXT:    vmul.vv v8, v8, v12
5980; CHECK-NEXT:    vslidedown.vi v12, v8, 8
5981; CHECK-NEXT:    vmul.vv v8, v8, v12
5982; CHECK-NEXT:    vslidedown.vi v12, v8, 4
5983; CHECK-NEXT:    vmul.vv v8, v8, v12
5984; CHECK-NEXT:    vslidedown.vi v12, v8, 2
5985; CHECK-NEXT:    vmul.vv v8, v8, v12
5986; CHECK-NEXT:    vrgather.vi v12, v8, 1
5987; CHECK-NEXT:    vmul.vv v8, v8, v12
5988; CHECK-NEXT:    vmv.x.s a0, v8
5989; CHECK-NEXT:    ret
5990  %v = load <64 x i8>, ptr %x
5991  %red = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %v)
5992  ret i8 %red
5993}
5994
5995declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)
5996
5997define i8 @vreduce_mul_v128i8(ptr %x) {
5998; CHECK-LABEL: vreduce_mul_v128i8:
5999; CHECK:       # %bb.0:
6000; CHECK-NEXT:    li a1, 128
6001; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
6002; CHECK-NEXT:    vle8.v v8, (a0)
6003; CHECK-NEXT:    li a0, 64
6004; CHECK-NEXT:    vslidedown.vx v16, v8, a0
6005; CHECK-NEXT:    vmul.vv v8, v8, v16
6006; CHECK-NEXT:    li a0, 32
6007; CHECK-NEXT:    vslidedown.vx v16, v8, a0
6008; CHECK-NEXT:    vmul.vv v8, v8, v16
6009; CHECK-NEXT:    vslidedown.vi v16, v8, 16
6010; CHECK-NEXT:    vmul.vv v8, v8, v16
6011; CHECK-NEXT:    vslidedown.vi v16, v8, 8
6012; CHECK-NEXT:    vmul.vv v8, v8, v16
6013; CHECK-NEXT:    vslidedown.vi v16, v8, 4
6014; CHECK-NEXT:    vmul.vv v8, v8, v16
6015; CHECK-NEXT:    vslidedown.vi v16, v8, 2
6016; CHECK-NEXT:    vmul.vv v8, v8, v16
6017; CHECK-NEXT:    vrgather.vi v16, v8, 1
6018; CHECK-NEXT:    vmul.vv v8, v8, v16
6019; CHECK-NEXT:    vmv.x.s a0, v8
6020; CHECK-NEXT:    ret
6021  %v = load <128 x i8>, ptr %x
6022  %red = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %v)
6023  ret i8 %red
6024}
6025
6026declare i8 @llvm.vector.reduce.mul.v256i8(<256 x i8>)
6027
6028define i8 @vreduce_mul_v256i8(ptr %x) {
6029; CHECK-LABEL: vreduce_mul_v256i8:
6030; CHECK:       # %bb.0:
6031; CHECK-NEXT:    li a1, 128
6032; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
6033; CHECK-NEXT:    vle8.v v8, (a0)
6034; CHECK-NEXT:    addi a0, a0, 128
6035; CHECK-NEXT:    vle8.v v16, (a0)
6036; CHECK-NEXT:    li a0, 64
6037; CHECK-NEXT:    vmul.vv v8, v8, v16
6038; CHECK-NEXT:    vslidedown.vx v16, v8, a0
6039; CHECK-NEXT:    vmul.vv v8, v8, v16
6040; CHECK-NEXT:    li a0, 32
6041; CHECK-NEXT:    vslidedown.vx v16, v8, a0
6042; CHECK-NEXT:    vmul.vv v8, v8, v16
6043; CHECK-NEXT:    vslidedown.vi v16, v8, 16
6044; CHECK-NEXT:    vmul.vv v8, v8, v16
6045; CHECK-NEXT:    vslidedown.vi v16, v8, 8
6046; CHECK-NEXT:    vmul.vv v8, v8, v16
6047; CHECK-NEXT:    vslidedown.vi v16, v8, 4
6048; CHECK-NEXT:    vmul.vv v8, v8, v16
6049; CHECK-NEXT:    vslidedown.vi v16, v8, 2
6050; CHECK-NEXT:    vmul.vv v8, v8, v16
6051; CHECK-NEXT:    vrgather.vi v16, v8, 1
6052; CHECK-NEXT:    vmul.vv v8, v8, v16
6053; CHECK-NEXT:    vmv.x.s a0, v8
6054; CHECK-NEXT:    ret
6055  %v = load <256 x i8>, ptr %x
6056  %red = call i8 @llvm.vector.reduce.mul.v256i8(<256 x i8> %v)
6057  ret i8 %red
6058}
6059
6060declare i16 @llvm.vector.reduce.mul.v1i16(<1 x i16>)
6061
6062define i16 @vreduce_mul_v1i16(<1 x i16> %v) {
6063; CHECK-LABEL: vreduce_mul_v1i16:
6064; CHECK:       # %bb.0:
6065; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
6066; CHECK-NEXT:    vmv.x.s a0, v8
6067; CHECK-NEXT:    ret
6068  %red = call i16 @llvm.vector.reduce.mul.v1i16(<1 x i16> %v)
6069  ret i16 %red
6070}
6071
6072declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
6073
6074define i16 @vreduce_mul_v2i16(ptr %x) {
6075; CHECK-LABEL: vreduce_mul_v2i16:
6076; CHECK:       # %bb.0:
6077; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
6078; CHECK-NEXT:    vle16.v v8, (a0)
6079; CHECK-NEXT:    lh a0, 2(a0)
6080; CHECK-NEXT:    vmul.vx v8, v8, a0
6081; CHECK-NEXT:    vmv.x.s a0, v8
6082; CHECK-NEXT:    ret
6083  %v = load <2 x i16>, ptr %x
6084  %red = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %v)
6085  ret i16 %red
6086}
6087
6088declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
6089
6090define i16 @vreduce_mul_v4i16(ptr %x) {
6091; CHECK-LABEL: vreduce_mul_v4i16:
6092; CHECK:       # %bb.0:
6093; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
6094; CHECK-NEXT:    vle16.v v8, (a0)
6095; CHECK-NEXT:    vslidedown.vi v9, v8, 2
6096; CHECK-NEXT:    vmul.vv v8, v8, v9
6097; CHECK-NEXT:    vrgather.vi v9, v8, 1
6098; CHECK-NEXT:    vmul.vv v8, v8, v9
6099; CHECK-NEXT:    vmv.x.s a0, v8
6100; CHECK-NEXT:    ret
6101  %v = load <4 x i16>, ptr %x
6102  %red = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %v)
6103  ret i16 %red
6104}
6105
6106declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
6107
6108define i16 @vreduce_mul_v8i16(ptr %x) {
6109; CHECK-LABEL: vreduce_mul_v8i16:
6110; CHECK:       # %bb.0:
6111; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
6112; CHECK-NEXT:    vle16.v v8, (a0)
6113; CHECK-NEXT:    vslidedown.vi v9, v8, 4
6114; CHECK-NEXT:    vmul.vv v8, v8, v9
6115; CHECK-NEXT:    vslidedown.vi v9, v8, 2
6116; CHECK-NEXT:    vmul.vv v8, v8, v9
6117; CHECK-NEXT:    vrgather.vi v9, v8, 1
6118; CHECK-NEXT:    vmul.vv v8, v8, v9
6119; CHECK-NEXT:    vmv.x.s a0, v8
6120; CHECK-NEXT:    ret
6121  %v = load <8 x i16>, ptr %x
6122  %red = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %v)
6123  ret i16 %red
6124}
6125
6126declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
6127
6128define i16 @vreduce_mul_v16i16(ptr %x) {
6129; CHECK-LABEL: vreduce_mul_v16i16:
6130; CHECK:       # %bb.0:
6131; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
6132; CHECK-NEXT:    vle16.v v8, (a0)
6133; CHECK-NEXT:    vslidedown.vi v10, v8, 8
6134; CHECK-NEXT:    vmul.vv v8, v8, v10
6135; CHECK-NEXT:    vslidedown.vi v10, v8, 4
6136; CHECK-NEXT:    vmul.vv v8, v8, v10
6137; CHECK-NEXT:    vslidedown.vi v10, v8, 2
6138; CHECK-NEXT:    vmul.vv v8, v8, v10
6139; CHECK-NEXT:    vrgather.vi v10, v8, 1
6140; CHECK-NEXT:    vmul.vv v8, v8, v10
6141; CHECK-NEXT:    vmv.x.s a0, v8
6142; CHECK-NEXT:    ret
6143  %v = load <16 x i16>, ptr %x
6144  %red = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %v)
6145  ret i16 %red
6146}
6147
6148declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)
6149
6150define i16 @vreduce_mul_v32i16(ptr %x) {
6151; CHECK-LABEL: vreduce_mul_v32i16:
6152; CHECK:       # %bb.0:
6153; CHECK-NEXT:    li a1, 32
6154; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
6155; CHECK-NEXT:    vle16.v v8, (a0)
6156; CHECK-NEXT:    vslidedown.vi v12, v8, 16
6157; CHECK-NEXT:    vmul.vv v8, v8, v12
6158; CHECK-NEXT:    vslidedown.vi v12, v8, 8
6159; CHECK-NEXT:    vmul.vv v8, v8, v12
6160; CHECK-NEXT:    vslidedown.vi v12, v8, 4
6161; CHECK-NEXT:    vmul.vv v8, v8, v12
6162; CHECK-NEXT:    vslidedown.vi v12, v8, 2
6163; CHECK-NEXT:    vmul.vv v8, v8, v12
6164; CHECK-NEXT:    vrgather.vi v12, v8, 1
6165; CHECK-NEXT:    vmul.vv v8, v8, v12
6166; CHECK-NEXT:    vmv.x.s a0, v8
6167; CHECK-NEXT:    ret
6168  %v = load <32 x i16>, ptr %x
6169  %red = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %v)
6170  ret i16 %red
6171}
6172
6173declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)
6174
6175define i16 @vreduce_mul_v64i16(ptr %x) {
6176; CHECK-LABEL: vreduce_mul_v64i16:
6177; CHECK:       # %bb.0:
6178; CHECK-NEXT:    li a1, 64
6179; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
6180; CHECK-NEXT:    vle16.v v8, (a0)
6181; CHECK-NEXT:    li a0, 32
6182; CHECK-NEXT:    vslidedown.vx v16, v8, a0
6183; CHECK-NEXT:    vmul.vv v8, v8, v16
6184; CHECK-NEXT:    vslidedown.vi v16, v8, 16
6185; CHECK-NEXT:    vmul.vv v8, v8, v16
6186; CHECK-NEXT:    vslidedown.vi v16, v8, 8
6187; CHECK-NEXT:    vmul.vv v8, v8, v16
6188; CHECK-NEXT:    vslidedown.vi v16, v8, 4
6189; CHECK-NEXT:    vmul.vv v8, v8, v16
6190; CHECK-NEXT:    vslidedown.vi v16, v8, 2
6191; CHECK-NEXT:    vmul.vv v8, v8, v16
6192; CHECK-NEXT:    vrgather.vi v16, v8, 1
6193; CHECK-NEXT:    vmul.vv v8, v8, v16
6194; CHECK-NEXT:    vmv.x.s a0, v8
6195; CHECK-NEXT:    ret
6196  %v = load <64 x i16>, ptr %x
6197  %red = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %v)
6198  ret i16 %red
6199}
6200
6201declare i16 @llvm.vector.reduce.mul.v128i16(<128 x i16>)
6202
6203define i16 @vreduce_mul_v128i16(ptr %x) {
6204; CHECK-LABEL: vreduce_mul_v128i16:
6205; CHECK:       # %bb.0:
6206; CHECK-NEXT:    li a1, 64
6207; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
6208; CHECK-NEXT:    vle16.v v8, (a0)
6209; CHECK-NEXT:    addi a0, a0, 128
6210; CHECK-NEXT:    vle16.v v16, (a0)
6211; CHECK-NEXT:    vmul.vv v8, v8, v16
6212; CHECK-NEXT:    li a0, 32
6213; CHECK-NEXT:    vslidedown.vx v16, v8, a0
6214; CHECK-NEXT:    vmul.vv v8, v8, v16
6215; CHECK-NEXT:    vslidedown.vi v16, v8, 16
6216; CHECK-NEXT:    vmul.vv v8, v8, v16
6217; CHECK-NEXT:    vslidedown.vi v16, v8, 8
6218; CHECK-NEXT:    vmul.vv v8, v8, v16
6219; CHECK-NEXT:    vslidedown.vi v16, v8, 4
6220; CHECK-NEXT:    vmul.vv v8, v8, v16
6221; CHECK-NEXT:    vslidedown.vi v16, v8, 2
6222; CHECK-NEXT:    vmul.vv v8, v8, v16
6223; CHECK-NEXT:    vrgather.vi v16, v8, 1
6224; CHECK-NEXT:    vmul.vv v8, v8, v16
6225; CHECK-NEXT:    vmv.x.s a0, v8
6226; CHECK-NEXT:    ret
6227  %v = load <128 x i16>, ptr %x
6228  %red = call i16 @llvm.vector.reduce.mul.v128i16(<128 x i16> %v)
6229  ret i16 %red
6230}
6231
6232declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32>)
6233
6234define i32 @vreduce_mul_v1i32(<1 x i32> %v) {
6235; CHECK-LABEL: vreduce_mul_v1i32:
6236; CHECK:       # %bb.0:
6237; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
6238; CHECK-NEXT:    vmv.x.s a0, v8
6239; CHECK-NEXT:    ret
6240  %red = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %v)
6241  ret i32 %red
6242}
6243
6244declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
6245
6246define i32 @vreduce_mul_v2i32(ptr %x) {
6247; CHECK-LABEL: vreduce_mul_v2i32:
6248; CHECK:       # %bb.0:
6249; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
6250; CHECK-NEXT:    vle32.v v8, (a0)
6251; CHECK-NEXT:    lw a0, 4(a0)
6252; CHECK-NEXT:    vmul.vx v8, v8, a0
6253; CHECK-NEXT:    vmv.x.s a0, v8
6254; CHECK-NEXT:    ret
6255  %v = load <2 x i32>, ptr %x
6256  %red = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %v)
6257  ret i32 %red
6258}
6259
6260declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
6261
6262define i32 @vreduce_mul_v4i32(ptr %x) {
6263; CHECK-LABEL: vreduce_mul_v4i32:
6264; CHECK:       # %bb.0:
6265; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
6266; CHECK-NEXT:    vle32.v v8, (a0)
6267; CHECK-NEXT:    vslidedown.vi v9, v8, 2
6268; CHECK-NEXT:    vmul.vv v8, v8, v9
6269; CHECK-NEXT:    vrgather.vi v9, v8, 1
6270; CHECK-NEXT:    vmul.vv v8, v8, v9
6271; CHECK-NEXT:    vmv.x.s a0, v8
6272; CHECK-NEXT:    ret
6273  %v = load <4 x i32>, ptr %x
6274  %red = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %v)
6275  ret i32 %red
6276}
6277
6278declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
6279
6280define i32 @vreduce_mul_v8i32(ptr %x) {
6281; CHECK-LABEL: vreduce_mul_v8i32:
6282; CHECK:       # %bb.0:
6283; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
6284; CHECK-NEXT:    vle32.v v8, (a0)
6285; CHECK-NEXT:    vslidedown.vi v10, v8, 4
6286; CHECK-NEXT:    vmul.vv v8, v8, v10
6287; CHECK-NEXT:    vslidedown.vi v10, v8, 2
6288; CHECK-NEXT:    vmul.vv v8, v8, v10
6289; CHECK-NEXT:    vrgather.vi v10, v8, 1
6290; CHECK-NEXT:    vmul.vv v8, v8, v10
6291; CHECK-NEXT:    vmv.x.s a0, v8
6292; CHECK-NEXT:    ret
6293  %v = load <8 x i32>, ptr %x
6294  %red = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %v)
6295  ret i32 %red
6296}
6297
6298declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
6299
6300define i32 @vreduce_mul_v16i32(ptr %x) {
6301; CHECK-LABEL: vreduce_mul_v16i32:
6302; CHECK:       # %bb.0:
6303; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
6304; CHECK-NEXT:    vle32.v v8, (a0)
6305; CHECK-NEXT:    vslidedown.vi v12, v8, 8
6306; CHECK-NEXT:    vmul.vv v8, v8, v12
6307; CHECK-NEXT:    vslidedown.vi v12, v8, 4
6308; CHECK-NEXT:    vmul.vv v8, v8, v12
6309; CHECK-NEXT:    vslidedown.vi v12, v8, 2
6310; CHECK-NEXT:    vmul.vv v8, v8, v12
6311; CHECK-NEXT:    vrgather.vi v12, v8, 1
6312; CHECK-NEXT:    vmul.vv v8, v8, v12
6313; CHECK-NEXT:    vmv.x.s a0, v8
6314; CHECK-NEXT:    ret
6315  %v = load <16 x i32>, ptr %x
6316  %red = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v)
6317  ret i32 %red
6318}
6319
6320declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)
6321
6322define i32 @vreduce_mul_v32i32(ptr %x) {
6323; CHECK-LABEL: vreduce_mul_v32i32:
6324; CHECK:       # %bb.0:
6325; CHECK-NEXT:    li a1, 32
6326; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
6327; CHECK-NEXT:    vle32.v v8, (a0)
6328; CHECK-NEXT:    vslidedown.vi v16, v8, 16
6329; CHECK-NEXT:    vmul.vv v8, v8, v16
6330; CHECK-NEXT:    vslidedown.vi v16, v8, 8
6331; CHECK-NEXT:    vmul.vv v8, v8, v16
6332; CHECK-NEXT:    vslidedown.vi v16, v8, 4
6333; CHECK-NEXT:    vmul.vv v8, v8, v16
6334; CHECK-NEXT:    vslidedown.vi v16, v8, 2
6335; CHECK-NEXT:    vmul.vv v8, v8, v16
6336; CHECK-NEXT:    vrgather.vi v16, v8, 1
6337; CHECK-NEXT:    vmul.vv v8, v8, v16
6338; CHECK-NEXT:    vmv.x.s a0, v8
6339; CHECK-NEXT:    ret
6340  %v = load <32 x i32>, ptr %x
6341  %red = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %v)
6342  ret i32 %red
6343}
6344
6345declare i32 @llvm.vector.reduce.mul.v64i32(<64 x i32>)
6346
6347define i32 @vreduce_mul_v64i32(ptr %x) {
6348; CHECK-LABEL: vreduce_mul_v64i32:
6349; CHECK:       # %bb.0:
6350; CHECK-NEXT:    li a1, 32
6351; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
6352; CHECK-NEXT:    vle32.v v8, (a0)
6353; CHECK-NEXT:    addi a0, a0, 128
6354; CHECK-NEXT:    vle32.v v16, (a0)
6355; CHECK-NEXT:    vmul.vv v8, v8, v16
6356; CHECK-NEXT:    vslidedown.vi v16, v8, 16
6357; CHECK-NEXT:    vmul.vv v8, v8, v16
6358; CHECK-NEXT:    vslidedown.vi v16, v8, 8
6359; CHECK-NEXT:    vmul.vv v8, v8, v16
6360; CHECK-NEXT:    vslidedown.vi v16, v8, 4
6361; CHECK-NEXT:    vmul.vv v8, v8, v16
6362; CHECK-NEXT:    vslidedown.vi v16, v8, 2
6363; CHECK-NEXT:    vmul.vv v8, v8, v16
6364; CHECK-NEXT:    vrgather.vi v16, v8, 1
6365; CHECK-NEXT:    vmul.vv v8, v8, v16
6366; CHECK-NEXT:    vmv.x.s a0, v8
6367; CHECK-NEXT:    ret
6368  %v = load <64 x i32>, ptr %x
6369  %red = call i32 @llvm.vector.reduce.mul.v64i32(<64 x i32> %v)
6370  ret i32 %red
6371}
6372
6373declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)
6374
6375define i64 @vreduce_mul_v1i64(<1 x i64> %v) {
6376; RV32-LABEL: vreduce_mul_v1i64:
6377; RV32:       # %bb.0:
6378; RV32-NEXT:    li a0, 32
6379; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
6380; RV32-NEXT:    vsrl.vx v9, v8, a0
6381; RV32-NEXT:    vmv.x.s a1, v9
6382; RV32-NEXT:    vmv.x.s a0, v8
6383; RV32-NEXT:    ret
6384;
6385; RV64-LABEL: vreduce_mul_v1i64:
6386; RV64:       # %bb.0:
6387; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
6388; RV64-NEXT:    vmv.x.s a0, v8
6389; RV64-NEXT:    ret
6390  %red = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %v)
6391  ret i64 %red
6392}
6393
6394declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
6395
6396define i64 @vreduce_mul_v2i64(ptr %x) {
6397; RV32-LABEL: vreduce_mul_v2i64:
6398; RV32:       # %bb.0:
6399; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
6400; RV32-NEXT:    vle64.v v8, (a0)
6401; RV32-NEXT:    addi a0, a0, 8
6402; RV32-NEXT:    vlse64.v v9, (a0), zero
6403; RV32-NEXT:    li a1, 32
6404; RV32-NEXT:    vmul.vv v8, v8, v9
6405; RV32-NEXT:    vmv.x.s a0, v8
6406; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
6407; RV32-NEXT:    vsrl.vx v8, v8, a1
6408; RV32-NEXT:    vmv.x.s a1, v8
6409; RV32-NEXT:    ret
6410;
6411; RV64-LABEL: vreduce_mul_v2i64:
6412; RV64:       # %bb.0:
6413; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
6414; RV64-NEXT:    vle64.v v8, (a0)
6415; RV64-NEXT:    ld a0, 8(a0)
6416; RV64-NEXT:    vmul.vx v8, v8, a0
6417; RV64-NEXT:    vmv.x.s a0, v8
6418; RV64-NEXT:    ret
6419  %v = load <2 x i64>, ptr %x
6420  %red = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %v)
6421  ret i64 %red
6422}
6423
6424declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
6425
6426define i64 @vreduce_mul_v4i64(ptr %x) {
6427; RV32-LABEL: vreduce_mul_v4i64:
6428; RV32:       # %bb.0:
6429; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
6430; RV32-NEXT:    vle64.v v8, (a0)
6431; RV32-NEXT:    li a1, 32
6432; RV32-NEXT:    vslidedown.vi v10, v8, 2
6433; RV32-NEXT:    vmul.vv v8, v8, v10
6434; RV32-NEXT:    vrgather.vi v10, v8, 1
6435; RV32-NEXT:    vmul.vv v8, v8, v10
6436; RV32-NEXT:    vmv.x.s a0, v8
6437; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
6438; RV32-NEXT:    vsrl.vx v8, v8, a1
6439; RV32-NEXT:    vmv.x.s a1, v8
6440; RV32-NEXT:    ret
6441;
6442; RV64-LABEL: vreduce_mul_v4i64:
6443; RV64:       # %bb.0:
6444; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
6445; RV64-NEXT:    vle64.v v8, (a0)
6446; RV64-NEXT:    vslidedown.vi v10, v8, 2
6447; RV64-NEXT:    vmul.vv v8, v8, v10
6448; RV64-NEXT:    vrgather.vi v10, v8, 1
6449; RV64-NEXT:    vmul.vv v8, v8, v10
6450; RV64-NEXT:    vmv.x.s a0, v8
6451; RV64-NEXT:    ret
6452  %v = load <4 x i64>, ptr %x
6453  %red = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %v)
6454  ret i64 %red
6455}
6456
6457declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
6458
6459define i64 @vreduce_mul_v8i64(ptr %x) {
6460; RV32-LABEL: vreduce_mul_v8i64:
6461; RV32:       # %bb.0:
6462; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
6463; RV32-NEXT:    vle64.v v8, (a0)
6464; RV32-NEXT:    li a1, 32
6465; RV32-NEXT:    vslidedown.vi v12, v8, 4
6466; RV32-NEXT:    vmul.vv v8, v8, v12
6467; RV32-NEXT:    vslidedown.vi v12, v8, 2
6468; RV32-NEXT:    vmul.vv v8, v8, v12
6469; RV32-NEXT:    vrgather.vi v12, v8, 1
6470; RV32-NEXT:    vmul.vv v8, v8, v12
6471; RV32-NEXT:    vmv.x.s a0, v8
6472; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
6473; RV32-NEXT:    vsrl.vx v8, v8, a1
6474; RV32-NEXT:    vmv.x.s a1, v8
6475; RV32-NEXT:    ret
6476;
6477; RV64-LABEL: vreduce_mul_v8i64:
6478; RV64:       # %bb.0:
6479; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
6480; RV64-NEXT:    vle64.v v8, (a0)
6481; RV64-NEXT:    vslidedown.vi v12, v8, 4
6482; RV64-NEXT:    vmul.vv v8, v8, v12
6483; RV64-NEXT:    vslidedown.vi v12, v8, 2
6484; RV64-NEXT:    vmul.vv v8, v8, v12
6485; RV64-NEXT:    vrgather.vi v12, v8, 1
6486; RV64-NEXT:    vmul.vv v8, v8, v12
6487; RV64-NEXT:    vmv.x.s a0, v8
6488; RV64-NEXT:    ret
6489  %v = load <8 x i64>, ptr %x
6490  %red = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %v)
6491  ret i64 %red
6492}
6493
6494declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
6495
6496define i64 @vreduce_mul_v16i64(ptr %x) {
6497; RV32-LABEL: vreduce_mul_v16i64:
6498; RV32:       # %bb.0:
6499; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
6500; RV32-NEXT:    vle64.v v8, (a0)
6501; RV32-NEXT:    li a1, 32
6502; RV32-NEXT:    vslidedown.vi v16, v8, 8
6503; RV32-NEXT:    vmul.vv v8, v8, v16
6504; RV32-NEXT:    vslidedown.vi v16, v8, 4
6505; RV32-NEXT:    vmul.vv v8, v8, v16
6506; RV32-NEXT:    vslidedown.vi v16, v8, 2
6507; RV32-NEXT:    vmul.vv v8, v8, v16
6508; RV32-NEXT:    vrgather.vi v16, v8, 1
6509; RV32-NEXT:    vmul.vv v8, v8, v16
6510; RV32-NEXT:    vmv.x.s a0, v8
6511; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
6512; RV32-NEXT:    vsrl.vx v8, v8, a1
6513; RV32-NEXT:    vmv.x.s a1, v8
6514; RV32-NEXT:    ret
6515;
6516; RV64-LABEL: vreduce_mul_v16i64:
6517; RV64:       # %bb.0:
6518; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
6519; RV64-NEXT:    vle64.v v8, (a0)
6520; RV64-NEXT:    vslidedown.vi v16, v8, 8
6521; RV64-NEXT:    vmul.vv v8, v8, v16
6522; RV64-NEXT:    vslidedown.vi v16, v8, 4
6523; RV64-NEXT:    vmul.vv v8, v8, v16
6524; RV64-NEXT:    vslidedown.vi v16, v8, 2
6525; RV64-NEXT:    vmul.vv v8, v8, v16
6526; RV64-NEXT:    vrgather.vi v16, v8, 1
6527; RV64-NEXT:    vmul.vv v8, v8, v16
6528; RV64-NEXT:    vmv.x.s a0, v8
6529; RV64-NEXT:    ret
6530  %v = load <16 x i64>, ptr %x
6531  %red = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %v)
6532  ret i64 %red
6533}
6534
6535declare i64 @llvm.vector.reduce.mul.v32i64(<32 x i64>)
6536
6537define i64 @vreduce_mul_v32i64(ptr %x) {
6538; RV32-LABEL: vreduce_mul_v32i64:
6539; RV32:       # %bb.0:
6540; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
6541; RV32-NEXT:    vle64.v v8, (a0)
6542; RV32-NEXT:    addi a0, a0, 128
6543; RV32-NEXT:    vle64.v v16, (a0)
6544; RV32-NEXT:    vmul.vv v8, v8, v16
6545; RV32-NEXT:    vslidedown.vi v16, v8, 8
6546; RV32-NEXT:    vmul.vv v8, v8, v16
6547; RV32-NEXT:    vslidedown.vi v16, v8, 4
6548; RV32-NEXT:    vmul.vv v8, v8, v16
6549; RV32-NEXT:    vslidedown.vi v16, v8, 2
6550; RV32-NEXT:    vmul.vv v8, v8, v16
6551; RV32-NEXT:    vrgather.vi v16, v8, 1
6552; RV32-NEXT:    vmul.vv v8, v8, v16
6553; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
6554; RV32-NEXT:    vmv.x.s a0, v8
6555; RV32-NEXT:    vslidedown.vi v8, v8, 1
6556; RV32-NEXT:    vmv.x.s a1, v8
6557; RV32-NEXT:    ret
6558;
6559; RV64-LABEL: vreduce_mul_v32i64:
6560; RV64:       # %bb.0:
6561; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
6562; RV64-NEXT:    vle64.v v8, (a0)
6563; RV64-NEXT:    addi a0, a0, 128
6564; RV64-NEXT:    vle64.v v16, (a0)
6565; RV64-NEXT:    vmul.vv v8, v8, v16
6566; RV64-NEXT:    vslidedown.vi v16, v8, 8
6567; RV64-NEXT:    vmul.vv v8, v8, v16
6568; RV64-NEXT:    vslidedown.vi v16, v8, 4
6569; RV64-NEXT:    vmul.vv v8, v8, v16
6570; RV64-NEXT:    vslidedown.vi v16, v8, 2
6571; RV64-NEXT:    vmul.vv v8, v8, v16
6572; RV64-NEXT:    vrgather.vi v16, v8, 1
6573; RV64-NEXT:    vmul.vv v8, v8, v16
6574; RV64-NEXT:    vmv.x.s a0, v8
6575; RV64-NEXT:    ret
6576  %v = load <32 x i64>, ptr %x
6577  %red = call i64 @llvm.vector.reduce.mul.v32i64(<32 x i64> %v)
6578  ret i64 %red
6579}
6580
6581declare i64 @llvm.vector.reduce.mul.v64i64(<64 x i64>)
6582
6583define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
6584; RV32-LABEL: vreduce_mul_v64i64:
6585; RV32:       # %bb.0:
6586; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
6587; RV32-NEXT:    vle64.v v8, (a0)
6588; RV32-NEXT:    addi a1, a0, 384
6589; RV32-NEXT:    vle64.v v16, (a1)
6590; RV32-NEXT:    addi a1, a0, 256
6591; RV32-NEXT:    addi a0, a0, 128
6592; RV32-NEXT:    vle64.v v24, (a0)
6593; RV32-NEXT:    vle64.v v0, (a1)
6594; RV32-NEXT:    vmul.vv v16, v24, v16
6595; RV32-NEXT:    vmul.vv v8, v8, v0
6596; RV32-NEXT:    vmul.vv v8, v8, v16
6597; RV32-NEXT:    vslidedown.vi v16, v8, 8
6598; RV32-NEXT:    vmul.vv v8, v8, v16
6599; RV32-NEXT:    vslidedown.vi v16, v8, 4
6600; RV32-NEXT:    vmul.vv v8, v8, v16
6601; RV32-NEXT:    vslidedown.vi v16, v8, 2
6602; RV32-NEXT:    vmul.vv v8, v8, v16
6603; RV32-NEXT:    vrgather.vi v16, v8, 1
6604; RV32-NEXT:    vmul.vv v8, v8, v16
6605; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
6606; RV32-NEXT:    vmv.x.s a0, v8
6607; RV32-NEXT:    vslidedown.vi v8, v8, 1
6608; RV32-NEXT:    vmv.x.s a1, v8
6609; RV32-NEXT:    ret
6610;
6611; RV64-LABEL: vreduce_mul_v64i64:
6612; RV64:       # %bb.0:
6613; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
6614; RV64-NEXT:    vle64.v v8, (a0)
6615; RV64-NEXT:    addi a1, a0, 384
6616; RV64-NEXT:    vle64.v v16, (a1)
6617; RV64-NEXT:    addi a1, a0, 256
6618; RV64-NEXT:    addi a0, a0, 128
6619; RV64-NEXT:    vle64.v v24, (a0)
6620; RV64-NEXT:    vle64.v v0, (a1)
6621; RV64-NEXT:    vmul.vv v16, v24, v16
6622; RV64-NEXT:    vmul.vv v8, v8, v0
6623; RV64-NEXT:    vmul.vv v8, v8, v16
6624; RV64-NEXT:    vslidedown.vi v16, v8, 8
6625; RV64-NEXT:    vmul.vv v8, v8, v16
6626; RV64-NEXT:    vslidedown.vi v16, v8, 4
6627; RV64-NEXT:    vmul.vv v8, v8, v16
6628; RV64-NEXT:    vslidedown.vi v16, v8, 2
6629; RV64-NEXT:    vmul.vv v8, v8, v16
6630; RV64-NEXT:    vrgather.vi v16, v8, 1
6631; RV64-NEXT:    vmul.vv v8, v8, v16
6632; RV64-NEXT:    vmv.x.s a0, v8
6633; RV64-NEXT:    ret
6634  %v = load <64 x i64>, ptr %x
6635  %red = call i64 @llvm.vector.reduce.mul.v64i64(<64 x i64> %v)
6636  ret i64 %red
6637}
6638