1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
3; RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
4; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
5; RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
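; This file exercises EVL- and mask-predicated integer reductions: every test calls an
; llvm.vp.reduce.* intrinsic on a fixed-length vector and checks the code llc emits for
; RV32 and RV64 with the V extension enabled.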
6
7declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32)
8
9define signext i8 @vpreduce_add_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
10; CHECK-LABEL: vpreduce_add_v2i8:
11; CHECK:       # %bb.0:
12; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
13; CHECK-NEXT:    vmv.s.x v9, a0
14; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
15; CHECK-NEXT:    vredsum.vs v9, v8, v9, v0.t
16; CHECK-NEXT:    vmv.x.s a0, v9
17; CHECK-NEXT:    ret
18  %r = call i8 @llvm.vp.reduce.add.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
19  ret i8 %r
20}
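; Most of the tests below follow the same shape as the one above: vmv.s.x seeds element 0 of
; a scratch register with the start value %s (a0), vsetvli takes the AVL from %evl (a1), the
; masked vred*.vs folds the vector into that element under %m (v0.t), and vmv.x.s copies the
; scalar result back to a0.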
21
22declare i8 @llvm.vp.reduce.umax.v2i8(i8, <2 x i8>, <2 x i1>, i32)
23
24define signext i8 @vpreduce_umax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
25; CHECK-LABEL: vpreduce_umax_v2i8:
26; CHECK:       # %bb.0:
27; CHECK-NEXT:    andi a0, a0, 255
28; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
29; CHECK-NEXT:    vmv.s.x v9, a0
30; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
31; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
32; CHECK-NEXT:    vmv.x.s a0, v9
33; CHECK-NEXT:    ret
34  %r = call i8 @llvm.vp.reduce.umax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
35  ret i8 %r
36}
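; For the unsigned min/max reductions the start value is zero-extended before it is seeded
; (andi a0, a0, 255 here, an XLEN-dependent slli/srli pair for i16), presumably because the
; scalar argument arrives sign-extended while vredmaxu/vredminu compare it as unsigned.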
37
38declare i8 @llvm.vp.reduce.smax.v2i8(i8, <2 x i8>, <2 x i1>, i32)
39
40define signext i8 @vpreduce_smax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
41; CHECK-LABEL: vpreduce_smax_v2i8:
42; CHECK:       # %bb.0:
43; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
44; CHECK-NEXT:    vmv.s.x v9, a0
45; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
46; CHECK-NEXT:    vredmax.vs v9, v8, v9, v0.t
47; CHECK-NEXT:    vmv.x.s a0, v9
48; CHECK-NEXT:    ret
49  %r = call i8 @llvm.vp.reduce.smax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
50  ret i8 %r
51}
52
53declare i8 @llvm.vp.reduce.umin.v2i8(i8, <2 x i8>, <2 x i1>, i32)
54
55define signext i8 @vpreduce_umin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
56; CHECK-LABEL: vpreduce_umin_v2i8:
57; CHECK:       # %bb.0:
58; CHECK-NEXT:    andi a0, a0, 255
59; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
60; CHECK-NEXT:    vmv.s.x v9, a0
61; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
62; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
63; CHECK-NEXT:    vmv.x.s a0, v9
64; CHECK-NEXT:    ret
65  %r = call i8 @llvm.vp.reduce.umin.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
66  ret i8 %r
67}
68
69declare i8 @llvm.vp.reduce.smin.v2i8(i8, <2 x i8>, <2 x i1>, i32)
70
71define signext i8 @vpreduce_smin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
72; CHECK-LABEL: vpreduce_smin_v2i8:
73; CHECK:       # %bb.0:
74; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
75; CHECK-NEXT:    vmv.s.x v9, a0
76; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
77; CHECK-NEXT:    vredmin.vs v9, v8, v9, v0.t
78; CHECK-NEXT:    vmv.x.s a0, v9
79; CHECK-NEXT:    ret
80  %r = call i8 @llvm.vp.reduce.smin.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
81  ret i8 %r
82}
83
84declare i8 @llvm.vp.reduce.and.v2i8(i8, <2 x i8>, <2 x i1>, i32)
85
86define signext i8 @vpreduce_and_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
87; CHECK-LABEL: vpreduce_and_v2i8:
88; CHECK:       # %bb.0:
89; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
90; CHECK-NEXT:    vmv.s.x v9, a0
91; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
92; CHECK-NEXT:    vredand.vs v9, v8, v9, v0.t
93; CHECK-NEXT:    vmv.x.s a0, v9
94; CHECK-NEXT:    ret
95  %r = call i8 @llvm.vp.reduce.and.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
96  ret i8 %r
97}
98
99declare i8 @llvm.vp.reduce.or.v2i8(i8, <2 x i8>, <2 x i1>, i32)
100
101define signext i8 @vpreduce_or_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
102; CHECK-LABEL: vpreduce_or_v2i8:
103; CHECK:       # %bb.0:
104; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
105; CHECK-NEXT:    vmv.s.x v9, a0
106; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
107; CHECK-NEXT:    vredor.vs v9, v8, v9, v0.t
108; CHECK-NEXT:    vmv.x.s a0, v9
109; CHECK-NEXT:    ret
110  %r = call i8 @llvm.vp.reduce.or.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
111  ret i8 %r
112}
113
114declare i8 @llvm.vp.reduce.xor.v2i8(i8, <2 x i8>, <2 x i1>, i32)
115
116define signext i8 @vpreduce_xor_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
117; CHECK-LABEL: vpreduce_xor_v2i8:
118; CHECK:       # %bb.0:
119; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
120; CHECK-NEXT:    vmv.s.x v9, a0
121; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
122; CHECK-NEXT:    vredxor.vs v9, v8, v9, v0.t
123; CHECK-NEXT:    vmv.x.s a0, v9
124; CHECK-NEXT:    ret
125  %r = call i8 @llvm.vp.reduce.xor.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
126  ret i8 %r
127}
128
129declare i8 @llvm.vp.reduce.umin.v3i8(i8, <3 x i8>, <3 x i1>, i32)
130
131define signext i8 @vpreduce_umin_v3i8(i8 signext %s, <3 x i8> %v, <3 x i1> %m, i32 zeroext %evl) {
132; CHECK-LABEL: vpreduce_umin_v3i8:
133; CHECK:       # %bb.0:
134; CHECK-NEXT:    andi a0, a0, 255
135; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
136; CHECK-NEXT:    vmv.s.x v9, a0
137; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
138; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
139; CHECK-NEXT:    vmv.x.s a0, v9
140; CHECK-NEXT:    ret
141  %r = call i8 @llvm.vp.reduce.umin.v3i8(i8 %s, <3 x i8> %v, <3 x i1> %m, i32 %evl)
142  ret i8 %r
143}
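; The non-power-of-two <3 x i8> case appears to be widened to the same mf4 container that the
; <4 x i8> tests below use.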
144
145declare i8 @llvm.vp.reduce.add.v4i8(i8, <4 x i8>, <4 x i1>, i32)
146
147define signext i8 @vpreduce_add_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
148; CHECK-LABEL: vpreduce_add_v4i8:
149; CHECK:       # %bb.0:
150; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
151; CHECK-NEXT:    vmv.s.x v9, a0
152; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
153; CHECK-NEXT:    vredsum.vs v9, v8, v9, v0.t
154; CHECK-NEXT:    vmv.x.s a0, v9
155; CHECK-NEXT:    ret
156  %r = call i8 @llvm.vp.reduce.add.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
157  ret i8 %r
158}
159
160declare i8 @llvm.vp.reduce.umax.v4i8(i8, <4 x i8>, <4 x i1>, i32)
161
162define signext i8 @vpreduce_umax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
163; CHECK-LABEL: vpreduce_umax_v4i8:
164; CHECK:       # %bb.0:
165; CHECK-NEXT:    andi a0, a0, 255
166; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
167; CHECK-NEXT:    vmv.s.x v9, a0
168; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
169; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
170; CHECK-NEXT:    vmv.x.s a0, v9
171; CHECK-NEXT:    ret
172  %r = call i8 @llvm.vp.reduce.umax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
173  ret i8 %r
174}
175
176declare i8 @llvm.vp.reduce.smax.v4i8(i8, <4 x i8>, <4 x i1>, i32)
177
178define signext i8 @vpreduce_smax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
179; CHECK-LABEL: vpreduce_smax_v4i8:
180; CHECK:       # %bb.0:
181; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
182; CHECK-NEXT:    vmv.s.x v9, a0
183; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
184; CHECK-NEXT:    vredmax.vs v9, v8, v9, v0.t
185; CHECK-NEXT:    vmv.x.s a0, v9
186; CHECK-NEXT:    ret
187  %r = call i8 @llvm.vp.reduce.smax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
188  ret i8 %r
189}
190
191declare i8 @llvm.vp.reduce.umin.v4i8(i8, <4 x i8>, <4 x i1>, i32)
192
193define signext i8 @vpreduce_umin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
194; CHECK-LABEL: vpreduce_umin_v4i8:
195; CHECK:       # %bb.0:
196; CHECK-NEXT:    andi a0, a0, 255
197; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
198; CHECK-NEXT:    vmv.s.x v9, a0
199; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
200; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
201; CHECK-NEXT:    vmv.x.s a0, v9
202; CHECK-NEXT:    ret
203  %r = call i8 @llvm.vp.reduce.umin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
204  ret i8 %r
205}
206
207declare i8 @llvm.vp.reduce.smin.v4i8(i8, <4 x i8>, <4 x i1>, i32)
208
209define signext i8 @vpreduce_smin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
210; CHECK-LABEL: vpreduce_smin_v4i8:
211; CHECK:       # %bb.0:
212; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
213; CHECK-NEXT:    vmv.s.x v9, a0
214; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
215; CHECK-NEXT:    vredmin.vs v9, v8, v9, v0.t
216; CHECK-NEXT:    vmv.x.s a0, v9
217; CHECK-NEXT:    ret
218  %r = call i8 @llvm.vp.reduce.smin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
219  ret i8 %r
220}
221
222declare i8 @llvm.vp.reduce.and.v4i8(i8, <4 x i8>, <4 x i1>, i32)
223
224define signext i8 @vpreduce_and_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
225; CHECK-LABEL: vpreduce_and_v4i8:
226; CHECK:       # %bb.0:
227; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
228; CHECK-NEXT:    vmv.s.x v9, a0
229; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
230; CHECK-NEXT:    vredand.vs v9, v8, v9, v0.t
231; CHECK-NEXT:    vmv.x.s a0, v9
232; CHECK-NEXT:    ret
233  %r = call i8 @llvm.vp.reduce.and.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
234  ret i8 %r
235}
236
237declare i8 @llvm.vp.reduce.or.v4i8(i8, <4 x i8>, <4 x i1>, i32)
238
239define signext i8 @vpreduce_or_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
240; CHECK-LABEL: vpreduce_or_v4i8:
241; CHECK:       # %bb.0:
242; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
243; CHECK-NEXT:    vmv.s.x v9, a0
244; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
245; CHECK-NEXT:    vredor.vs v9, v8, v9, v0.t
246; CHECK-NEXT:    vmv.x.s a0, v9
247; CHECK-NEXT:    ret
248  %r = call i8 @llvm.vp.reduce.or.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
249  ret i8 %r
250}
251
252declare i8 @llvm.vp.reduce.xor.v4i8(i8, <4 x i8>, <4 x i1>, i32)
253
254define signext i8 @vpreduce_xor_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
255; CHECK-LABEL: vpreduce_xor_v4i8:
256; CHECK:       # %bb.0:
257; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
258; CHECK-NEXT:    vmv.s.x v9, a0
259; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
260; CHECK-NEXT:    vredxor.vs v9, v8, v9, v0.t
261; CHECK-NEXT:    vmv.x.s a0, v9
262; CHECK-NEXT:    ret
263  %r = call i8 @llvm.vp.reduce.xor.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
264  ret i8 %r
265}
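; Note how the register-group size tracks the fixed vector length in the tests above: the
; <2 x i8> reductions run at mf8 and the <4 x i8> ones at mf4, while the vmv.s.x seeding step
; uses an m1 vtype with VL=1.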
266
267declare i16 @llvm.vp.reduce.add.v2i16(i16, <2 x i16>, <2 x i1>, i32)
268
269define signext i16 @vpreduce_add_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
270; CHECK-LABEL: vpreduce_add_v2i16:
271; CHECK:       # %bb.0:
272; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
273; CHECK-NEXT:    vmv.s.x v9, a0
274; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
275; CHECK-NEXT:    vredsum.vs v9, v8, v9, v0.t
276; CHECK-NEXT:    vmv.x.s a0, v9
277; CHECK-NEXT:    ret
278  %r = call i16 @llvm.vp.reduce.add.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
279  ret i16 %r
280}
281
282declare i16 @llvm.vp.reduce.umax.v2i16(i16, <2 x i16>, <2 x i1>, i32)
283
284define signext i16 @vpreduce_umax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
285; RV32-LABEL: vpreduce_umax_v2i16:
286; RV32:       # %bb.0:
287; RV32-NEXT:    slli a0, a0, 16
288; RV32-NEXT:    srli a0, a0, 16
289; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
290; RV32-NEXT:    vmv.s.x v9, a0
291; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
292; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
293; RV32-NEXT:    vmv.x.s a0, v9
294; RV32-NEXT:    ret
295;
296; RV64-LABEL: vpreduce_umax_v2i16:
297; RV64:       # %bb.0:
298; RV64-NEXT:    slli a0, a0, 48
299; RV64-NEXT:    srli a0, a0, 48
300; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
301; RV64-NEXT:    vmv.s.x v9, a0
302; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
303; RV64-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
304; RV64-NEXT:    vmv.x.s a0, v9
305; RV64-NEXT:    ret
306  %r = call i16 @llvm.vp.reduce.umax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
307  ret i16 %r
308}
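; umax and umin are the only i16 cases that need separate RV32/RV64 check prefixes: the
; zero-extension of the start value is done with an slli/srli pair whose shift amount depends
; on XLEN (16 on RV32, 48 on RV64).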
309
310declare i16 @llvm.vp.reduce.smax.v2i16(i16, <2 x i16>, <2 x i1>, i32)
311
312define signext i16 @vpreduce_smax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
313; CHECK-LABEL: vpreduce_smax_v2i16:
314; CHECK:       # %bb.0:
315; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
316; CHECK-NEXT:    vmv.s.x v9, a0
317; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
318; CHECK-NEXT:    vredmax.vs v9, v8, v9, v0.t
319; CHECK-NEXT:    vmv.x.s a0, v9
320; CHECK-NEXT:    ret
321  %r = call i16 @llvm.vp.reduce.smax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
322  ret i16 %r
323}
324
325declare i16 @llvm.vp.reduce.umin.v2i16(i16, <2 x i16>, <2 x i1>, i32)
326
327define signext i16 @vpreduce_umin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
328; RV32-LABEL: vpreduce_umin_v2i16:
329; RV32:       # %bb.0:
330; RV32-NEXT:    slli a0, a0, 16
331; RV32-NEXT:    srli a0, a0, 16
332; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
333; RV32-NEXT:    vmv.s.x v9, a0
334; RV32-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
335; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
336; RV32-NEXT:    vmv.x.s a0, v9
337; RV32-NEXT:    ret
338;
339; RV64-LABEL: vpreduce_umin_v2i16:
340; RV64:       # %bb.0:
341; RV64-NEXT:    slli a0, a0, 48
342; RV64-NEXT:    srli a0, a0, 48
343; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
344; RV64-NEXT:    vmv.s.x v9, a0
345; RV64-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
346; RV64-NEXT:    vredminu.vs v9, v8, v9, v0.t
347; RV64-NEXT:    vmv.x.s a0, v9
348; RV64-NEXT:    ret
349  %r = call i16 @llvm.vp.reduce.umin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
350  ret i16 %r
351}
352
353declare i16 @llvm.vp.reduce.smin.v2i16(i16, <2 x i16>, <2 x i1>, i32)
354
355define signext i16 @vpreduce_smin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
356; CHECK-LABEL: vpreduce_smin_v2i16:
357; CHECK:       # %bb.0:
358; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
359; CHECK-NEXT:    vmv.s.x v9, a0
360; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
361; CHECK-NEXT:    vredmin.vs v9, v8, v9, v0.t
362; CHECK-NEXT:    vmv.x.s a0, v9
363; CHECK-NEXT:    ret
364  %r = call i16 @llvm.vp.reduce.smin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
365  ret i16 %r
366}
367
368declare i16 @llvm.vp.reduce.and.v2i16(i16, <2 x i16>, <2 x i1>, i32)
369
370define signext i16 @vpreduce_and_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
371; CHECK-LABEL: vpreduce_and_v2i16:
372; CHECK:       # %bb.0:
373; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
374; CHECK-NEXT:    vmv.s.x v9, a0
375; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
376; CHECK-NEXT:    vredand.vs v9, v8, v9, v0.t
377; CHECK-NEXT:    vmv.x.s a0, v9
378; CHECK-NEXT:    ret
379  %r = call i16 @llvm.vp.reduce.and.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
380  ret i16 %r
381}
382
383declare i16 @llvm.vp.reduce.or.v2i16(i16, <2 x i16>, <2 x i1>, i32)
384
385define signext i16 @vpreduce_or_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
386; CHECK-LABEL: vpreduce_or_v2i16:
387; CHECK:       # %bb.0:
388; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
389; CHECK-NEXT:    vmv.s.x v9, a0
390; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
391; CHECK-NEXT:    vredor.vs v9, v8, v9, v0.t
392; CHECK-NEXT:    vmv.x.s a0, v9
393; CHECK-NEXT:    ret
394  %r = call i16 @llvm.vp.reduce.or.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
395  ret i16 %r
396}
397
398declare i16 @llvm.vp.reduce.xor.v2i16(i16, <2 x i16>, <2 x i1>, i32)
399
400define signext i16 @vpreduce_xor_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) {
401; CHECK-LABEL: vpreduce_xor_v2i16:
402; CHECK:       # %bb.0:
403; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
404; CHECK-NEXT:    vmv.s.x v9, a0
405; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
406; CHECK-NEXT:    vredxor.vs v9, v8, v9, v0.t
407; CHECK-NEXT:    vmv.x.s a0, v9
408; CHECK-NEXT:    ret
409  %r = call i16 @llvm.vp.reduce.xor.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl)
410  ret i16 %r
411}
412
413declare i16 @llvm.vp.reduce.add.v4i16(i16, <4 x i16>, <4 x i1>, i32)
414
415define signext i16 @vpreduce_add_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
416; CHECK-LABEL: vpreduce_add_v4i16:
417; CHECK:       # %bb.0:
418; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
419; CHECK-NEXT:    vmv.s.x v9, a0
420; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
421; CHECK-NEXT:    vredsum.vs v9, v8, v9, v0.t
422; CHECK-NEXT:    vmv.x.s a0, v9
423; CHECK-NEXT:    ret
424  %r = call i16 @llvm.vp.reduce.add.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
425  ret i16 %r
426}
427
428declare i16 @llvm.vp.reduce.umax.v4i16(i16, <4 x i16>, <4 x i1>, i32)
429
430define signext i16 @vpreduce_umax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
431; RV32-LABEL: vpreduce_umax_v4i16:
432; RV32:       # %bb.0:
433; RV32-NEXT:    slli a0, a0, 16
434; RV32-NEXT:    srli a0, a0, 16
435; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
436; RV32-NEXT:    vmv.s.x v9, a0
437; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
438; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
439; RV32-NEXT:    vmv.x.s a0, v9
440; RV32-NEXT:    ret
441;
442; RV64-LABEL: vpreduce_umax_v4i16:
443; RV64:       # %bb.0:
444; RV64-NEXT:    slli a0, a0, 48
445; RV64-NEXT:    srli a0, a0, 48
446; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
447; RV64-NEXT:    vmv.s.x v9, a0
448; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
449; RV64-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
450; RV64-NEXT:    vmv.x.s a0, v9
451; RV64-NEXT:    ret
452  %r = call i16 @llvm.vp.reduce.umax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
453  ret i16 %r
454}
455
456declare i16 @llvm.vp.reduce.smax.v4i16(i16, <4 x i16>, <4 x i1>, i32)
457
458define signext i16 @vpreduce_smax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
459; CHECK-LABEL: vpreduce_smax_v4i16:
460; CHECK:       # %bb.0:
461; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
462; CHECK-NEXT:    vmv.s.x v9, a0
463; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
464; CHECK-NEXT:    vredmax.vs v9, v8, v9, v0.t
465; CHECK-NEXT:    vmv.x.s a0, v9
466; CHECK-NEXT:    ret
467  %r = call i16 @llvm.vp.reduce.smax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
468  ret i16 %r
469}
470
471declare i16 @llvm.vp.reduce.umin.v4i16(i16, <4 x i16>, <4 x i1>, i32)
472
473define signext i16 @vpreduce_umin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
474; RV32-LABEL: vpreduce_umin_v4i16:
475; RV32:       # %bb.0:
476; RV32-NEXT:    slli a0, a0, 16
477; RV32-NEXT:    srli a0, a0, 16
478; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
479; RV32-NEXT:    vmv.s.x v9, a0
480; RV32-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
481; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
482; RV32-NEXT:    vmv.x.s a0, v9
483; RV32-NEXT:    ret
484;
485; RV64-LABEL: vpreduce_umin_v4i16:
486; RV64:       # %bb.0:
487; RV64-NEXT:    slli a0, a0, 48
488; RV64-NEXT:    srli a0, a0, 48
489; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
490; RV64-NEXT:    vmv.s.x v9, a0
491; RV64-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
492; RV64-NEXT:    vredminu.vs v9, v8, v9, v0.t
493; RV64-NEXT:    vmv.x.s a0, v9
494; RV64-NEXT:    ret
495  %r = call i16 @llvm.vp.reduce.umin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
496  ret i16 %r
497}
498
499declare i16 @llvm.vp.reduce.smin.v4i16(i16, <4 x i16>, <4 x i1>, i32)
500
501define signext i16 @vpreduce_smin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
502; CHECK-LABEL: vpreduce_smin_v4i16:
503; CHECK:       # %bb.0:
504; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
505; CHECK-NEXT:    vmv.s.x v9, a0
506; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
507; CHECK-NEXT:    vredmin.vs v9, v8, v9, v0.t
508; CHECK-NEXT:    vmv.x.s a0, v9
509; CHECK-NEXT:    ret
510  %r = call i16 @llvm.vp.reduce.smin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
511  ret i16 %r
512}
513
514declare i16 @llvm.vp.reduce.and.v4i16(i16, <4 x i16>, <4 x i1>, i32)
515
516define signext i16 @vpreduce_and_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
517; CHECK-LABEL: vpreduce_and_v4i16:
518; CHECK:       # %bb.0:
519; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
520; CHECK-NEXT:    vmv.s.x v9, a0
521; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
522; CHECK-NEXT:    vredand.vs v9, v8, v9, v0.t
523; CHECK-NEXT:    vmv.x.s a0, v9
524; CHECK-NEXT:    ret
525  %r = call i16 @llvm.vp.reduce.and.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
526  ret i16 %r
527}
528
529declare i16 @llvm.vp.reduce.or.v4i16(i16, <4 x i16>, <4 x i1>, i32)
530
531define signext i16 @vpreduce_or_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
532; CHECK-LABEL: vpreduce_or_v4i16:
533; CHECK:       # %bb.0:
534; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
535; CHECK-NEXT:    vmv.s.x v9, a0
536; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
537; CHECK-NEXT:    vredor.vs v9, v8, v9, v0.t
538; CHECK-NEXT:    vmv.x.s a0, v9
539; CHECK-NEXT:    ret
540  %r = call i16 @llvm.vp.reduce.or.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
541  ret i16 %r
542}
543
544declare i16 @llvm.vp.reduce.xor.v4i16(i16, <4 x i16>, <4 x i1>, i32)
545
546define signext i16 @vpreduce_xor_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) {
547; CHECK-LABEL: vpreduce_xor_v4i16:
548; CHECK:       # %bb.0:
549; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
550; CHECK-NEXT:    vmv.s.x v9, a0
551; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
552; CHECK-NEXT:    vredxor.vs v9, v8, v9, v0.t
553; CHECK-NEXT:    vmv.x.s a0, v9
554; CHECK-NEXT:    ret
555  %r = call i16 @llvm.vp.reduce.xor.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl)
556  ret i16 %r
557}
558
559declare i32 @llvm.vp.reduce.add.v2i32(i32, <2 x i32>, <2 x i1>, i32)
560
561define signext i32 @vpreduce_add_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
562; CHECK-LABEL: vpreduce_add_v2i32:
563; CHECK:       # %bb.0:
564; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
565; CHECK-NEXT:    vmv.s.x v9, a0
566; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
567; CHECK-NEXT:    vredsum.vs v9, v8, v9, v0.t
568; CHECK-NEXT:    vmv.x.s a0, v9
569; CHECK-NEXT:    ret
570  %r = call i32 @llvm.vp.reduce.add.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
571  ret i32 %r
572}
573
574declare i32 @llvm.vp.reduce.umax.v2i32(i32, <2 x i32>, <2 x i1>, i32)
575
576define signext i32 @vpreduce_umax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
577; CHECK-LABEL: vpreduce_umax_v2i32:
578; CHECK:       # %bb.0:
579; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
580; CHECK-NEXT:    vmv.s.x v9, a0
581; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
582; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
583; CHECK-NEXT:    vmv.x.s a0, v9
584; CHECK-NEXT:    ret
585  %r = call i32 @llvm.vp.reduce.umax.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
586  ret i32 %r
587}
588
589declare i32 @llvm.vp.reduce.smax.v2i32(i32, <2 x i32>, <2 x i1>, i32)
590
591define signext i32 @vpreduce_smax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
592; CHECK-LABEL: vpreduce_smax_v2i32:
593; CHECK:       # %bb.0:
594; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
595; CHECK-NEXT:    vmv.s.x v9, a0
596; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
597; CHECK-NEXT:    vredmax.vs v9, v8, v9, v0.t
598; CHECK-NEXT:    vmv.x.s a0, v9
599; CHECK-NEXT:    ret
600  %r = call i32 @llvm.vp.reduce.smax.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
601  ret i32 %r
602}
603
604declare i32 @llvm.vp.reduce.umin.v2i32(i32, <2 x i32>, <2 x i1>, i32)
605
606define signext i32 @vpreduce_umin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
607; CHECK-LABEL: vpreduce_umin_v2i32:
608; CHECK:       # %bb.0:
609; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
610; CHECK-NEXT:    vmv.s.x v9, a0
611; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
612; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
613; CHECK-NEXT:    vmv.x.s a0, v9
614; CHECK-NEXT:    ret
615  %r = call i32 @llvm.vp.reduce.umin.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
616  ret i32 %r
617}
618
619declare i32 @llvm.vp.reduce.smin.v2i32(i32, <2 x i32>, <2 x i1>, i32)
620
621define signext i32 @vpreduce_smin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
622; CHECK-LABEL: vpreduce_smin_v2i32:
623; CHECK:       # %bb.0:
624; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
625; CHECK-NEXT:    vmv.s.x v9, a0
626; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
627; CHECK-NEXT:    vredmin.vs v9, v8, v9, v0.t
628; CHECK-NEXT:    vmv.x.s a0, v9
629; CHECK-NEXT:    ret
630  %r = call i32 @llvm.vp.reduce.smin.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
631  ret i32 %r
632}
633
634declare i32 @llvm.vp.reduce.and.v2i32(i32, <2 x i32>, <2 x i1>, i32)
635
636define signext i32 @vpreduce_and_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
637; CHECK-LABEL: vpreduce_and_v2i32:
638; CHECK:       # %bb.0:
639; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
640; CHECK-NEXT:    vmv.s.x v9, a0
641; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
642; CHECK-NEXT:    vredand.vs v9, v8, v9, v0.t
643; CHECK-NEXT:    vmv.x.s a0, v9
644; CHECK-NEXT:    ret
645  %r = call i32 @llvm.vp.reduce.and.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
646  ret i32 %r
647}
648
649declare i32 @llvm.vp.reduce.or.v2i32(i32, <2 x i32>, <2 x i1>, i32)
650
651define signext i32 @vpreduce_or_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
652; CHECK-LABEL: vpreduce_or_v2i32:
653; CHECK:       # %bb.0:
654; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
655; CHECK-NEXT:    vmv.s.x v9, a0
656; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
657; CHECK-NEXT:    vredor.vs v9, v8, v9, v0.t
658; CHECK-NEXT:    vmv.x.s a0, v9
659; CHECK-NEXT:    ret
660  %r = call i32 @llvm.vp.reduce.or.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
661  ret i32 %r
662}
663
664declare i32 @llvm.vp.reduce.xor.v2i32(i32, <2 x i32>, <2 x i1>, i32)
665
666define signext i32 @vpreduce_xor_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
667; CHECK-LABEL: vpreduce_xor_v2i32:
668; CHECK:       # %bb.0:
669; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
670; CHECK-NEXT:    vmv.s.x v9, a0
671; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
672; CHECK-NEXT:    vredxor.vs v9, v8, v9, v0.t
673; CHECK-NEXT:    vmv.x.s a0, v9
674; CHECK-NEXT:    ret
675  %r = call i32 @llvm.vp.reduce.xor.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
676  ret i32 %r
677}
678
679declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32)
680
681define signext i32 @vpreduce_add_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
682; CHECK-LABEL: vpreduce_add_v4i32:
683; CHECK:       # %bb.0:
684; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
685; CHECK-NEXT:    vmv.s.x v9, a0
686; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
687; CHECK-NEXT:    vredsum.vs v9, v8, v9, v0.t
688; CHECK-NEXT:    vmv.x.s a0, v9
689; CHECK-NEXT:    ret
690  %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
691  ret i32 %r
692}
693
694declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32)
695
696define signext i32 @vpreduce_umax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
697; CHECK-LABEL: vpreduce_umax_v4i32:
698; CHECK:       # %bb.0:
699; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
700; CHECK-NEXT:    vmv.s.x v9, a0
701; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
702; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
703; CHECK-NEXT:    vmv.x.s a0, v9
704; CHECK-NEXT:    ret
705  %r = call i32 @llvm.vp.reduce.umax.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
706  ret i32 %r
707}
708
709declare i32 @llvm.vp.reduce.smax.v4i32(i32, <4 x i32>, <4 x i1>, i32)
710
711define signext i32 @vpreduce_smax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
712; CHECK-LABEL: vpreduce_smax_v4i32:
713; CHECK:       # %bb.0:
714; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
715; CHECK-NEXT:    vmv.s.x v9, a0
716; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
717; CHECK-NEXT:    vredmax.vs v9, v8, v9, v0.t
718; CHECK-NEXT:    vmv.x.s a0, v9
719; CHECK-NEXT:    ret
720  %r = call i32 @llvm.vp.reduce.smax.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
721  ret i32 %r
722}
723
724declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32)
725
726define signext i32 @vpreduce_umin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
727; CHECK-LABEL: vpreduce_umin_v4i32:
728; CHECK:       # %bb.0:
729; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
730; CHECK-NEXT:    vmv.s.x v9, a0
731; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
732; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
733; CHECK-NEXT:    vmv.x.s a0, v9
734; CHECK-NEXT:    ret
735  %r = call i32 @llvm.vp.reduce.umin.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
736  ret i32 %r
737}
738
739declare i32 @llvm.vp.reduce.smin.v4i32(i32, <4 x i32>, <4 x i1>, i32)
740
741define signext i32 @vpreduce_smin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
742; CHECK-LABEL: vpreduce_smin_v4i32:
743; CHECK:       # %bb.0:
744; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
745; CHECK-NEXT:    vmv.s.x v9, a0
746; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
747; CHECK-NEXT:    vredmin.vs v9, v8, v9, v0.t
748; CHECK-NEXT:    vmv.x.s a0, v9
749; CHECK-NEXT:    ret
750  %r = call i32 @llvm.vp.reduce.smin.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
751  ret i32 %r
752}
753
754declare i32 @llvm.vp.reduce.and.v4i32(i32, <4 x i32>, <4 x i1>, i32)
755
756define signext i32 @vpreduce_and_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
757; CHECK-LABEL: vpreduce_and_v4i32:
758; CHECK:       # %bb.0:
759; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
760; CHECK-NEXT:    vmv.s.x v9, a0
761; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
762; CHECK-NEXT:    vredand.vs v9, v8, v9, v0.t
763; CHECK-NEXT:    vmv.x.s a0, v9
764; CHECK-NEXT:    ret
765  %r = call i32 @llvm.vp.reduce.and.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
766  ret i32 %r
767}
768
769declare i32 @llvm.vp.reduce.or.v4i32(i32, <4 x i32>, <4 x i1>, i32)
770
771define signext i32 @vpreduce_or_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
772; CHECK-LABEL: vpreduce_or_v4i32:
773; CHECK:       # %bb.0:
774; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
775; CHECK-NEXT:    vmv.s.x v9, a0
776; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
777; CHECK-NEXT:    vredor.vs v9, v8, v9, v0.t
778; CHECK-NEXT:    vmv.x.s a0, v9
779; CHECK-NEXT:    ret
780  %r = call i32 @llvm.vp.reduce.or.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
781  ret i32 %r
782}
783
784declare i32 @llvm.vp.reduce.xor.v4i32(i32, <4 x i32>, <4 x i1>, i32)
785
786define signext i32 @vpreduce_xor_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
787; CHECK-LABEL: vpreduce_xor_v4i32:
788; CHECK:       # %bb.0:
789; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
790; CHECK-NEXT:    vmv.s.x v9, a0
791; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
792; CHECK-NEXT:    vredxor.vs v9, v8, v9, v0.t
793; CHECK-NEXT:    vmv.x.s a0, v9
794; CHECK-NEXT:    ret
795  %r = call i32 @llvm.vp.reduce.xor.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
796  ret i32 %r
797}
798
799declare i32 @llvm.vp.reduce.xor.v64i32(i32, <64 x i32>, <64 x i1>, i32)
800
801define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1> %m, i32 zeroext %evl) {
802; CHECK-LABEL: vpreduce_xor_v64i32:
803; CHECK:       # %bb.0:
804; CHECK-NEXT:    li a3, 32
805; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
806; CHECK-NEXT:    vslidedown.vi v24, v0, 4
807; CHECK-NEXT:    mv a2, a1
808; CHECK-NEXT:    bltu a1, a3, .LBB49_2
809; CHECK-NEXT:  # %bb.1:
810; CHECK-NEXT:    li a2, 32
811; CHECK-NEXT:  .LBB49_2:
812; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
813; CHECK-NEXT:    vmv.s.x v25, a0
814; CHECK-NEXT:    addi a0, a1, -32
815; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
816; CHECK-NEXT:    vredxor.vs v25, v8, v25, v0.t
817; CHECK-NEXT:    sltu a1, a1, a0
818; CHECK-NEXT:    addi a1, a1, -1
819; CHECK-NEXT:    and a0, a1, a0
820; CHECK-NEXT:    vmv1r.v v0, v24
821; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
822; CHECK-NEXT:    vredxor.vs v25, v16, v25, v0.t
823; CHECK-NEXT:    vmv.x.s a0, v25
824; CHECK-NEXT:    ret
825  %r = call i32 @llvm.vp.reduce.xor.v64i32(i32 %s, <64 x i32> %v, <64 x i1> %m, i32 %evl)
826  ret i32 %r
827}
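; The <64 x i32> operand does not fit in a single register group, so the reduction is split in
; two: the first vredxor.vs covers min(%evl, 32) elements from v8, the second covers the
; remaining max(%evl - 32, 0) elements from v16 (that clamp is the branchless sltu/addi/and
; sequence), reusing the first result as its start value, and vslidedown.vi v24, v0, 4 peels
; off the mask bits for the upper half.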
828
829declare i64 @llvm.vp.reduce.add.v2i64(i64, <2 x i64>, <2 x i1>, i32)
830
831define signext i64 @vpreduce_add_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
832; RV32-LABEL: vpreduce_add_v2i64:
833; RV32:       # %bb.0:
834; RV32-NEXT:    addi sp, sp, -16
835; RV32-NEXT:    .cfi_def_cfa_offset 16
836; RV32-NEXT:    sw a0, 8(sp)
837; RV32-NEXT:    sw a1, 12(sp)
838; RV32-NEXT:    addi a0, sp, 8
839; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
840; RV32-NEXT:    vlse64.v v9, (a0), zero
841; RV32-NEXT:    li a1, 32
842; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
843; RV32-NEXT:    vredsum.vs v9, v8, v9, v0.t
844; RV32-NEXT:    vmv.x.s a0, v9
845; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
846; RV32-NEXT:    vsrl.vx v8, v9, a1
847; RV32-NEXT:    vmv.x.s a1, v8
848; RV32-NEXT:    addi sp, sp, 16
849; RV32-NEXT:    .cfi_def_cfa_offset 0
850; RV32-NEXT:    ret
851;
852; RV64-LABEL: vpreduce_add_v2i64:
853; RV64:       # %bb.0:
854; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
855; RV64-NEXT:    vmv.s.x v9, a0
856; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
857; RV64-NEXT:    vredsum.vs v9, v8, v9, v0.t
858; RV64-NEXT:    vmv.x.s a0, v9
859; RV64-NEXT:    ret
860  %r = call i64 @llvm.vp.reduce.add.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
861  ret i64 %r
862}
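; On RV32 an i64 scalar is passed in an a0/a1 register pair, so the start value is spilled to
; the stack and read back into element 0 with a zero-strided vlse64.v; the i64 result is also
; returned in two halves, vmv.x.s for the low word and vsrl.vx by 32 followed by another
; vmv.x.s for the high word. RV64 keeps the simpler vmv.s.x/vmv.x.s pattern.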
863
864declare i64 @llvm.vp.reduce.umax.v2i64(i64, <2 x i64>, <2 x i1>, i32)
865
866define signext i64 @vpreduce_umax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
867; RV32-LABEL: vpreduce_umax_v2i64:
868; RV32:       # %bb.0:
869; RV32-NEXT:    addi sp, sp, -16
870; RV32-NEXT:    .cfi_def_cfa_offset 16
871; RV32-NEXT:    sw a0, 8(sp)
872; RV32-NEXT:    sw a1, 12(sp)
873; RV32-NEXT:    addi a0, sp, 8
874; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
875; RV32-NEXT:    vlse64.v v9, (a0), zero
876; RV32-NEXT:    li a1, 32
877; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
878; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
879; RV32-NEXT:    vmv.x.s a0, v9
880; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
881; RV32-NEXT:    vsrl.vx v8, v9, a1
882; RV32-NEXT:    vmv.x.s a1, v8
883; RV32-NEXT:    addi sp, sp, 16
884; RV32-NEXT:    .cfi_def_cfa_offset 0
885; RV32-NEXT:    ret
886;
887; RV64-LABEL: vpreduce_umax_v2i64:
888; RV64:       # %bb.0:
889; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
890; RV64-NEXT:    vmv.s.x v9, a0
891; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
892; RV64-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
893; RV64-NEXT:    vmv.x.s a0, v9
894; RV64-NEXT:    ret
895  %r = call i64 @llvm.vp.reduce.umax.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
896  ret i64 %r
897}
898
899declare i64 @llvm.vp.reduce.smax.v2i64(i64, <2 x i64>, <2 x i1>, i32)
900
901define signext i64 @vpreduce_smax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
902; RV32-LABEL: vpreduce_smax_v2i64:
903; RV32:       # %bb.0:
904; RV32-NEXT:    addi sp, sp, -16
905; RV32-NEXT:    .cfi_def_cfa_offset 16
906; RV32-NEXT:    sw a0, 8(sp)
907; RV32-NEXT:    sw a1, 12(sp)
908; RV32-NEXT:    addi a0, sp, 8
909; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
910; RV32-NEXT:    vlse64.v v9, (a0), zero
911; RV32-NEXT:    li a1, 32
912; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
913; RV32-NEXT:    vredmax.vs v9, v8, v9, v0.t
914; RV32-NEXT:    vmv.x.s a0, v9
915; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
916; RV32-NEXT:    vsrl.vx v8, v9, a1
917; RV32-NEXT:    vmv.x.s a1, v8
918; RV32-NEXT:    addi sp, sp, 16
919; RV32-NEXT:    .cfi_def_cfa_offset 0
920; RV32-NEXT:    ret
921;
922; RV64-LABEL: vpreduce_smax_v2i64:
923; RV64:       # %bb.0:
924; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
925; RV64-NEXT:    vmv.s.x v9, a0
926; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
927; RV64-NEXT:    vredmax.vs v9, v8, v9, v0.t
928; RV64-NEXT:    vmv.x.s a0, v9
929; RV64-NEXT:    ret
930  %r = call i64 @llvm.vp.reduce.smax.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
931  ret i64 %r
932}
933
934declare i64 @llvm.vp.reduce.umin.v2i64(i64, <2 x i64>, <2 x i1>, i32)
935
936define signext i64 @vpreduce_umin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
937; RV32-LABEL: vpreduce_umin_v2i64:
938; RV32:       # %bb.0:
939; RV32-NEXT:    addi sp, sp, -16
940; RV32-NEXT:    .cfi_def_cfa_offset 16
941; RV32-NEXT:    sw a0, 8(sp)
942; RV32-NEXT:    sw a1, 12(sp)
943; RV32-NEXT:    addi a0, sp, 8
944; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
945; RV32-NEXT:    vlse64.v v9, (a0), zero
946; RV32-NEXT:    li a1, 32
947; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
948; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
949; RV32-NEXT:    vmv.x.s a0, v9
950; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
951; RV32-NEXT:    vsrl.vx v8, v9, a1
952; RV32-NEXT:    vmv.x.s a1, v8
953; RV32-NEXT:    addi sp, sp, 16
954; RV32-NEXT:    .cfi_def_cfa_offset 0
955; RV32-NEXT:    ret
956;
957; RV64-LABEL: vpreduce_umin_v2i64:
958; RV64:       # %bb.0:
959; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
960; RV64-NEXT:    vmv.s.x v9, a0
961; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
962; RV64-NEXT:    vredminu.vs v9, v8, v9, v0.t
963; RV64-NEXT:    vmv.x.s a0, v9
964; RV64-NEXT:    ret
965  %r = call i64 @llvm.vp.reduce.umin.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
966  ret i64 %r
967}
968
969declare i64 @llvm.vp.reduce.smin.v2i64(i64, <2 x i64>, <2 x i1>, i32)
970
971define signext i64 @vpreduce_smin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
972; RV32-LABEL: vpreduce_smin_v2i64:
973; RV32:       # %bb.0:
974; RV32-NEXT:    addi sp, sp, -16
975; RV32-NEXT:    .cfi_def_cfa_offset 16
976; RV32-NEXT:    sw a0, 8(sp)
977; RV32-NEXT:    sw a1, 12(sp)
978; RV32-NEXT:    addi a0, sp, 8
979; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
980; RV32-NEXT:    vlse64.v v9, (a0), zero
981; RV32-NEXT:    li a1, 32
982; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
983; RV32-NEXT:    vredmin.vs v9, v8, v9, v0.t
984; RV32-NEXT:    vmv.x.s a0, v9
985; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
986; RV32-NEXT:    vsrl.vx v8, v9, a1
987; RV32-NEXT:    vmv.x.s a1, v8
988; RV32-NEXT:    addi sp, sp, 16
989; RV32-NEXT:    .cfi_def_cfa_offset 0
990; RV32-NEXT:    ret
991;
992; RV64-LABEL: vpreduce_smin_v2i64:
993; RV64:       # %bb.0:
994; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
995; RV64-NEXT:    vmv.s.x v9, a0
996; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
997; RV64-NEXT:    vredmin.vs v9, v8, v9, v0.t
998; RV64-NEXT:    vmv.x.s a0, v9
999; RV64-NEXT:    ret
1000  %r = call i64 @llvm.vp.reduce.smin.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
1001  ret i64 %r
1002}
1003
1004declare i64 @llvm.vp.reduce.and.v2i64(i64, <2 x i64>, <2 x i1>, i32)
1005
1006define signext i64 @vpreduce_and_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
1007; RV32-LABEL: vpreduce_and_v2i64:
1008; RV32:       # %bb.0:
1009; RV32-NEXT:    addi sp, sp, -16
1010; RV32-NEXT:    .cfi_def_cfa_offset 16
1011; RV32-NEXT:    sw a0, 8(sp)
1012; RV32-NEXT:    sw a1, 12(sp)
1013; RV32-NEXT:    addi a0, sp, 8
1014; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1015; RV32-NEXT:    vlse64.v v9, (a0), zero
1016; RV32-NEXT:    li a1, 32
1017; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
1018; RV32-NEXT:    vredand.vs v9, v8, v9, v0.t
1019; RV32-NEXT:    vmv.x.s a0, v9
1020; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1021; RV32-NEXT:    vsrl.vx v8, v9, a1
1022; RV32-NEXT:    vmv.x.s a1, v8
1023; RV32-NEXT:    addi sp, sp, 16
1024; RV32-NEXT:    .cfi_def_cfa_offset 0
1025; RV32-NEXT:    ret
1026;
1027; RV64-LABEL: vpreduce_and_v2i64:
1028; RV64:       # %bb.0:
1029; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1030; RV64-NEXT:    vmv.s.x v9, a0
1031; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
1032; RV64-NEXT:    vredand.vs v9, v8, v9, v0.t
1033; RV64-NEXT:    vmv.x.s a0, v9
1034; RV64-NEXT:    ret
1035  %r = call i64 @llvm.vp.reduce.and.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
1036  ret i64 %r
1037}
1038
1039declare i64 @llvm.vp.reduce.or.v2i64(i64, <2 x i64>, <2 x i1>, i32)
1040
1041define signext i64 @vpreduce_or_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
1042; RV32-LABEL: vpreduce_or_v2i64:
1043; RV32:       # %bb.0:
1044; RV32-NEXT:    addi sp, sp, -16
1045; RV32-NEXT:    .cfi_def_cfa_offset 16
1046; RV32-NEXT:    sw a0, 8(sp)
1047; RV32-NEXT:    sw a1, 12(sp)
1048; RV32-NEXT:    addi a0, sp, 8
1049; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1050; RV32-NEXT:    vlse64.v v9, (a0), zero
1051; RV32-NEXT:    li a1, 32
1052; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
1053; RV32-NEXT:    vredor.vs v9, v8, v9, v0.t
1054; RV32-NEXT:    vmv.x.s a0, v9
1055; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1056; RV32-NEXT:    vsrl.vx v8, v9, a1
1057; RV32-NEXT:    vmv.x.s a1, v8
1058; RV32-NEXT:    addi sp, sp, 16
1059; RV32-NEXT:    .cfi_def_cfa_offset 0
1060; RV32-NEXT:    ret
1061;
1062; RV64-LABEL: vpreduce_or_v2i64:
1063; RV64:       # %bb.0:
1064; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1065; RV64-NEXT:    vmv.s.x v9, a0
1066; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
1067; RV64-NEXT:    vredor.vs v9, v8, v9, v0.t
1068; RV64-NEXT:    vmv.x.s a0, v9
1069; RV64-NEXT:    ret
1070  %r = call i64 @llvm.vp.reduce.or.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
1071  ret i64 %r
1072}
1073
1074declare i64 @llvm.vp.reduce.xor.v2i64(i64, <2 x i64>, <2 x i1>, i32)
1075
1076define signext i64 @vpreduce_xor_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) {
1077; RV32-LABEL: vpreduce_xor_v2i64:
1078; RV32:       # %bb.0:
1079; RV32-NEXT:    addi sp, sp, -16
1080; RV32-NEXT:    .cfi_def_cfa_offset 16
1081; RV32-NEXT:    sw a0, 8(sp)
1082; RV32-NEXT:    sw a1, 12(sp)
1083; RV32-NEXT:    addi a0, sp, 8
1084; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1085; RV32-NEXT:    vlse64.v v9, (a0), zero
1086; RV32-NEXT:    li a1, 32
1087; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
1088; RV32-NEXT:    vredxor.vs v9, v8, v9, v0.t
1089; RV32-NEXT:    vmv.x.s a0, v9
1090; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1091; RV32-NEXT:    vsrl.vx v8, v9, a1
1092; RV32-NEXT:    vmv.x.s a1, v8
1093; RV32-NEXT:    addi sp, sp, 16
1094; RV32-NEXT:    .cfi_def_cfa_offset 0
1095; RV32-NEXT:    ret
1096;
1097; RV64-LABEL: vpreduce_xor_v2i64:
1098; RV64:       # %bb.0:
1099; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1100; RV64-NEXT:    vmv.s.x v9, a0
1101; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
1102; RV64-NEXT:    vredxor.vs v9, v8, v9, v0.t
1103; RV64-NEXT:    vmv.x.s a0, v9
1104; RV64-NEXT:    ret
1105  %r = call i64 @llvm.vp.reduce.xor.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl)
1106  ret i64 %r
1107}
1108
1109declare i64 @llvm.vp.reduce.add.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1110
1111define signext i64 @vpreduce_add_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1112; RV32-LABEL: vpreduce_add_v4i64:
1113; RV32:       # %bb.0:
1114; RV32-NEXT:    addi sp, sp, -16
1115; RV32-NEXT:    .cfi_def_cfa_offset 16
1116; RV32-NEXT:    sw a0, 8(sp)
1117; RV32-NEXT:    sw a1, 12(sp)
1118; RV32-NEXT:    addi a0, sp, 8
1119; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1120; RV32-NEXT:    vlse64.v v10, (a0), zero
1121; RV32-NEXT:    li a1, 32
1122; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1123; RV32-NEXT:    vredsum.vs v10, v8, v10, v0.t
1124; RV32-NEXT:    vmv.x.s a0, v10
1125; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1126; RV32-NEXT:    vsrl.vx v8, v10, a1
1127; RV32-NEXT:    vmv.x.s a1, v8
1128; RV32-NEXT:    addi sp, sp, 16
1129; RV32-NEXT:    .cfi_def_cfa_offset 0
1130; RV32-NEXT:    ret
1131;
1132; RV64-LABEL: vpreduce_add_v4i64:
1133; RV64:       # %bb.0:
1134; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1135; RV64-NEXT:    vmv.s.x v10, a0
1136; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1137; RV64-NEXT:    vredsum.vs v10, v8, v10, v0.t
1138; RV64-NEXT:    vmv.x.s a0, v10
1139; RV64-NEXT:    ret
1140  %r = call i64 @llvm.vp.reduce.add.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1141  ret i64 %r
1142}
1143
1144declare i64 @llvm.vp.reduce.umax.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1145
1146define signext i64 @vpreduce_umax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1147; RV32-LABEL: vpreduce_umax_v4i64:
1148; RV32:       # %bb.0:
1149; RV32-NEXT:    addi sp, sp, -16
1150; RV32-NEXT:    .cfi_def_cfa_offset 16
1151; RV32-NEXT:    sw a0, 8(sp)
1152; RV32-NEXT:    sw a1, 12(sp)
1153; RV32-NEXT:    addi a0, sp, 8
1154; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1155; RV32-NEXT:    vlse64.v v10, (a0), zero
1156; RV32-NEXT:    li a1, 32
1157; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1158; RV32-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
1159; RV32-NEXT:    vmv.x.s a0, v10
1160; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1161; RV32-NEXT:    vsrl.vx v8, v10, a1
1162; RV32-NEXT:    vmv.x.s a1, v8
1163; RV32-NEXT:    addi sp, sp, 16
1164; RV32-NEXT:    .cfi_def_cfa_offset 0
1165; RV32-NEXT:    ret
1166;
1167; RV64-LABEL: vpreduce_umax_v4i64:
1168; RV64:       # %bb.0:
1169; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1170; RV64-NEXT:    vmv.s.x v10, a0
1171; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1172; RV64-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
1173; RV64-NEXT:    vmv.x.s a0, v10
1174; RV64-NEXT:    ret
1175  %r = call i64 @llvm.vp.reduce.umax.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1176  ret i64 %r
1177}
1178
1179declare i64 @llvm.vp.reduce.smax.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1180
1181define signext i64 @vpreduce_smax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1182; RV32-LABEL: vpreduce_smax_v4i64:
1183; RV32:       # %bb.0:
1184; RV32-NEXT:    addi sp, sp, -16
1185; RV32-NEXT:    .cfi_def_cfa_offset 16
1186; RV32-NEXT:    sw a0, 8(sp)
1187; RV32-NEXT:    sw a1, 12(sp)
1188; RV32-NEXT:    addi a0, sp, 8
1189; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1190; RV32-NEXT:    vlse64.v v10, (a0), zero
1191; RV32-NEXT:    li a1, 32
1192; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1193; RV32-NEXT:    vredmax.vs v10, v8, v10, v0.t
1194; RV32-NEXT:    vmv.x.s a0, v10
1195; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1196; RV32-NEXT:    vsrl.vx v8, v10, a1
1197; RV32-NEXT:    vmv.x.s a1, v8
1198; RV32-NEXT:    addi sp, sp, 16
1199; RV32-NEXT:    .cfi_def_cfa_offset 0
1200; RV32-NEXT:    ret
1201;
1202; RV64-LABEL: vpreduce_smax_v4i64:
1203; RV64:       # %bb.0:
1204; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1205; RV64-NEXT:    vmv.s.x v10, a0
1206; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1207; RV64-NEXT:    vredmax.vs v10, v8, v10, v0.t
1208; RV64-NEXT:    vmv.x.s a0, v10
1209; RV64-NEXT:    ret
1210  %r = call i64 @llvm.vp.reduce.smax.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1211  ret i64 %r
1212}
1213
1214declare i64 @llvm.vp.reduce.umin.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1215
1216define signext i64 @vpreduce_umin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1217; RV32-LABEL: vpreduce_umin_v4i64:
1218; RV32:       # %bb.0:
1219; RV32-NEXT:    addi sp, sp, -16
1220; RV32-NEXT:    .cfi_def_cfa_offset 16
1221; RV32-NEXT:    sw a0, 8(sp)
1222; RV32-NEXT:    sw a1, 12(sp)
1223; RV32-NEXT:    addi a0, sp, 8
1224; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1225; RV32-NEXT:    vlse64.v v10, (a0), zero
1226; RV32-NEXT:    li a1, 32
1227; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1228; RV32-NEXT:    vredminu.vs v10, v8, v10, v0.t
1229; RV32-NEXT:    vmv.x.s a0, v10
1230; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1231; RV32-NEXT:    vsrl.vx v8, v10, a1
1232; RV32-NEXT:    vmv.x.s a1, v8
1233; RV32-NEXT:    addi sp, sp, 16
1234; RV32-NEXT:    .cfi_def_cfa_offset 0
1235; RV32-NEXT:    ret
1236;
1237; RV64-LABEL: vpreduce_umin_v4i64:
1238; RV64:       # %bb.0:
1239; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1240; RV64-NEXT:    vmv.s.x v10, a0
1241; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1242; RV64-NEXT:    vredminu.vs v10, v8, v10, v0.t
1243; RV64-NEXT:    vmv.x.s a0, v10
1244; RV64-NEXT:    ret
1245  %r = call i64 @llvm.vp.reduce.umin.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1246  ret i64 %r
1247}
1248
1249declare i64 @llvm.vp.reduce.smin.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1250
1251define signext i64 @vpreduce_smin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1252; RV32-LABEL: vpreduce_smin_v4i64:
1253; RV32:       # %bb.0:
1254; RV32-NEXT:    addi sp, sp, -16
1255; RV32-NEXT:    .cfi_def_cfa_offset 16
1256; RV32-NEXT:    sw a0, 8(sp)
1257; RV32-NEXT:    sw a1, 12(sp)
1258; RV32-NEXT:    addi a0, sp, 8
1259; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1260; RV32-NEXT:    vlse64.v v10, (a0), zero
1261; RV32-NEXT:    li a1, 32
1262; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1263; RV32-NEXT:    vredmin.vs v10, v8, v10, v0.t
1264; RV32-NEXT:    vmv.x.s a0, v10
1265; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1266; RV32-NEXT:    vsrl.vx v8, v10, a1
1267; RV32-NEXT:    vmv.x.s a1, v8
1268; RV32-NEXT:    addi sp, sp, 16
1269; RV32-NEXT:    .cfi_def_cfa_offset 0
1270; RV32-NEXT:    ret
1271;
1272; RV64-LABEL: vpreduce_smin_v4i64:
1273; RV64:       # %bb.0:
1274; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1275; RV64-NEXT:    vmv.s.x v10, a0
1276; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1277; RV64-NEXT:    vredmin.vs v10, v8, v10, v0.t
1278; RV64-NEXT:    vmv.x.s a0, v10
1279; RV64-NEXT:    ret
1280  %r = call i64 @llvm.vp.reduce.smin.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1281  ret i64 %r
1282}
1283
1284declare i64 @llvm.vp.reduce.and.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1285
1286define signext i64 @vpreduce_and_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1287; RV32-LABEL: vpreduce_and_v4i64:
1288; RV32:       # %bb.0:
1289; RV32-NEXT:    addi sp, sp, -16
1290; RV32-NEXT:    .cfi_def_cfa_offset 16
1291; RV32-NEXT:    sw a0, 8(sp)
1292; RV32-NEXT:    sw a1, 12(sp)
1293; RV32-NEXT:    addi a0, sp, 8
1294; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1295; RV32-NEXT:    vlse64.v v10, (a0), zero
1296; RV32-NEXT:    li a1, 32
1297; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1298; RV32-NEXT:    vredand.vs v10, v8, v10, v0.t
1299; RV32-NEXT:    vmv.x.s a0, v10
1300; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1301; RV32-NEXT:    vsrl.vx v8, v10, a1
1302; RV32-NEXT:    vmv.x.s a1, v8
1303; RV32-NEXT:    addi sp, sp, 16
1304; RV32-NEXT:    .cfi_def_cfa_offset 0
1305; RV32-NEXT:    ret
1306;
1307; RV64-LABEL: vpreduce_and_v4i64:
1308; RV64:       # %bb.0:
1309; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1310; RV64-NEXT:    vmv.s.x v10, a0
1311; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1312; RV64-NEXT:    vredand.vs v10, v8, v10, v0.t
1313; RV64-NEXT:    vmv.x.s a0, v10
1314; RV64-NEXT:    ret
1315  %r = call i64 @llvm.vp.reduce.and.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1316  ret i64 %r
1317}
1318
1319declare i64 @llvm.vp.reduce.or.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1320
1321define signext i64 @vpreduce_or_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1322; RV32-LABEL: vpreduce_or_v4i64:
1323; RV32:       # %bb.0:
1324; RV32-NEXT:    addi sp, sp, -16
1325; RV32-NEXT:    .cfi_def_cfa_offset 16
1326; RV32-NEXT:    sw a0, 8(sp)
1327; RV32-NEXT:    sw a1, 12(sp)
1328; RV32-NEXT:    addi a0, sp, 8
1329; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1330; RV32-NEXT:    vlse64.v v10, (a0), zero
1331; RV32-NEXT:    li a1, 32
1332; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1333; RV32-NEXT:    vredor.vs v10, v8, v10, v0.t
1334; RV32-NEXT:    vmv.x.s a0, v10
1335; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1336; RV32-NEXT:    vsrl.vx v8, v10, a1
1337; RV32-NEXT:    vmv.x.s a1, v8
1338; RV32-NEXT:    addi sp, sp, 16
1339; RV32-NEXT:    .cfi_def_cfa_offset 0
1340; RV32-NEXT:    ret
1341;
1342; RV64-LABEL: vpreduce_or_v4i64:
1343; RV64:       # %bb.0:
1344; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1345; RV64-NEXT:    vmv.s.x v10, a0
1346; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1347; RV64-NEXT:    vredor.vs v10, v8, v10, v0.t
1348; RV64-NEXT:    vmv.x.s a0, v10
1349; RV64-NEXT:    ret
1350  %r = call i64 @llvm.vp.reduce.or.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1351  ret i64 %r
1352}
1353
1354declare i64 @llvm.vp.reduce.xor.v4i64(i64, <4 x i64>, <4 x i1>, i32)
1355
1356define signext i64 @vpreduce_xor_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) {
1357; RV32-LABEL: vpreduce_xor_v4i64:
1358; RV32:       # %bb.0:
1359; RV32-NEXT:    addi sp, sp, -16
1360; RV32-NEXT:    .cfi_def_cfa_offset 16
1361; RV32-NEXT:    sw a0, 8(sp)
1362; RV32-NEXT:    sw a1, 12(sp)
1363; RV32-NEXT:    addi a0, sp, 8
1364; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1365; RV32-NEXT:    vlse64.v v10, (a0), zero
1366; RV32-NEXT:    li a1, 32
1367; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
1368; RV32-NEXT:    vredxor.vs v10, v8, v10, v0.t
1369; RV32-NEXT:    vmv.x.s a0, v10
1370; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1371; RV32-NEXT:    vsrl.vx v8, v10, a1
1372; RV32-NEXT:    vmv.x.s a1, v8
1373; RV32-NEXT:    addi sp, sp, 16
1374; RV32-NEXT:    .cfi_def_cfa_offset 0
1375; RV32-NEXT:    ret
1376;
1377; RV64-LABEL: vpreduce_xor_v4i64:
1378; RV64:       # %bb.0:
1379; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
1380; RV64-NEXT:    vmv.s.x v10, a0
1381; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
1382; RV64-NEXT:    vredxor.vs v10, v8, v10, v0.t
1383; RV64-NEXT:    vmv.x.s a0, v10
1384; RV64-NEXT:    ret
1385  %r = call i64 @llvm.vp.reduce.xor.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl)
1386  ret i64 %r
1387}
1388
1389declare i8 @llvm.vp.reduce.mul.v1i8(i8, <1 x i8>, <1 x i1>, i32)
1390
1391define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl) {
1392; RV32-LABEL: vpreduce_mul_v1i8:
1393; RV32:       # %bb.0:
1394; RV32-NEXT:    addi sp, sp, -16
1395; RV32-NEXT:    .cfi_def_cfa_offset 16
1396; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
1397; RV32-NEXT:    .cfi_offset ra, -4
1398; RV32-NEXT:    mv a2, a0
1399; RV32-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
1400; RV32-NEXT:    vmv.s.x v9, a1
1401; RV32-NEXT:    vmsne.vi v9, v9, 0
1402; RV32-NEXT:    vmand.mm v0, v9, v0
1403; RV32-NEXT:    vmv.v.i v9, 1
1404; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
1405; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
1406; RV32-NEXT:    vmv.x.s a0, v8
1407; RV32-NEXT:    mv a1, a2
1408; RV32-NEXT:    call __mulsi3
1409; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
1410; RV32-NEXT:    .cfi_restore ra
1411; RV32-NEXT:    addi sp, sp, 16
1412; RV32-NEXT:    .cfi_def_cfa_offset 0
1413; RV32-NEXT:    ret
1414;
1415; RV64-LABEL: vpreduce_mul_v1i8:
1416; RV64:       # %bb.0:
1417; RV64-NEXT:    addi sp, sp, -16
1418; RV64-NEXT:    .cfi_def_cfa_offset 16
1419; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
1420; RV64-NEXT:    .cfi_offset ra, -8
1421; RV64-NEXT:    mv a2, a0
1422; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
1423; RV64-NEXT:    vmv.s.x v9, a1
1424; RV64-NEXT:    vmsne.vi v9, v9, 0
1425; RV64-NEXT:    vmand.mm v0, v9, v0
1426; RV64-NEXT:    vmv.v.i v9, 1
1427; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
1428; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
1429; RV64-NEXT:    vmv.x.s a0, v8
1430; RV64-NEXT:    mv a1, a2
1431; RV64-NEXT:    call __muldi3
1432; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
1433; RV64-NEXT:    .cfi_restore ra
1434; RV64-NEXT:    addi sp, sp, 16
1435; RV64-NEXT:    .cfi_def_cfa_offset 0
1436; RV64-NEXT:    ret
1437  %r = call i8 @llvm.vp.reduce.mul.v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 %evl)
1438  ret i8 %r
1439}
1440
1441declare i8 @llvm.vp.reduce.mul.v2i8(i8, <2 x i8>, <2 x i1>, i32)
1442
1443define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) {
1444; RV32-LABEL: vpreduce_mul_v2i8:
1445; RV32:       # %bb.0:
1446; RV32-NEXT:    addi sp, sp, -16
1447; RV32-NEXT:    .cfi_def_cfa_offset 16
1448; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
1449; RV32-NEXT:    .cfi_offset ra, -4
1450; RV32-NEXT:    mv a2, a0
1451; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
1452; RV32-NEXT:    vid.v v9
1453; RV32-NEXT:    vmsltu.vx v9, v9, a1
1454; RV32-NEXT:    vmand.mm v0, v9, v0
1455; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
1456; RV32-NEXT:    vmv.v.i v9, 1
1457; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
1458; RV32-NEXT:    vrgather.vi v9, v8, 1
1459; RV32-NEXT:    vmul.vv v8, v8, v9
1460; RV32-NEXT:    vmv.x.s a0, v8
1461; RV32-NEXT:    mv a1, a2
1462; RV32-NEXT:    call __mulsi3
1463; RV32-NEXT:    slli a0, a0, 24
1464; RV32-NEXT:    srai a0, a0, 24
1465; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
1466; RV32-NEXT:    .cfi_restore ra
1467; RV32-NEXT:    addi sp, sp, 16
1468; RV32-NEXT:    .cfi_def_cfa_offset 0
1469; RV32-NEXT:    ret
1470;
1471; RV64-LABEL: vpreduce_mul_v2i8:
1472; RV64:       # %bb.0:
1473; RV64-NEXT:    addi sp, sp, -16
1474; RV64-NEXT:    .cfi_def_cfa_offset 16
1475; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
1476; RV64-NEXT:    .cfi_offset ra, -8
1477; RV64-NEXT:    mv a2, a0
1478; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
1479; RV64-NEXT:    vid.v v9
1480; RV64-NEXT:    vmsltu.vx v9, v9, a1
1481; RV64-NEXT:    vmand.mm v0, v9, v0
1482; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
1483; RV64-NEXT:    vmv.v.i v9, 1
1484; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
1485; RV64-NEXT:    vrgather.vi v9, v8, 1
1486; RV64-NEXT:    vmul.vv v8, v8, v9
1487; RV64-NEXT:    vmv.x.s a0, v8
1488; RV64-NEXT:    mv a1, a2
1489; RV64-NEXT:    call __muldi3
1490; RV64-NEXT:    slli a0, a0, 56
1491; RV64-NEXT:    srai a0, a0, 56
1492; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
1493; RV64-NEXT:    .cfi_restore ra
1494; RV64-NEXT:    addi sp, sp, 16
1495; RV64-NEXT:    .cfi_def_cfa_offset 0
1496; RV64-NEXT:    ret
1497  %r = call i8 @llvm.vp.reduce.mul.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl)
1498  ret i8 %r
1499}
1500
1501declare i8 @llvm.vp.reduce.mul.v4i8(i8, <4 x i8>, <4 x i1>, i32)
1502
1503define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
1504; RV32-LABEL: vpreduce_mul_v4i8:
1505; RV32:       # %bb.0:
1506; RV32-NEXT:    addi sp, sp, -16
1507; RV32-NEXT:    .cfi_def_cfa_offset 16
1508; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
1509; RV32-NEXT:    .cfi_offset ra, -4
1510; RV32-NEXT:    mv a2, a0
1511; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
1512; RV32-NEXT:    vid.v v9
1513; RV32-NEXT:    vmsltu.vx v9, v9, a1
1514; RV32-NEXT:    vmand.mm v0, v9, v0
1515; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
1516; RV32-NEXT:    vmv.v.i v9, 1
1517; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
1518; RV32-NEXT:    vslidedown.vi v9, v8, 2
1519; RV32-NEXT:    vmul.vv v8, v8, v9
1520; RV32-NEXT:    vrgather.vi v9, v8, 1
1521; RV32-NEXT:    vmul.vv v8, v8, v9
1522; RV32-NEXT:    vmv.x.s a0, v8
1523; RV32-NEXT:    mv a1, a2
1524; RV32-NEXT:    call __mulsi3
1525; RV32-NEXT:    slli a0, a0, 24
1526; RV32-NEXT:    srai a0, a0, 24
1527; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
1528; RV32-NEXT:    .cfi_restore ra
1529; RV32-NEXT:    addi sp, sp, 16
1530; RV32-NEXT:    .cfi_def_cfa_offset 0
1531; RV32-NEXT:    ret
1532;
1533; RV64-LABEL: vpreduce_mul_v4i8:
1534; RV64:       # %bb.0:
1535; RV64-NEXT:    addi sp, sp, -16
1536; RV64-NEXT:    .cfi_def_cfa_offset 16
1537; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
1538; RV64-NEXT:    .cfi_offset ra, -8
1539; RV64-NEXT:    mv a2, a0
1540; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
1541; RV64-NEXT:    vid.v v9
1542; RV64-NEXT:    vmsltu.vx v9, v9, a1
1543; RV64-NEXT:    vmand.mm v0, v9, v0
1544; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
1545; RV64-NEXT:    vmv.v.i v9, 1
1546; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
1547; RV64-NEXT:    vslidedown.vi v9, v8, 2
1548; RV64-NEXT:    vmul.vv v8, v8, v9
1549; RV64-NEXT:    vrgather.vi v9, v8, 1
1550; RV64-NEXT:    vmul.vv v8, v8, v9
1551; RV64-NEXT:    vmv.x.s a0, v8
1552; RV64-NEXT:    mv a1, a2
1553; RV64-NEXT:    call __muldi3
1554; RV64-NEXT:    slli a0, a0, 56
1555; RV64-NEXT:    srai a0, a0, 56
1556; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
1557; RV64-NEXT:    .cfi_restore ra
1558; RV64-NEXT:    addi sp, sp, 16
1559; RV64-NEXT:    .cfi_def_cfa_offset 0
1560; RV64-NEXT:    ret
1561  %r = call i8 @llvm.vp.reduce.mul.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
1562  ret i8 %r
1563}
1564
1565declare i8 @llvm.vp.reduce.mul.v8i8(i8, <8 x i8>, <8 x i1>, i32)
1566
1567define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i32 zeroext %evl) {
1568; RV32-LABEL: vpreduce_mul_v8i8:
1569; RV32:       # %bb.0:
1570; RV32-NEXT:    addi sp, sp, -16
1571; RV32-NEXT:    .cfi_def_cfa_offset 16
1572; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
1573; RV32-NEXT:    .cfi_offset ra, -4
1574; RV32-NEXT:    mv a2, a0
1575; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
1576; RV32-NEXT:    vid.v v10
1577; RV32-NEXT:    vmsltu.vx v9, v10, a1
1578; RV32-NEXT:    vmand.mm v0, v9, v0
1579; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
1580; RV32-NEXT:    vmv.v.i v9, 1
1581; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
1582; RV32-NEXT:    vslidedown.vi v9, v8, 4
1583; RV32-NEXT:    vmul.vv v8, v8, v9
1584; RV32-NEXT:    vslidedown.vi v9, v8, 2
1585; RV32-NEXT:    vmul.vv v8, v8, v9
1586; RV32-NEXT:    vrgather.vi v9, v8, 1
1587; RV32-NEXT:    vmul.vv v8, v8, v9
1588; RV32-NEXT:    vmv.x.s a0, v8
1589; RV32-NEXT:    mv a1, a2
1590; RV32-NEXT:    call __mulsi3
1591; RV32-NEXT:    slli a0, a0, 24
1592; RV32-NEXT:    srai a0, a0, 24
1593; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
1594; RV32-NEXT:    .cfi_restore ra
1595; RV32-NEXT:    addi sp, sp, 16
1596; RV32-NEXT:    .cfi_def_cfa_offset 0
1597; RV32-NEXT:    ret
1598;
1599; RV64-LABEL: vpreduce_mul_v8i8:
1600; RV64:       # %bb.0:
1601; RV64-NEXT:    addi sp, sp, -16
1602; RV64-NEXT:    .cfi_def_cfa_offset 16
1603; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
1604; RV64-NEXT:    .cfi_offset ra, -8
1605; RV64-NEXT:    mv a2, a0
1606; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
1607; RV64-NEXT:    vid.v v10
1608; RV64-NEXT:    vmsltu.vx v9, v10, a1
1609; RV64-NEXT:    vmand.mm v0, v9, v0
1610; RV64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
1611; RV64-NEXT:    vmv.v.i v9, 1
1612; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
1613; RV64-NEXT:    vslidedown.vi v9, v8, 4
1614; RV64-NEXT:    vmul.vv v8, v8, v9
1615; RV64-NEXT:    vslidedown.vi v9, v8, 2
1616; RV64-NEXT:    vmul.vv v8, v8, v9
1617; RV64-NEXT:    vrgather.vi v9, v8, 1
1618; RV64-NEXT:    vmul.vv v8, v8, v9
1619; RV64-NEXT:    vmv.x.s a0, v8
1620; RV64-NEXT:    mv a1, a2
1621; RV64-NEXT:    call __muldi3
1622; RV64-NEXT:    slli a0, a0, 56
1623; RV64-NEXT:    srai a0, a0, 56
1624; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
1625; RV64-NEXT:    .cfi_restore ra
1626; RV64-NEXT:    addi sp, sp, 16
1627; RV64-NEXT:    .cfi_def_cfa_offset 0
1628; RV64-NEXT:    ret
1629  %r = call i8 @llvm.vp.reduce.mul.v8i8(i8 %s, <8 x i8> %v, <8 x i1> %m, i32 %evl)
1630  ret i8 %r
1631}
1632
1633declare i8 @llvm.vp.reduce.mul.v16i8(i8, <16 x i8>, <16 x i1>, i32)
1634
1635define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, i32 zeroext %evl) {
1636; RV32-LABEL: vpreduce_mul_v16i8:
1637; RV32:       # %bb.0:
1638; RV32-NEXT:    addi sp, sp, -16
1639; RV32-NEXT:    .cfi_def_cfa_offset 16
1640; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
1641; RV32-NEXT:    .cfi_offset ra, -4
1642; RV32-NEXT:    mv a2, a0
1643; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1644; RV32-NEXT:    vid.v v12
1645; RV32-NEXT:    vmsltu.vx v9, v12, a1
1646; RV32-NEXT:    vmand.mm v0, v9, v0
1647; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
1648; RV32-NEXT:    vmv.v.i v9, 1
1649; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
1650; RV32-NEXT:    vslidedown.vi v9, v8, 8
1651; RV32-NEXT:    vmul.vv v8, v8, v9
1652; RV32-NEXT:    vslidedown.vi v9, v8, 4
1653; RV32-NEXT:    vmul.vv v8, v8, v9
1654; RV32-NEXT:    vslidedown.vi v9, v8, 2
1655; RV32-NEXT:    vmul.vv v8, v8, v9
1656; RV32-NEXT:    vrgather.vi v9, v8, 1
1657; RV32-NEXT:    vmul.vv v8, v8, v9
1658; RV32-NEXT:    vmv.x.s a0, v8
1659; RV32-NEXT:    mv a1, a2
1660; RV32-NEXT:    call __mulsi3
1661; RV32-NEXT:    slli a0, a0, 24
1662; RV32-NEXT:    srai a0, a0, 24
1663; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
1664; RV32-NEXT:    .cfi_restore ra
1665; RV32-NEXT:    addi sp, sp, 16
1666; RV32-NEXT:    .cfi_def_cfa_offset 0
1667; RV32-NEXT:    ret
1668;
1669; RV64-LABEL: vpreduce_mul_v16i8:
1670; RV64:       # %bb.0:
1671; RV64-NEXT:    addi sp, sp, -16
1672; RV64-NEXT:    .cfi_def_cfa_offset 16
1673; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
1674; RV64-NEXT:    .cfi_offset ra, -8
1675; RV64-NEXT:    mv a2, a0
1676; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
1677; RV64-NEXT:    vid.v v12
1678; RV64-NEXT:    vmsltu.vx v9, v12, a1
1679; RV64-NEXT:    vmand.mm v0, v9, v0
1680; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
1681; RV64-NEXT:    vmv.v.i v9, 1
1682; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
1683; RV64-NEXT:    vslidedown.vi v9, v8, 8
1684; RV64-NEXT:    vmul.vv v8, v8, v9
1685; RV64-NEXT:    vslidedown.vi v9, v8, 4
1686; RV64-NEXT:    vmul.vv v8, v8, v9
1687; RV64-NEXT:    vslidedown.vi v9, v8, 2
1688; RV64-NEXT:    vmul.vv v8, v8, v9
1689; RV64-NEXT:    vrgather.vi v9, v8, 1
1690; RV64-NEXT:    vmul.vv v8, v8, v9
1691; RV64-NEXT:    vmv.x.s a0, v8
1692; RV64-NEXT:    mv a1, a2
1693; RV64-NEXT:    call __muldi3
1694; RV64-NEXT:    slli a0, a0, 56
1695; RV64-NEXT:    srai a0, a0, 56
1696; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
1697; RV64-NEXT:    .cfi_restore ra
1698; RV64-NEXT:    addi sp, sp, 16
1699; RV64-NEXT:    .cfi_def_cfa_offset 0
1700; RV64-NEXT:    ret
1701  %r = call i8 @llvm.vp.reduce.mul.v16i8(i8 %s, <16 x i8> %v, <16 x i1> %m, i32 %evl)
1702  ret i8 %r
1703}
1704
1705declare i8 @llvm.vp.reduce.mul.v32i8(i8, <32 x i8>, <32 x i1>, i32)
1706
1707define signext i8 @vpreduce_mul_v32i8(i8 signext %s, <32 x i8> %v, <32 x i1> %m, i32 zeroext %evl) {
1708; RV32-LABEL: vpreduce_mul_v32i8:
1709; RV32:       # %bb.0:
1710; RV32-NEXT:    addi sp, sp, -16
1711; RV32-NEXT:    .cfi_def_cfa_offset 16
1712; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
1713; RV32-NEXT:    .cfi_offset ra, -4
1714; RV32-NEXT:    mv a2, a0
1715; RV32-NEXT:    li a0, 32
1716; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
1717; RV32-NEXT:    vid.v v16
1718; RV32-NEXT:    vmsltu.vx v10, v16, a1
1719; RV32-NEXT:    vmand.mm v0, v10, v0
1720; RV32-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
1721; RV32-NEXT:    vmv.v.i v10, 1
1722; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
1723; RV32-NEXT:    vslidedown.vi v10, v8, 16
1724; RV32-NEXT:    vmul.vv v8, v8, v10
1725; RV32-NEXT:    vslidedown.vi v10, v8, 8
1726; RV32-NEXT:    vmul.vv v8, v8, v10
1727; RV32-NEXT:    vslidedown.vi v10, v8, 4
1728; RV32-NEXT:    vmul.vv v8, v8, v10
1729; RV32-NEXT:    vslidedown.vi v10, v8, 2
1730; RV32-NEXT:    vmul.vv v8, v8, v10
1731; RV32-NEXT:    vrgather.vi v10, v8, 1
1732; RV32-NEXT:    vmul.vv v8, v8, v10
1733; RV32-NEXT:    vmv.x.s a0, v8
1734; RV32-NEXT:    mv a1, a2
1735; RV32-NEXT:    call __mulsi3
1736; RV32-NEXT:    slli a0, a0, 24
1737; RV32-NEXT:    srai a0, a0, 24
1738; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
1739; RV32-NEXT:    .cfi_restore ra
1740; RV32-NEXT:    addi sp, sp, 16
1741; RV32-NEXT:    .cfi_def_cfa_offset 0
1742; RV32-NEXT:    ret
1743;
1744; RV64-LABEL: vpreduce_mul_v32i8:
1745; RV64:       # %bb.0:
1746; RV64-NEXT:    addi sp, sp, -16
1747; RV64-NEXT:    .cfi_def_cfa_offset 16
1748; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
1749; RV64-NEXT:    .cfi_offset ra, -8
1750; RV64-NEXT:    mv a2, a0
1751; RV64-NEXT:    li a0, 32
1752; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
1753; RV64-NEXT:    vid.v v16
1754; RV64-NEXT:    vmsltu.vx v10, v16, a1
1755; RV64-NEXT:    vmand.mm v0, v10, v0
1756; RV64-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
1757; RV64-NEXT:    vmv.v.i v10, 1
1758; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
1759; RV64-NEXT:    vslidedown.vi v10, v8, 16
1760; RV64-NEXT:    vmul.vv v8, v8, v10
1761; RV64-NEXT:    vslidedown.vi v10, v8, 8
1762; RV64-NEXT:    vmul.vv v8, v8, v10
1763; RV64-NEXT:    vslidedown.vi v10, v8, 4
1764; RV64-NEXT:    vmul.vv v8, v8, v10
1765; RV64-NEXT:    vslidedown.vi v10, v8, 2
1766; RV64-NEXT:    vmul.vv v8, v8, v10
1767; RV64-NEXT:    vrgather.vi v10, v8, 1
1768; RV64-NEXT:    vmul.vv v8, v8, v10
1769; RV64-NEXT:    vmv.x.s a0, v8
1770; RV64-NEXT:    mv a1, a2
1771; RV64-NEXT:    call __muldi3
1772; RV64-NEXT:    slli a0, a0, 56
1773; RV64-NEXT:    srai a0, a0, 56
1774; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
1775; RV64-NEXT:    .cfi_restore ra
1776; RV64-NEXT:    addi sp, sp, 16
1777; RV64-NEXT:    .cfi_def_cfa_offset 0
1778; RV64-NEXT:    ret
1779  %r = call i8 @llvm.vp.reduce.mul.v32i8(i8 %s, <32 x i8> %v, <32 x i1> %m, i32 %evl)
1780  ret i8 %r
1781}
1782
1783declare i8 @llvm.vp.reduce.mul.v64i8(i8, <64 x i8>, <64 x i1>, i32)
1784
1785define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, i32 zeroext %evl) {
1786; RV32-LABEL: vpreduce_mul_v64i8:
1787; RV32:       # %bb.0:
1788; RV32-NEXT:    addi sp, sp, -16
1789; RV32-NEXT:    .cfi_def_cfa_offset 16
1790; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
1791; RV32-NEXT:    .cfi_offset ra, -4
1792; RV32-NEXT:    mv a2, a0
1793; RV32-NEXT:    li a0, 32
1794; RV32-NEXT:    lui a3, %hi(.LCPI72_0)
1795; RV32-NEXT:    addi a3, a3, %lo(.LCPI72_0)
1796; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
1797; RV32-NEXT:    vle8.v v12, (a3)
1798; RV32-NEXT:    vid.v v16
1799; RV32-NEXT:    vmsltu.vx v14, v16, a1
1800; RV32-NEXT:    li a3, 64
1801; RV32-NEXT:    vsext.vf4 v16, v12
1802; RV32-NEXT:    vmsltu.vx v12, v16, a1
1803; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
1804; RV32-NEXT:    vslideup.vi v14, v12, 4
1805; RV32-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
1806; RV32-NEXT:    vmand.mm v0, v14, v0
1807; RV32-NEXT:    vmv.v.i v12, 1
1808; RV32-NEXT:    vmerge.vvm v8, v12, v8, v0
1809; RV32-NEXT:    vslidedown.vx v12, v8, a0
1810; RV32-NEXT:    vmul.vv v8, v8, v12
1811; RV32-NEXT:    vslidedown.vi v12, v8, 16
1812; RV32-NEXT:    vmul.vv v8, v8, v12
1813; RV32-NEXT:    vslidedown.vi v12, v8, 8
1814; RV32-NEXT:    vmul.vv v8, v8, v12
1815; RV32-NEXT:    vslidedown.vi v12, v8, 4
1816; RV32-NEXT:    vmul.vv v8, v8, v12
1817; RV32-NEXT:    vslidedown.vi v12, v8, 2
1818; RV32-NEXT:    vmul.vv v8, v8, v12
1819; RV32-NEXT:    vrgather.vi v12, v8, 1
1820; RV32-NEXT:    vmul.vv v8, v8, v12
1821; RV32-NEXT:    vmv.x.s a0, v8
1822; RV32-NEXT:    mv a1, a2
1823; RV32-NEXT:    call __mulsi3
1824; RV32-NEXT:    slli a0, a0, 24
1825; RV32-NEXT:    srai a0, a0, 24
1826; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
1827; RV32-NEXT:    .cfi_restore ra
1828; RV32-NEXT:    addi sp, sp, 16
1829; RV32-NEXT:    .cfi_def_cfa_offset 0
1830; RV32-NEXT:    ret
1831;
1832; RV64-LABEL: vpreduce_mul_v64i8:
1833; RV64:       # %bb.0:
1834; RV64-NEXT:    addi sp, sp, -16
1835; RV64-NEXT:    .cfi_def_cfa_offset 16
1836; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
1837; RV64-NEXT:    .cfi_offset ra, -8
1838; RV64-NEXT:    mv a2, a0
1839; RV64-NEXT:    li a0, 32
1840; RV64-NEXT:    lui a3, %hi(.LCPI72_0)
1841; RV64-NEXT:    addi a3, a3, %lo(.LCPI72_0)
1842; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
1843; RV64-NEXT:    vle8.v v12, (a3)
1844; RV64-NEXT:    vid.v v16
1845; RV64-NEXT:    vmsltu.vx v14, v16, a1
1846; RV64-NEXT:    li a3, 64
1847; RV64-NEXT:    vsext.vf4 v16, v12
1848; RV64-NEXT:    vmsltu.vx v12, v16, a1
1849; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
1850; RV64-NEXT:    vslideup.vi v14, v12, 4
1851; RV64-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
1852; RV64-NEXT:    vmand.mm v0, v14, v0
1853; RV64-NEXT:    vmv.v.i v12, 1
1854; RV64-NEXT:    vmerge.vvm v8, v12, v8, v0
1855; RV64-NEXT:    vslidedown.vx v12, v8, a0
1856; RV64-NEXT:    vmul.vv v8, v8, v12
1857; RV64-NEXT:    vslidedown.vi v12, v8, 16
1858; RV64-NEXT:    vmul.vv v8, v8, v12
1859; RV64-NEXT:    vslidedown.vi v12, v8, 8
1860; RV64-NEXT:    vmul.vv v8, v8, v12
1861; RV64-NEXT:    vslidedown.vi v12, v8, 4
1862; RV64-NEXT:    vmul.vv v8, v8, v12
1863; RV64-NEXT:    vslidedown.vi v12, v8, 2
1864; RV64-NEXT:    vmul.vv v8, v8, v12
1865; RV64-NEXT:    vrgather.vi v12, v8, 1
1866; RV64-NEXT:    vmul.vv v8, v8, v12
1867; RV64-NEXT:    vmv.x.s a0, v8
1868; RV64-NEXT:    mv a1, a2
1869; RV64-NEXT:    call __muldi3
1870; RV64-NEXT:    slli a0, a0, 56
1871; RV64-NEXT:    srai a0, a0, 56
1872; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
1873; RV64-NEXT:    .cfi_restore ra
1874; RV64-NEXT:    addi sp, sp, 16
1875; RV64-NEXT:    .cfi_def_cfa_offset 0
1876; RV64-NEXT:    ret
1877  %r = call i8 @llvm.vp.reduce.mul.v64i8(i8 %s, <64 x i8> %v, <64 x i1> %m, i32 %evl)
1878  ret i8 %r
1879}
1880
1881; Test where the start value is the first element of the vector being reduced.
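; Because the start value is lane 0 of the reduction input, no separate
; vmv.s.x is needed: the input register itself serves as the scalar operand
; of vredand.vs.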
1882define zeroext i8 @front_ele_v4i8(<4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) {
1883; CHECK-LABEL: front_ele_v4i8:
1884; CHECK:       # %bb.0:
1885; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
1886; CHECK-NEXT:    vredand.vs v8, v8, v8, v0.t
1887; CHECK-NEXT:    vmv.x.s a0, v8
1888; CHECK-NEXT:    andi a0, a0, 255
1889; CHECK-NEXT:    ret
1890  %s = extractelement <4 x i8> %v, i64 0
1891  %r = call i8 @llvm.vp.reduce.and.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl)
1892  ret i8 %r
1893}
1894
1895; Test where the start value is the first element of a vector longer than M1.
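; The source now occupies an LMUL=2 register group, but the scalar start
; operand of a reduction is still only element 0 of vs1, so v8 can again be
; reused without a separate vmv.s.x.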
1896declare i8 @llvm.vp.reduce.and.v32i8(i8, <32 x i8>, <32 x i1>, i32)
1897define zeroext i8 @front_ele_v32i8(<32 x i8> %v, <32 x i1> %m, i32 zeroext %evl) {
1898; CHECK-LABEL: front_ele_v32i8:
1899; CHECK:       # %bb.0:
1900; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
1901; CHECK-NEXT:    vredand.vs v8, v8, v8, v0.t
1902; CHECK-NEXT:    vmv.x.s a0, v8
1903; CHECK-NEXT:    andi a0, a0, 255
1904; CHECK-NEXT:    ret
1905  %s = extractelement <32 x i8> %v, i64 0
1906  %r = call i8 @llvm.vp.reduce.and.v32i8(i8 %s, <32 x i8> %v, <32 x i1> %m, i32 %evl)
1907  ret i8 %r
1908}
1909