; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zbb -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s

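; Check that a scalar binary op applied to a vector reduction result is folded
; into the reduction's start value where profitable.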
define i64 @reduce_add(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_add:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %res = add i64 %rdx, %x
  ret i64 %res
}

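; A constant scalar operand is likewise folded as the start value of the reduction.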
define i64 @reduce_add2(<4 x i64> %v) {
; CHECK-LABEL: reduce_add2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, 8
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %res = add i64 %rdx, 8
  ret i64 %res
}

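; For and/or reductions the scalar operand is combined with a scalar and/or
; (or andi/ori) after the reduction instead of being folded into the start value.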
define i64 @reduce_and(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_and:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    and a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
  %res = and i64 %rdx, %x
  ret i64 %res
}

define i64 @reduce_and2(<4 x i64> %v) {
; CHECK-LABEL: reduce_and2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    andi a0, a0, 8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
  %res = and i64 %rdx, 8
  ret i64 %res
}

define i64 @reduce_or(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_or:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    or a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
  %res = or i64 %rdx, %x
  ret i64 %res
}

define i64 @reduce_or2(<4 x i64> %v) {
; CHECK-LABEL: reduce_or2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ori a0, a0, 8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
  %res = or i64 %rdx, 8
  ret i64 %res
}

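; The scalar xor operand is folded into the start value, as for add.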
define i64 @reduce_xor(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_xor:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
  %res = xor i64 %rdx, %x
  ret i64 %res
}

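; A scalar op of a different kind (an and after an xor reduction) stays scalar.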
define i64 @reduce_xor2(<4 x i64> %v) {
; CHECK-LABEL: reduce_xor2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    andi a0, a0, 8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
  %res = and i64 %rdx, 8
  ret i64 %res
}

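; umax/umin/smax/smin are combined with the scalar operand using the Zbb
; maxu/minu/max/min instructions after the reduction.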
define i64 @reduce_umax(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_umax:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    maxu a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umax.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_umax2(<4 x i64> %v) {
; CHECK-LABEL: reduce_umax2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    maxu a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umax.i64(i64 %rdx, i64 8)
  ret i64 %res
}

define i64 @reduce_umin(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_umin:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    minu a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umin.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_umin2(<4 x i64> %v) {
; CHECK-LABEL: reduce_umin2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    minu a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umin.i64(i64 %rdx, i64 8)
  ret i64 %res
}

define i64 @reduce_smax(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_smax:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    max a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smax.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_smax2(<4 x i64> %v) {
; CHECK-LABEL: reduce_smax2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    max a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smax.i64(i64 %rdx, i64 8)
  ret i64 %res
}

define i64 @reduce_smin(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_smin:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    min a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smin.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_smin2(<4 x i64> %v) {
; CHECK-LABEL: reduce_smin2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    min a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smin.i64(i64 %rdx, i64 8)
  ret i64 %res
}

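; With fast-math, the scalar addend is folded into the start value of the
; unordered fadd reduction.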
define float @reduce_fadd(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fadd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmv.s.f v9, fa0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float %x, <4 x float> %v)
  ret float %rdx
}

define float @reduce_fadd2(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fadd2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmv.s.f v9, fa0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %v)
  %res = fadd fast float %rdx, %x
  ret float %res
}

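; The second use of the reduction result prevents folding the scalar fadd into
; the start value.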
define float @reduce_fadd3(float %x, <4 x float> %v, ptr %rdxptr) {
; CHECK-LABEL: reduce_fadd3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa0
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %v)
  %res = fadd fast float %rdx, %x
  store float %rdx, ptr %rdxptr
  ret float %res
}

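; Two independent fadd reductions each fold their scalar addend into their own
; start value.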
define float @reduce_fadd4(float %x, float %y, <4 x float> %v, <4 x float> %w) {
; CHECK-LABEL: reduce_fadd4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmv.s.f v10, fa0
; CHECK-NEXT:    vfredusum.vs v8, v8, v10
; CHECK-NEXT:    vfmv.s.f v10, fa1
; CHECK-NEXT:    vfredusum.vs v9, v9, v10
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    fdiv.s fa0, fa5, fa4
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %v)
  %rdx2 = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %w)
  %res = fadd fast float %rdx, %x
  %res2 = fadd fast float %rdx2, %y
  %div = fdiv fast float %res, %res2
  ret float %div
}

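; fmax/fmin reductions combine with the scalar operand via scalar fmax.s/fmin.s.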
define float @reduce_fmax(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fmax:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfredmax.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    fmax.s fa0, fa0, fa5
; CHECK-NEXT:    ret
entry:
  %rdx = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
  %res = call float @llvm.maxnum.f32(float %x, float %rdx)
  ret float %res
}

define float @reduce_fmin(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fmin:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfredmin.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    fmin.s fa0, fa0, fa5
; CHECK-NEXT:    ret
entry:
  %rdx = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v)
  %res = call float @llvm.minnum.f32(float %x, float %rdx)
  ret float %res
}

; Function Attrs: nofree nosync nounwind readnone willreturn
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare i64 @llvm.umax.i64(i64, i64)
declare i64 @llvm.umin.i64(i64, i64)
declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)

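; Reduced test case; this should compile without crashing.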
define void @crash(<2 x i32> %0) {
; CHECK-LABEL: crash:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    sb a0, 0(zero)
; CHECK-NEXT:    ret
entry:
  %1 = extractelement <2 x i32> %0, i64 0
  %2 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> zeroinitializer)
  %3 = zext i16 %2 to i32
  %op.rdx = add i32 %1, %3
  %conv18.us = trunc i32 %op.rdx to i8
  store i8 %conv18.us, ptr null, align 1
  ret void
}
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

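; Two add reductions of different vectors are combined into a single reduction
; of the element-wise vector sum.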
define i64 @op_then_reduce(<4 x i64> %v, <4 x i64> %v2) {
; CHECK-LABEL: op_then_reduce:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %rdx2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v2)
  %res = add i64 %rdx1, %rdx2
  ret i64 %res
}

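; The result of the first reduction is used directly as the start value of the
; second, avoiding a scalar round trip.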
define i64 @two_reduce_scalar_bypass(<4 x i64> %v, <4 x i64> %v2) {
; CHECK-LABEL: two_reduce_scalar_bypass:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v12
; CHECK-NEXT:    vredsum.vs v8, v10, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
  %rdx2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v2)
  %res = add i64 %rdx1, %rdx2
  ret i64 %res
}

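; The i32 reduction result is zero-extended with slli/srli before it feeds the
; start value of the i64 reduction.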
define i64 @two_reduce_scalar_bypass_zext(<4 x i64> %v, <4 x i32> %v2) {
; CHECK-LABEL: two_reduce_scalar_bypass_zext:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v11, zero
; CHECK-NEXT:    vredsum.vs v10, v10, v11
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    slli a0, a0, 32
; CHECK-NEXT:    srli a0, a0, 32
; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %rdx2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v2)
  %rdx2.zext = zext i32 %rdx2 to i64
  %res = add i64 %rdx1, %rdx2.zext
  ret i64 %res
}

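; vmv.x.s sign-extends to XLEN, so the sign-extended i32 result feeds the i64
; reduction start value without extra instructions.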
define i64 @two_reduce_scalar_bypass_sext(<4 x i64> %v, <4 x i32> %v2) {
; CHECK-LABEL: two_reduce_scalar_bypass_sext:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v11, zero
; CHECK-NEXT:    vredsum.vs v10, v10, v11
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %rdx2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v2)
  %rdx2.sext = sext i32 %rdx2 to i64
  %res = add i64 %rdx1, %rdx2.sext
  ret i64 %res
}