; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -float-abi=hard -verify-machineinstrs %s -o - | FileCheck %s
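; Each function below performs two vector reductions and combines their
; results with the matching scalar operation, checking how MVE lowers the
; pair.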

define float @add_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmul_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmul.f32 q0, q0, q1
; CHECK-NEXT:    vmul.f32 q0, q0, q2
; CHECK-NEXT:    vmul.f32 s2, s2, s3
; CHECK-NEXT:    vmul.f32 s0, s0, s1
; CHECK-NEXT:    vmul.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

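; llvm.minnum/llvm.maxnum share minNum/maxNum semantics with the MVE
; vminnm/vmaxnm instructions, so the fmin/fmax reductions below should lower
; to those instructions throughout.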
define float @fmin_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmin_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnm.f32 q0, q0, q1
; CHECK-NEXT:    vminnm.f32 q0, q0, q2
; CHECK-NEXT:    vminnm.f32 s2, s2, s3
; CHECK-NEXT:    vminnm.f32 s0, s0, s1
; CHECK-NEXT:    vminnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmax_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmax_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnm.f32 q0, q0, q1
; CHECK-NEXT:    vmaxnm.f32 q0, q0, q2
; CHECK-NEXT:    vmaxnm.f32 s2, s2, s3
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}


define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vaddva.u32 r0, q2
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
  %r = add i32 %r1, %r2
  ret i32 %r
}

define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    bx lr
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

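; A <32 x i16> accumulator is wider than the 128-bit MVE registers, so (as
; the checked sequence shows) the zero-extended reduction below currently goes
; through the stack and is reloaded with widening vldrb.u16 loads.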
define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_v32i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    add r2, sp, #16
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q1, [r2]
; CHECK-NEXT:    vldrb.u16 q1, [r2]
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    vldrb.u16 q0, [r1, #8]
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    vldrb.u16 q0, [r2, #8]
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    vaddva.u8 r0, q2
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
  %ae = zext <32 x i8> %a to <32 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: mul_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmul.i32 q0, q0, q1
; CHECK-NEXT:    vmul.i32 q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    mul r1, r2, r3
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    ands r0, r1
; CHECK-NEXT:    and.w r1, r2, r3
; CHECK-NEXT:    ands r0, r1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: or_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vorr q0, q0, q1
; CHECK-NEXT:    vorr q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    orr.w r1, r2, r3
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: xor_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    veor q0, q0, q1
; CHECK-NEXT:    veor q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    eors r0, r1
; CHECK-NEXT:    eor.w r1, r2, r3
; CHECK-NEXT:    eors r0, r1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

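; In the min/max tests below, the scalar register seeded into vminv/vmaxv is
; the identity of the operation: all-ones for umin, zero for umax, INT_MAX
; for smin and INT_MIN for smax.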
define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmin.u32 q0, q0, q1
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    vmin.u32 q0, q0, q2
; CHECK-NEXT:    vminv.u32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmax.u32 q0, q0, q1
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    vmax.u32 q0, q0, q2
; CHECK-NEXT:    vmaxv.u32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmin.s32 q0, q0, q1
; CHECK-NEXT:    mvn r0, #-2147483648
; CHECK-NEXT:    vmin.s32 q0, q0, q2
; CHECK-NEXT:    vminv.s32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmax.s32 q0, q0, q1
; CHECK-NEXT:    mov.w r0, #-2147483648
; CHECK-NEXT:    vmax.s32 q0, q0, q2
; CHECK-NEXT:    vmaxv.s32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

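; The nested_* variants fold an extra scalar operand into each reduction
; before combining the two results.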
define float @nested_add_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_add_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    vadd.f32 s2, s4, s9
; CHECK-NEXT:    vadd.f32 s0, s0, s8
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd fast float %r2, %d
  %r = fadd fast float %a1, %a2
  ret float %r
}

define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_mul_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmul.f32 s6, s6, s7
; CHECK-NEXT:    vmul.f32 s4, s4, s5
; CHECK-NEXT:    vmul.f32 s2, s2, s3
; CHECK-NEXT:    vmul.f32 s0, s0, s1
; CHECK-NEXT:    vmul.f32 s4, s4, s6
; CHECK-NEXT:    vmul.f32 s0, s0, s2
; CHECK-NEXT:    vmul.f32 s2, s4, s9
; CHECK-NEXT:    vmul.f32 s0, s0, s8
; CHECK-NEXT:    vmul.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %a1 = fmul fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %a2 = fmul fast float %r2, %d
  %r = fmul fast float %a1, %a2
  ret float %r
}

define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vaddva.u32 r0, q1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_mul_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r8, r3, d2
; CHECK-NEXT:    vmov r4, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    mul r3, r8, r3
; CHECK-NEXT:    muls r5, r4, r5
; CHECK-NEXT:    mul r2, r12, lr
; CHECK-NEXT:    muls r7, r6, r7
; CHECK-NEXT:    muls r2, r3, r2
; CHECK-NEXT:    mul r3, r7, r5
; CHECK-NEXT:    muls r1, r2, r1
; CHECK-NEXT:    muls r0, r3, r0
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
  %a1 = mul i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
  %a2 = mul i32 %r2, %d
  %r = mul i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_and_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vmov r8, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    ands r2, r3
; CHECK-NEXT:    and.w r4, r12, lr
; CHECK-NEXT:    ands r2, r4
; CHECK-NEXT:    ands r1, r2
; CHECK-NEXT:    and.w r2, r8, r5
; CHECK-NEXT:    and.w r3, r6, r7
; CHECK-NEXT:    ands r2, r3
; CHECK-NEXT:    ands r0, r2
; CHECK-NEXT:    ands r0, r1
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  %a1 = and i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
  %a2 = and i32 %r2, %d
  %r = and i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_or_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vmov r8, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    orrs r2, r3
; CHECK-NEXT:    orr.w r4, r12, lr
; CHECK-NEXT:    orrs r2, r4
; CHECK-NEXT:    orrs r1, r2
; CHECK-NEXT:    orr.w r2, r8, r5
; CHECK-NEXT:    orr.w r3, r6, r7
; CHECK-NEXT:    orrs r2, r3
; CHECK-NEXT:    orrs r0, r2
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  %a1 = or i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
  %a2 = or i32 %r2, %d
  %r = or i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_xor_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vmov r8, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    eors r2, r3
; CHECK-NEXT:    eor.w r4, r12, lr
; CHECK-NEXT:    eors r2, r4
; CHECK-NEXT:    eors r1, r2
; CHECK-NEXT:    eor.w r2, r8, r5
; CHECK-NEXT:    eor.w r3, r6, r7
; CHECK-NEXT:    eors r2, r3
; CHECK-NEXT:    eors r0, r2
; CHECK-NEXT:    eors r0, r1
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  %a1 = xor i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
  %a2 = xor i32 %r2, %d
  %r = xor i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mvn r3, #-2147483648
; CHECK-NEXT:    mvn r2, #-2147483648
; CHECK-NEXT:    vminv.s32 r3, q1
; CHECK-NEXT:    vminv.s32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, lt
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, lt
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, lt
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov.w r3, #-2147483648
; CHECK-NEXT:    mov.w r2, #-2147483648
; CHECK-NEXT:    vmaxv.s32 r3, q1
; CHECK-NEXT:    vmaxv.s32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, gt
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, gt
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, gt
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov.w r3, #-1
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    vminv.u32 r3, q1
; CHECK-NEXT:    vminv.u32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, lo
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, lo
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, lo
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    vmaxv.u32 r3, q1
; CHECK-NEXT:    vmaxv.u32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, hi
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, hi
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, hi
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmin_float:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnm.f32 s2, s2, s3
; CHECK-NEXT:    vminnm.f32 s0, s0, s1
; CHECK-NEXT:    vminnm.f32 s0, s0, s2
; CHECK-NEXT:    vminnm.f32 s2, s6, s7
; CHECK-NEXT:    vminnm.f32 s4, s4, s5
; CHECK-NEXT:    vminnm.f32 s0, s0, s8
; CHECK-NEXT:    vminnm.f32 s2, s4, s2
; CHECK-NEXT:    vminnm.f32 s2, s2, s9
; CHECK-NEXT:    vminnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %a1 = call float @llvm.minnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %a2 = call float @llvm.minnum.f32(float %r2, float %d)
  %r = call float @llvm.minnum.f32(float %a1, float %a2)
  ret float %r
}

define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmax_float:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnm.f32 s2, s2, s3
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
; CHECK-NEXT:    vmaxnm.f32 s2, s6, s7
; CHECK-NEXT:    vmaxnm.f32 s4, s4, s5
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s8
; CHECK-NEXT:    vmaxnm.f32 s2, s4, s2
; CHECK-NEXT:    vmaxnm.f32 s2, s2, s9
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
  %r = call float @llvm.maxnum.f32(float %a1, float %a2)
  ret float %r
}

declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)