; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=false | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=true | FileCheck %s --check-prefixes=CHECK,CHECK-GI

define float @add_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: add_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    faddp v1.4s, v2.4s, v2.4s
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

define float @add_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: add_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mov d3, v2.d[1]
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    fmul v1.2s, v2.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

define float @fmin_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmin_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fminnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmin_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fminnm v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fminnmv s1, v2.4s
; CHECK-GI-NEXT:    fminnmv s0, v0.4s
; CHECK-GI-NEXT:    fminnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmin_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmin_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fminnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmin_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fminnmv s0, v0.4s
; CHECK-GI-NEXT:    fminnmv s1, v1.4s
; CHECK-GI-NEXT:    fminnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmax_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmax_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fmaxnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmax_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fmaxnmv s1, v2.4s
; CHECK-GI-NEXT:    fmaxnmv s0, v0.4s
; CHECK-GI-NEXT:    fmaxnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmax_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmax_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmaxnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmax_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmaxnmv s0, v0.4s
; CHECK-GI-NEXT:    fmaxnmv s1, v1.4s
; CHECK-GI-NEXT:    fmaxnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fminimum_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fminimum_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fminv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fminimum_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fminv s1, v2.4s
; CHECK-GI-NEXT:    fminv s0, v0.4s
; CHECK-GI-NEXT:    fmin s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minimum.f32(float %r1, float %r2)
  ret float %r
}

define float @fminimum_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fminimum_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fminv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fminimum_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fminv s0, v0.4s
; CHECK-GI-NEXT:    fminv s1, v1.4s
; CHECK-GI-NEXT:    fmin s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minimum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmaximum_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmaximum_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fmaxv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmaximum_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fmaxv s1, v2.4s
; CHECK-GI-NEXT:    fmaxv s0, v0.4s
; CHECK-GI-NEXT:    fmax s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmaximum_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmaximum_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmaxv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmaximum_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmaxv s0, v0.4s
; CHECK-GI-NEXT:    fmaxv s1, v1.4s
; CHECK-GI-NEXT:    fmax s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}

; These next two tests have mismatched minnum/minimum combinations which should not be combined into a single reduction.
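; fminimumnum_f32 reduces with llvm.vector.reduce.fminimum but combines the results with llvm.minnum,
; and fmaxnumimum_f32 reduces with llvm.vector.reduce.fmax but combines the results with llvm.maximum.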
define float @fminimumnum_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fminimumnum_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminv s0, v0.4s
; CHECK-NEXT:    fminv s1, v1.4s
; CHECK-NEXT:    fminnm s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmaxnumimum_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmaxnumimum_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmv s0, v0.4s
; CHECK-NEXT:    fmaxnmv s1, v1.4s
; CHECK-NEXT:    fmax s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}


define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: add_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    addv s1, v2.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
  %r = add i32 %r1, %r2
  ret i32 %r
}

define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: add_ext_i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
; CHECK-SD-NEXT:    uadalp v1.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v1.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_ext_i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    uaddlv h1, v1.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: add_ext_v32i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    uaddl2 v3.8h, v0.16b, v1.16b
; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT:    add v0.8h, v0.8h, v3.8h
; CHECK-SD-NEXT:    uadalp v0.8h, v2.16b
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_ext_v32i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    uaddlv h1, v1.16b
; CHECK-GI-NEXT:    uaddlv h2, v2.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    fmov w9, s2
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %ae = zext <32 x i8> %a to <32 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: mul_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    mul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    mov w8, v0.s[1]
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mul w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    mov d4, v1.d[1]
; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v3.2s
; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v4.2s
; CHECK-GI-NEXT:    mov d3, v2.d[1]
; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mul v1.2s, v2.2s, v3.2s
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    mul w8, w10, w8
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mul w9, w10, w9
; CHECK-GI-NEXT:    mul w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

define i32 @mul_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: mul_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    mov w8, v0.s[1]
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mul w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    mul w8, w10, w8
; CHECK-GI-NEXT:    mul w9, w11, w9
; CHECK-GI-NEXT:    mul w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: and_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    and w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: and_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    mov d1, v2.d[1]
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    and v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    fmov w11, s0
; CHECK-GI-NEXT:    and w8, w10, w8
; CHECK-GI-NEXT:    and w8, w11, w8
; CHECK-GI-NEXT:    and w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

define i32 @and_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: and_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    and w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: and_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    and v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    and w8, w10, w8
; CHECK-GI-NEXT:    and w9, w11, w9
; CHECK-GI-NEXT:    and w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: or_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    orr w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: or_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    mov d1, v2.d[1]
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    orr v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    fmov w11, s0
; CHECK-GI-NEXT:    orr w8, w10, w8
; CHECK-GI-NEXT:    orr w8, w11, w8
; CHECK-GI-NEXT:    orr w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

define i32 @or_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: or_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    orr w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: or_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    orr v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    orr w8, w10, w8
; CHECK-GI-NEXT:    orr w9, w11, w9
; CHECK-GI-NEXT:    orr w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: xor_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    eor v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    eor w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: xor_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    mov d1, v2.d[1]
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    eor v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    fmov w11, s0
; CHECK-GI-NEXT:    eor w8, w10, w8
; CHECK-GI-NEXT:    eor w8, w11, w8
; CHECK-GI-NEXT:    eor w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

define i32 @xor_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: xor_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    eor w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: xor_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    eor v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    eor w8, w10, w8
; CHECK-GI-NEXT:    eor w9, w11, w9
; CHECK-GI-NEXT:    eor w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umin_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    umin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    uminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umin_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    uminv s1, v2.4s
; CHECK-GI-NEXT:    uminv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lo
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umin_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umin_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    uminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umin_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uminv s0, v0.4s
; CHECK-GI-NEXT:    uminv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lo
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umax_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    umax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    umaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umax_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    umaxv s1, v2.4s
; CHECK-GI-NEXT:    umaxv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, hi
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umax_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umax_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    umaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umax_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    umaxv s0, v0.4s
; CHECK-GI-NEXT:    umaxv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, hi
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smin_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    smin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    sminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smin_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    sminv s1, v2.4s
; CHECK-GI-NEXT:    sminv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smin_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smin_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    sminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smin_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    sminv s0, v0.4s
; CHECK-GI-NEXT:    sminv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smax_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    smax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    smaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smax_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    smaxv s1, v2.4s
; CHECK-GI-NEXT:    smaxv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, gt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smax_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smax_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    smaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smax_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    smaxv s0, v0.4s
; CHECK-GI-NEXT:    smaxv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, gt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}


define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fadd_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s1, v1.2s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    fadd s1, s1, s3
; CHECK-SD-NEXT:    fadd s0, s0, s2
; CHECK-SD-NEXT:    fadd s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_fadd_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    fadd s0, s0, s2
; CHECK-GI-NEXT:    fadd s1, s1, s3
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd fast float %r2, %d
  %r = fadd fast float %a1, %a2
  ret float %r
}

define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fadd_f32_slow:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mov s4, v1.s[2]
; CHECK-SD-NEXT:    mov s5, v0.s[2]
; CHECK-SD-NEXT:    faddp s6, v0.2s
; CHECK-SD-NEXT:    faddp s7, v1.2s
; CHECK-SD-NEXT:    mov s1, v1.s[3]
; CHECK-SD-NEXT:    mov s0, v0.s[3]
; CHECK-SD-NEXT:    fadd s5, s6, s5
; CHECK-SD-NEXT:    fadd s4, s7, s4
; CHECK-SD-NEXT:    fadd s0, s5, s0
; CHECK-SD-NEXT:    fadd s1, s4, s1
; CHECK-SD-NEXT:    fadd s0, s0, s2
; CHECK-SD-NEXT:    fadd s1, s1, s3
; CHECK-SD-NEXT:    fadd s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_fadd_f32_slow:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov s4, v0.s[2]
; CHECK-GI-NEXT:    faddp s5, v0.2s
; CHECK-GI-NEXT:    mov s6, v1.s[2]
; CHECK-GI-NEXT:    faddp s7, v1.2s
; CHECK-GI-NEXT:    mov s0, v0.s[3]
; CHECK-GI-NEXT:    mov s1, v1.s[3]
; CHECK-GI-NEXT:    fadd s4, s5, s4
; CHECK-GI-NEXT:    fadd s5, s7, s6
; CHECK-GI-NEXT:    fadd s0, s4, s0
; CHECK-GI-NEXT:    fadd s1, s5, s1
; CHECK-GI-NEXT:    fadd s0, s0, s2
; CHECK-GI-NEXT:    fadd s1, s1, s3
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd float %r1, %c
  %r2 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd float %r2, %d
  %r = fadd float %a1, %a2
  ret float %r
}

define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_mul_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v1.2s, v1.2s, v4.2s
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v5.2s
; CHECK-SD-NEXT:    fmul s1, s1, v1.s[1]
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    fmul s1, s1, s3
; CHECK-SD-NEXT:    fmul s0, s0, s2
; CHECK-SD-NEXT:    fmul s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_mul_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d4, v0.d[1]
; CHECK-GI-NEXT:    mov d5, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v4.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v5.2s
; CHECK-GI-NEXT:    mov s4, v0.s[1]
; CHECK-GI-NEXT:    mov s5, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s4
; CHECK-GI-NEXT:    fmul s1, s1, s5
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %a1 = fmul fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %a2 = fmul fast float %r2, %d
  %r = fmul fast float %a1, %a2
  ret float %r
}

define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    addv s1, v1.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    add w9, w9, w0
; CHECK-SD-NEXT:    add w8, w8, w1
; CHECK-SD-NEXT:    add w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_add_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w0
; CHECK-GI-NEXT:    add w9, w9, w1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_c1_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    addv s1, v1.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    add w9, w0, w9
; CHECK-SD-NEXT:    add w8, w8, w1
; CHECK-SD-NEXT:    add w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_add_c1_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w0, w8
; CHECK-GI-NEXT:    add w9, w9, w1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %c, %r1
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_c2_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    addv s1, v1.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    add w9, w9, w0
; CHECK-SD-NEXT:    add w8, w1, w8
; CHECK-SD-NEXT:    add w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_add_c2_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w0
; CHECK-GI-NEXT:    add w9, w1, w9
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %d, %r2
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-SD-LABEL: nested_add_manyreduct_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    add v1.4s, v1.4s, v3.4s
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_add_manyreduct_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    addv s2, v2.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    addv s3, v3.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s2
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    fmov w11, s3
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    add w9, w10, w11
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %r3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
  %a1 = add i32 %r1, %r3
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %r4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
  %a2 = add i32 %r2, %r4
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_mul_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v3.2s
; CHECK-SD-NEXT:    mul v1.2s, v1.2s, v2.2s
; CHECK-SD-NEXT:    mov w8, v0.s[1]
; CHECK-SD-NEXT:    fmov w10, s0
; CHECK-SD-NEXT:    mov w9, v1.s[1]
; CHECK-SD-NEXT:    mul w8, w10, w8
; CHECK-SD-NEXT:    fmov w10, s1
; CHECK-SD-NEXT:    mul w9, w10, w9
; CHECK-SD-NEXT:    mul w8, w8, w0
; CHECK-SD-NEXT:    mul w9, w9, w1
; CHECK-SD-NEXT:    mul w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_mul_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    mul w8, w10, w8
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mul w9, w10, w9
; CHECK-GI-NEXT:    mul w8, w8, w0
; CHECK-GI-NEXT:    mul w9, w9, w1
; CHECK-GI-NEXT:    mul w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
  %a1 = mul i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
  %a2 = mul i32 %r2, %d
  %r = mul i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_and_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v3.8b
; CHECK-SD-NEXT:    fmov x8, d1
; CHECK-SD-NEXT:    fmov x9, d0
; CHECK-SD-NEXT:    lsr x10, x9, #32
; CHECK-SD-NEXT:    lsr x11, x8, #32
; CHECK-SD-NEXT:    and w9, w9, w0
; CHECK-SD-NEXT:    and w8, w8, w1
; CHECK-SD-NEXT:    and w9, w9, w10
; CHECK-SD-NEXT:    and w8, w8, w11
; CHECK-SD-NEXT:    and w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_and_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    and v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    and w10, w10, w0
; CHECK-GI-NEXT:    and w11, w11, w1
; CHECK-GI-NEXT:    and w8, w10, w8
; CHECK-GI-NEXT:    and w9, w11, w9
; CHECK-GI-NEXT:    and w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  %a1 = and i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
  %a2 = and i32 %r2, %d
  %r = and i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_or_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    orr v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v3.8b
; CHECK-SD-NEXT:    fmov x8, d1
; CHECK-SD-NEXT:    fmov x9, d0
; CHECK-SD-NEXT:    lsr x10, x9, #32
; CHECK-SD-NEXT:    lsr x11, x8, #32
; CHECK-SD-NEXT:    orr w9, w9, w0
; CHECK-SD-NEXT:    orr w8, w8, w1
; CHECK-SD-NEXT:    orr w9, w9, w10
; CHECK-SD-NEXT:    orr w8, w8, w11
; CHECK-SD-NEXT:    orr w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_or_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    orr v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    orr w10, w10, w0
; CHECK-GI-NEXT:    orr w11, w11, w1
; CHECK-GI-NEXT:    orr w8, w10, w8
; CHECK-GI-NEXT:    orr w9, w11, w9
; CHECK-GI-NEXT:    orr w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  %a1 = or i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
  %a2 = or i32 %r2, %d
  %r = or i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_xor_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    eor v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT:    eor v0.8b, v0.8b, v3.8b
; CHECK-SD-NEXT:    fmov x8, d1
; CHECK-SD-NEXT:    fmov x9, d0
; CHECK-SD-NEXT:    lsr x10, x9, #32
; CHECK-SD-NEXT:    lsr x11, x8, #32
; CHECK-SD-NEXT:    eor w9, w9, w0
; CHECK-SD-NEXT:    eor w8, w8, w1
; CHECK-SD-NEXT:    eor w9, w9, w10
; CHECK-SD-NEXT:    eor w8, w8, w11
; CHECK-SD-NEXT:    eor w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_xor_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    eor v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    eor w10, w10, w0
; CHECK-GI-NEXT:    eor w11, w11, w1
; CHECK-GI-NEXT:    eor w8, w10, w8
; CHECK-GI-NEXT:    eor w9, w11, w9
; CHECK-GI-NEXT:    eor w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  %a1 = xor i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
  %a2 = xor i32 %r2, %d
  %r = xor i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_smin_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    sminv s0, v0.4s
; CHECK-SD-NEXT:    sminv s1, v1.4s
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    cmp w9, w0
; CHECK-SD-NEXT:    csel w9, w9, w0, lt
; CHECK-SD-NEXT:    cmp w8, w1
; CHECK-SD-NEXT:    csel w8, w8, w1, lt
; CHECK-SD-NEXT:    cmp w9, w8
; CHECK-SD-NEXT:    csel w0, w9, w8, lt
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_smin_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    sminv s0, v0.4s
; CHECK-GI-NEXT:    sminv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w0
; CHECK-GI-NEXT:    csel w8, w8, w0, lt
; CHECK-GI-NEXT:    cmp w9, w1
; CHECK-GI-NEXT:    csel w9, w9, w1, lt
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    csel w0, w8, w9, lt
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_smax_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smaxv s0, v0.4s
; CHECK-SD-NEXT:    smaxv s1, v1.4s
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    cmp w9, w0
; CHECK-SD-NEXT:    csel w9, w9, w0, gt
; CHECK-SD-NEXT:    cmp w8, w1
; CHECK-SD-NEXT:    csel w8, w8, w1, gt
; CHECK-SD-NEXT:    cmp w9, w8
; CHECK-SD-NEXT:    csel w0, w9, w8, gt
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_smax_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    smaxv s0, v0.4s
; CHECK-GI-NEXT:    smaxv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w0
; CHECK-GI-NEXT:    csel w8, w8, w0, gt
; CHECK-GI-NEXT:    cmp w9, w1
; CHECK-GI-NEXT:    csel w9, w9, w1, gt
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    csel w0, w8, w9, gt
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_umin_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    uminv s0, v0.4s
; CHECK-SD-NEXT:    uminv s1, v1.4s
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    cmp w9, w0
; CHECK-SD-NEXT:    csel w9, w9, w0, lo
; CHECK-SD-NEXT:    cmp w8, w1
; CHECK-SD-NEXT:    csel w8, w8, w1, lo
; CHECK-SD-NEXT:    cmp w9, w8
; CHECK-SD-NEXT:    csel w0, w9, w8, lo
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_umin_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uminv s0, v0.4s
; CHECK-GI-NEXT:    uminv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w0
; CHECK-GI-NEXT:    csel w8, w8, w0, lo
; CHECK-GI-NEXT:    cmp w9, w1
; CHECK-GI-NEXT:    csel w9, w9, w1, lo
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    csel w0, w8, w9, lo
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_umax_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umaxv s0, v0.4s
; CHECK-SD-NEXT:    umaxv s1, v1.4s
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    cmp w9, w0
; CHECK-SD-NEXT:    csel w9, w9, w0, hi
; CHECK-SD-NEXT:    cmp w8, w1
; CHECK-SD-NEXT:    csel w8, w8, w1, hi
; CHECK-SD-NEXT:    cmp w9, w8
; CHECK-SD-NEXT:    csel w0, w9, w8, hi
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_umax_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    umaxv s0, v0.4s
; CHECK-GI-NEXT:    umaxv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w0
; CHECK-GI-NEXT:    csel w8, w8, w0, hi
; CHECK-GI-NEXT:    cmp w9, w1
; CHECK-GI-NEXT:    csel w9, w9, w1, hi
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    csel w0, w8, w9, hi
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fmin_float:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fminnmv s1, v1.4s
; CHECK-SD-NEXT:    fminnmv s0, v0.4s
; CHECK-SD-NEXT:    fminnm s1, s1, s3
; CHECK-SD-NEXT:    fminnm s0, s0, s2
; CHECK-SD-NEXT:    fminnm s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_fmin_float:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fminnmv s0, v0.4s
; CHECK-GI-NEXT:    fminnmv s1, v1.4s
; CHECK-GI-NEXT:    fminnm s0, s0, s2
; CHECK-GI-NEXT:    fminnm s1, s1, s3
; CHECK-GI-NEXT:    fminnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %a1 = call float @llvm.minnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %a2 = call float @llvm.minnum.f32(float %r2, float %d)
  %r = call float @llvm.minnum.f32(float %a1, float %a2)
  ret float %r
}

define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fmax_float:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmaxnmv s1, v1.4s
; CHECK-SD-NEXT:    fmaxnmv s0, v0.4s
; CHECK-SD-NEXT:    fmaxnm s1, s1, s3
; CHECK-SD-NEXT:    fmaxnm s0, s0, s2
; CHECK-SD-NEXT:    fmaxnm s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_fmax_float:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmaxnmv s0, v0.4s
; CHECK-GI-NEXT:    fmaxnmv s1, v1.4s
; CHECK-GI-NEXT:    fmaxnm s0, s0, s2
; CHECK-GI-NEXT:    fmaxnm s1, s1, s3
; CHECK-GI-NEXT:    fmaxnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
  %r = call float @llvm.maxnum.f32(float %a1, float %a2)
  ret float %r
}


declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minimum.f32(float, float)
declare float @llvm.maximum.f32(float, float)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)