; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

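;; The tests below spell reductions out as chains of extractelement and scalar
;; binary ops and check whether the chain is matched into a single RVV
;; reduction instruction (vredsum.vs and friends), with vsetivli picking a VL
;; that covers exactly the elements being reduced.  Negative tests check
;; cases where the match must not fire.
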
define i32 @reduce_sum_2xi32(<2 x i32> %v) {
; CHECK-LABEL: reduce_sum_2xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_4xi32(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_8xi32(<8 x i32> %v) {
; CHECK-LABEL: reduce_sum_8xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <8 x i32> %v, i32 0
  %e1 = extractelement <8 x i32> %v, i32 1
  %e2 = extractelement <8 x i32> %v, i32 2
  %e3 = extractelement <8 x i32> %v, i32 3
  %e4 = extractelement <8 x i32> %v, i32 4
  %e5 = extractelement <8 x i32> %v, i32 5
  %e6 = extractelement <8 x i32> %v, i32 6
  %e7 = extractelement <8 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32(<16 x i32> %v) {
; CHECK-LABEL: reduce_sum_16xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %e15 = extractelement <16 x i32> %v, i32 15
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  %add14 = add i32 %add13, %e15
  ret i32 %add14
}

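;; The _prefixN variants load a full <16 x i32> but only sum the first N
;; lanes; the expected lowering keeps the vector reduction and limits VL (and
;; the LMUL chosen by vsetivli) to the N-element prefix.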
define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  ret i32 %add1
}

define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  ret i32 %add3
}

define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  ret i32 %add4
}

define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  ret i32 %add5
}

define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 9, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  ret i32 %add7
}

define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 13, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  ret i32 %add11
}


define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 14, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  ret i32 %add12
}

define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix15:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 15, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  ret i32 %add13
}

; Check that we can match with the operands in reversed order, but the
; reduction order unchanged.
define i32 @reduce_sum_4xi32_op_order(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32_op_order:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e0
  %add1 = add i32 %e2, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

; Negative test - Reduction order isn't compatible with the current
; incremental matching scheme.
define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
; RV32-LABEL: reduce_sum_4xi32_reduce_order:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vslidedown.vi v9, v8, 1
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vslidedown.vi v9, v8, 2
; RV32-NEXT:    vslidedown.vi v8, v8, 3
; RV32-NEXT:    vmv.x.s a2, v9
; RV32-NEXT:    vmv.x.s a3, v8
; RV32-NEXT:    add a1, a1, a2
; RV32-NEXT:    add a0, a0, a3
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_sum_4xi32_reduce_order:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    vslidedown.vi v9, v8, 1
; RV64-NEXT:    vmv.x.s a1, v9
; RV64-NEXT:    vslidedown.vi v9, v8, 2
; RV64-NEXT:    vslidedown.vi v8, v8, 3
; RV64-NEXT:    vmv.x.s a2, v9
; RV64-NEXT:    vmv.x.s a3, v8
; RV64-NEXT:    add a1, a1, a2
; RV64-NEXT:    add a0, a0, a3
; RV64-NEXT:    addw a0, a0, a1
; RV64-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e2
  %add1 = add i32 %e0, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

;; Most of the corner cases are exercised above; the following just
;; makes sure that other opcodes work as expected.

define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %xor0 = xor i32 %e0, %e1
  ret i32 %xor0
}

define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %xor0 = xor i32 %e0, %e1
  %xor1 = xor i32 %xor0, %e2
  %xor2 = xor i32 %xor1, %e3
  %xor3 = xor i32 %xor2, %e4
  ret i32 %xor3
}

define i32 @reduce_and_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %and0 = and i32 %e0, %e1
  ret i32 %and0
}

define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %and0 = and i32 %e0, %e1
  %and1 = and i32 %and0, %e2
  %and2 = and i32 %and1, %e3
  %and3 = and i32 %and2, %e4
  ret i32 %and3
}

define i32 @reduce_or_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %or0 = or i32 %e0, %e1
  ret i32 %or0
}

define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %or0 = or i32 %e0, %e1
  %or1 = or i32 %or0, %e2
  %or2 = or i32 %or1, %e3
  %or3 = or i32 %or2, %e4
  ret i32 %or3
}

declare i32 @llvm.smax.i32(i32 %a, i32 %b)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare i32 @llvm.umax.i32(i32 %a, i32 %b)
declare i32 @llvm.umin.i32(i32 %a, i32 %b)

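;; For min/max the scalar start element must not change the result, so the
;; checks below expect the operation's identity value: INT_MIN for smax
;; (lui a0, 524288), INT_MAX for smin, zero for umax, and all-ones for umin.
;; The two-element cases instead reuse the source vector itself as the start
;; operand (e.g. vredmax.vs v8, v8, v8), which is fine for idempotent ops.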
define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  ret i32 %smax0
}

define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmax.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
  ret i32 %smax3
}

define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  ret i32 %smin0
}

define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredmin.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
  ret i32 %smin3
}

define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  ret i32 %umax0
}

define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredmaxu.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
  ret i32 %umax3
}

define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  ret i32 %umin0
}

define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; RV32-LABEL: reduce_umin_16xi32_prefix5:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.i v10, -1
; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT:    vredminu.vs v8, v8, v10
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_umin_16xi32_prefix5:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    li a0, -1
; RV64-NEXT:    vmv.s.x v10, a0
; RV64-NEXT:    vredminu.vs v8, v8, v10
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
  ret i32 %umin3
}

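;; The floating point cases use vfredusum (unordered sum), which is only
;; formed when the scalar fadds carry reassociation fast-math flags; the
;; corner cases further down check exactly which flags are required.  Where a
;; start value is materialized with lui a0, 524288, the bit pattern 0x80000000
;; is -0.0, the identity for fadd.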
define float @reduce_fadd_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %fadd0 = fadd fast float %e0, %e1
  ret float %fadd0
}

define float @reduce_fadd_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v10
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %e2 = extractelement <16 x float> %v, i32 2
  %e3 = extractelement <16 x float> %v, i32 3
  %e4 = extractelement <16 x float> %v, i32 4
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  %fadd3 = fadd fast float %fadd2, %e4
  ret float %fadd3
}

;; Corner case tests for fadd associativity

; Negative test - not associative. Would need the strict (ordered) reduction opcode.
define float @reduce_fadd_2xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd float %e0, %e1
  ret float %fadd0
}

; Positive test - minimal set of fast math flags
define float @reduce_fadd_2xf32_reassoc_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_reassoc_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd reassoc float %e0, %e1
  ret float %fadd0
}

; Negative test - wrong fast math flag.
define float @reduce_fadd_2xf32_ninf_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_ninf_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd ninf float %e0, %e1
  ret float %fadd0
}


; Negative test - last fadd is not associative
define float @reduce_fadd_4xi32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xi32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT:    vfredusum.vs v9, v8, v9
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    fadd.s fa0, fa4, fa5
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd float %fadd1, %e3
  ret float %fadd2
}

; Negative test - first fadd is not associative
; We could form a reduce for elements 2 and 3.
define float @reduce_fadd_4xi32_non_associative2(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xi32_non_associative2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v9, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa3, v9
; CHECK-NEXT:    vfmv.f.s fa2, v8
; CHECK-NEXT:    fadd.s fa5, fa5, fa4
; CHECK-NEXT:    fadd.s fa4, fa3, fa2
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  ret float %fadd2
}