xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll (revision 3133acf1fbd1cc57ea8e74288ee9a0acd027d749)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
3; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
4
5define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
6; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
7; NON-POW2-NEXT:  entry:
8; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
9; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
10; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], splat (i32 10)
11; NON-POW2-NEXT:    store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
12; NON-POW2-NEXT:    ret void
13;
14; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
15; POW2-ONLY-NEXT:  entry:
16; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
17; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
18; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
19; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
20; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
21; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], splat (i32 10)
22; POW2-ONLY-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
23; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
24; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
25; POW2-ONLY-NEXT:    ret void
26;
27entry:
28  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
29  %l.src.0 = load i32, ptr %gep.src.0, align 4
30  %mul.0 = mul nsw i32 %l.src.0, 10
31
32  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
33  %l.src.1 = load i32, ptr %gep.src.1, align 4
34  %mul.1 = mul nsw i32 %l.src.1, 10
35
36  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
37  %l.src.2 = load i32, ptr %gep.src.2, align 4
38  %mul.2 = mul nsw i32 %l.src.2, 10
39
40  store i32 %mul.0, ptr %dst
41
42  %dst.1 = getelementptr i32, ptr %dst, i32 1
43  store i32 %mul.1, ptr %dst.1
44
45  %dst.2 = getelementptr i32, ptr %dst, i32 2
46  store i32 %mul.2, ptr %dst.2
47
48  ret void
49}
50
51; Should not be vectorized with an undef/poison element as padding, as
52; division by undef/poison may cause UB.  Must use VL predication or
53; masking instead, where RISCV wins.
; udiv with the loaded value as the divisor: padding lanes could divide by
; poison, so only an exact-width vectorization is legal. Per the CHECK
; lines, NON-POW2 uses an exact <3 x i32> udiv; POW2-ONLY stays scalar.
54define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
55; NON-POW2-LABEL: @v3_load_i32_udiv_by_constant_store(
56; NON-POW2-NEXT:  entry:
57; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
58; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
59; NON-POW2-NEXT:    [[TMP1:%.*]] = udiv <3 x i32> splat (i32 10), [[TMP0]]
60; NON-POW2-NEXT:    store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
61; NON-POW2-NEXT:    ret void
62;
63; POW2-ONLY-LABEL: @v3_load_i32_udiv_by_constant_store(
64; POW2-ONLY-NEXT:  entry:
65; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
66; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
67; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
68; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
69; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
70; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
71; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
72; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
73; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
74; POW2-ONLY-NEXT:    store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
75; POW2-ONLY-NEXT:    [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
76; POW2-ONLY-NEXT:    store i32 [[MUL_1]], ptr [[DST_1]], align 4
77; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
78; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
79; POW2-ONLY-NEXT:    ret void
80;
81entry:
82  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
83  %l.src.0 = load i32, ptr %gep.src.0, align 4
84  %mul.0 = udiv i32 10, %l.src.0
85
86  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
87  %l.src.1 = load i32, ptr %gep.src.1, align 4
88  %mul.1 = udiv i32 10, %l.src.1
89
90  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
91  %l.src.2 = load i32, ptr %gep.src.2, align 4
92  %mul.2 = udiv i32 10, %l.src.2
93
94  store i32 %mul.0, ptr %dst
95
96  %dst.1 = getelementptr i32, ptr %dst, i32 1
97  store i32 %mul.1, ptr %dst.1
98
99  %dst.2 = getelementptr i32, ptr %dst, i32 2
100  store i32 %mul.2, ptr %dst.2
101
102  ret void
103}
104
105
106
; Element-wise mul of two 3-element i32 arrays, stored contiguously.
; CHECK lines: NON-POW2 emits two <3 x i32> loads + one <3 x i32> mul;
; POW2-ONLY splits into a <2 x i32> part plus a scalar tail for element 2.
107define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
108; NON-POW2-LABEL: @v3_load_i32_mul_store(
109; NON-POW2-NEXT:  entry:
110; NON-POW2-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
111; NON-POW2-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
112; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
113; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
114; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
115; NON-POW2-NEXT:    store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
116; NON-POW2-NEXT:    ret void
117;
118; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
119; POW2-ONLY-NEXT:  entry:
120; POW2-ONLY-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
121; POW2-ONLY-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
122; POW2-ONLY-NEXT:    [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
123; POW2-ONLY-NEXT:    [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
124; POW2-ONLY-NEXT:    [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
125; POW2-ONLY-NEXT:    [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
126; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
127; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
128; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
129; POW2-ONLY-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
130; POW2-ONLY-NEXT:    store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
131; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
132; POW2-ONLY-NEXT:    store i32 [[MUL_2]], ptr [[DST_2]], align 4
133; POW2-ONLY-NEXT:    ret void
134;
135entry:
136  %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
137  %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
138  %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
139  %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
140  %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
141
142  %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
143  %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
144  %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
145  %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
146  %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
147
148  %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
149  %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
150  %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
151  %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
152  %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
153
154  store i32 %mul.0, ptr %dst
155
156  %dst.1 = getelementptr i32, ptr %dst, i32 1
157  store i32 %mul.1, ptr %dst.1
158
159  %dst.2 = getelementptr i32, ptr %dst, i32 2
160  store i32 %mul.2, ptr %dst.2
161
162  ret void
163}
164
; Same as @v3_load_i32_mul_store but with an extra `add 9` after the mul,
; exercising a two-deep op chain. CHECK lines: NON-POW2 keeps the whole
; chain in <3 x i32>; POW2-ONLY splits into <2 x i32> plus a scalar tail.
165define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
166; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
167; NON-POW2-NEXT:  entry:
168; NON-POW2-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
169; NON-POW2-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
170; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
171; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
172; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
173; NON-POW2-NEXT:    [[TMP3:%.*]] = add <3 x i32> [[TMP2]], splat (i32 9)
174; NON-POW2-NEXT:    store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
175; NON-POW2-NEXT:    ret void
176;
177; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
178; POW2-ONLY-NEXT:  entry:
179; POW2-ONLY-NEXT:    [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
180; POW2-ONLY-NEXT:    [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
181; POW2-ONLY-NEXT:    [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
182; POW2-ONLY-NEXT:    [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
183; POW2-ONLY-NEXT:    [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
184; POW2-ONLY-NEXT:    [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
185; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
186; POW2-ONLY-NEXT:    [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
187; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
188; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
189; POW2-ONLY-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
190; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP2]], splat (i32 9)
191; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
192; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
193; POW2-ONLY-NEXT:    store i32 [[ADD_2]], ptr [[DST_2]], align 4
194; POW2-ONLY-NEXT:    ret void
195;
196entry:
197  %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
198  %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
199  %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
200  %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
201  %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
202  %add.0 = add i32 %mul.0, 9
203
204  %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
205  %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
206  %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
207  %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
208  %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
209  %add.1 = add i32 %mul.1, 9
210
211  %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
212  %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
213  %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
214  %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
215  %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
216  %add.2 = add i32 %mul.2, 9
217
218  store i32 %add.0, ptr %dst
219
220  %dst.1 = getelementptr i32, ptr %dst, i32 1
221  store i32 %add.1, ptr %dst.1
222
223  %dst.2 = getelementptr i32, ptr %dst, i32 2
224  store i32 %add.2, ptr %dst.2
225
226  ret void
227}
228
; Floating-point variant: load 3 floats, fadd 10.0 to each, store.
; CHECK lines: NON-POW2 uses a single <3 x float> fadd; POW2-ONLY splits
; into a <2 x float> part plus a scalar tail for element 2.
229define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
230; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
231; NON-POW2-NEXT:  entry:
232; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
233; NON-POW2-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
234; NON-POW2-NEXT:    [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], splat (float 1.000000e+01)
235; NON-POW2-NEXT:    store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
236; NON-POW2-NEXT:    ret void
237;
238; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
239; POW2-ONLY-NEXT:  entry:
240; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
241; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
242; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
243; POW2-ONLY-NEXT:    [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
244; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
245; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01)
246; POW2-ONLY-NEXT:    store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
247; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
248; POW2-ONLY-NEXT:    store float [[FADD_2]], ptr [[DST_2]], align 4
249; POW2-ONLY-NEXT:    ret void
250;
251entry:
252  %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
253  %l.src.0 = load float , ptr %gep.src.0, align 4
254  %fadd.0 = fadd float %l.src.0, 10.0
255
256  %gep.src.1 = getelementptr inbounds float , ptr %src, i32 1
257  %l.src.1 = load float, ptr %gep.src.1, align 4
258  %fadd.1 = fadd float %l.src.1, 10.0
259
260  %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
261  %l.src.2 = load float, ptr %gep.src.2, align 4
262  %fadd.2 = fadd float %l.src.2, 10.0
263
264  store float %fadd.0, ptr %dst
265
266  %dst.1 = getelementptr float, ptr %dst, i32 1
267  store float %fadd.1, ptr %dst.1
268
269  %dst.2 = getelementptr float, ptr %dst, i32 2
270  store float %fadd.2, ptr %dst.2
271
272  ret void
273}
274
; Three scalar phis (incoming from an unreachable block) feeding three
; contiguous stores. CHECK lines: NON-POW2 merges them into one
; <3 x i32> phi; POW2-ONLY forms a <2 x i32> phi plus a scalar phi.
275define void @phi_store3(ptr %dst) {
276; NON-POW2-LABEL: @phi_store3(
277; NON-POW2-NEXT:  entry:
278; NON-POW2-NEXT:    br label [[EXIT:%.*]]
279; NON-POW2:       invoke.cont8.loopexit:
280; NON-POW2-NEXT:    br label [[EXIT]]
281; NON-POW2:       exit:
282; NON-POW2-NEXT:    [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
283; NON-POW2-NEXT:    store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
284; NON-POW2-NEXT:    ret void
285;
286; POW2-ONLY-LABEL: @phi_store3(
287; POW2-ONLY-NEXT:  entry:
288; POW2-ONLY-NEXT:    br label [[EXIT:%.*]]
289; POW2-ONLY:       invoke.cont8.loopexit:
290; POW2-ONLY-NEXT:    br label [[EXIT]]
291; POW2-ONLY:       exit:
292; POW2-ONLY-NEXT:    [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
293; POW2-ONLY-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
294; POW2-ONLY-NEXT:    [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
295; POW2-ONLY-NEXT:    store <2 x i32> [[TMP0]], ptr [[DST]], align 4
296; POW2-ONLY-NEXT:    store i32 [[P_2]], ptr [[DST_2]], align 4
297; POW2-ONLY-NEXT:    ret void
298;
299entry:
300  br label %exit

301
302invoke.cont8.loopexit:                            ; No predecessors!
303  br label %exit
304
305exit:
306  %p.0 = phi i32 [ 1, %entry ], [ 0, %invoke.cont8.loopexit ]
307  %p.1 = phi i32 [ 2, %entry ], [ 0, %invoke.cont8.loopexit ]
308  %p.2 = phi i32 [ 3, %entry ], [ 0, %invoke.cont8.loopexit ]
309
310  %dst.1 = getelementptr i32, ptr %dst, i32 1
311  %dst.2 = getelementptr i32, ptr %dst, i32 2
312
313  store i32 %p.0, ptr %dst, align 4
314  store i32 %p.1, ptr %dst.1, align 4
315  store i32 %p.2, ptr %dst.2, align 4
316  ret void
317}
318
; All-zero stores built from `add 0, 0` / `sub 0, 0`. CHECK lines:
; NON-POW2 folds everything into one <3 x i32> zeroinitializer store;
; POW2-ONLY keeps the first store scalar and vectorizes the last two.
319define void @store_try_reorder(ptr %dst) {
320; NON-POW2-LABEL: @store_try_reorder(
321; NON-POW2-NEXT:  entry:
322; NON-POW2-NEXT:    store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
323; NON-POW2-NEXT:    ret void
324;
325; POW2-ONLY-LABEL: @store_try_reorder(
326; POW2-ONLY-NEXT:  entry:
327; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
328; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
329; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
330; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
331; POW2-ONLY-NEXT:    ret void
332;
333entry:
334  %add = add i32 0, 0
335  store i32 %add, ptr %dst, align 4
336  %add207 = sub i32 0, 0
337  %arrayidx.i1887 = getelementptr i32, ptr %dst, i64 1
338  store i32 %add207, ptr %arrayidx.i1887, align 4
339  %add216 = sub i32 0, 0
340  %arrayidx.i1891 = getelementptr i32, ptr %dst, i64 2
341  store i32 %add216, ptr %arrayidx.i1891, align 4
342  ret void
343}
344
; Splatted float -> fpext -> fmuladd -> fptrunc chain stored to 3 lanes,
; exercising cast costs for vec3. CHECK lines: NON-POW2 keeps the chain
; in <3 x ...> types; POW2-ONLY vectorizes 2 lanes and keeps lane 2 scalar.
345define void @vec3_fpext_cost(ptr %Colour, float %0) {
346; NON-POW2-LABEL: @vec3_fpext_cost(
347; NON-POW2-NEXT:  entry:
348; NON-POW2-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 0
349; NON-POW2-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
350; NON-POW2-NEXT:    [[TMP3:%.*]] = fpext <3 x float> [[TMP2]] to <3 x double>
351; NON-POW2-NEXT:    [[TMP4:%.*]] = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> [[TMP3]], <3 x double> zeroinitializer, <3 x double> zeroinitializer)
352; NON-POW2-NEXT:    [[TMP5:%.*]] = fptrunc <3 x double> [[TMP4]] to <3 x float>
353; NON-POW2-NEXT:    store <3 x float> [[TMP5]], ptr [[COLOUR:%.*]], align 4
354; NON-POW2-NEXT:    ret void
355;
356; POW2-ONLY-LABEL: @vec3_fpext_cost(
357; POW2-ONLY-NEXT:  entry:
358; POW2-ONLY-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
359; POW2-ONLY-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0:%.*]], i32 0
360; POW2-ONLY-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
361; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
362; POW2-ONLY-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
363; POW2-ONLY-NEXT:    [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
364; POW2-ONLY-NEXT:    store <2 x float> [[TMP5]], ptr [[COLOUR]], align 4
365; POW2-ONLY-NEXT:    [[CONV78:%.*]] = fpext float [[TMP0]] to double
366; POW2-ONLY-NEXT:    [[TMP6:%.*]] = call double @llvm.fmuladd.f64(double [[CONV78]], double 0.000000e+00, double 0.000000e+00)
367; POW2-ONLY-NEXT:    [[CONV82:%.*]] = fptrunc double [[TMP6]] to float
368; POW2-ONLY-NEXT:    store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
369; POW2-ONLY-NEXT:    ret void
370;
371entry:
372  %arrayidx72 = getelementptr float, ptr %Colour, i64 1
373  %arrayidx80 = getelementptr float, ptr %Colour, i64 2
374  %conv62 = fpext float %0 to double
375  %1 = call double @llvm.fmuladd.f64(double %conv62, double 0.000000e+00, double 0.000000e+00)
376  %conv66 = fptrunc double %1 to float
377  store float %conv66, ptr %Colour, align 4
378  %conv70 = fpext float %0 to double
379  %2 = call double @llvm.fmuladd.f64(double %conv70, double 0.000000e+00, double 0.000000e+00)
380  %conv74 = fptrunc double %2 to float
381  store float %conv74, ptr %arrayidx72, align 4
382  %conv78 = fpext float %0 to double
383  %3 = call double @llvm.fmuladd.f64(double %conv78, double 0.000000e+00, double 0.000000e+00)
384  %conv82 = fptrunc double %3 to float
385  store float %conv82, ptr %arrayidx80, align 4
386  ret void
387}
388
; One fptrunc result stored to three consecutive slots. The shared CHECK
; prefix shows the IR is left unchanged in both modes (no vectorization).
389define void @fpext_scatter(ptr %dst, double %conv) {
390; CHECK-LABEL: @fpext_scatter(
391; CHECK-NEXT:  entry:
392; CHECK-NEXT:    [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
393; CHECK-NEXT:    [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
394; CHECK-NEXT:    store float [[CONV25]], ptr [[LENGTHS]], align 4
395; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
396; CHECK-NEXT:    store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
397; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
398; CHECK-NEXT:    store float [[CONV25]], ptr [[ARRAYIDX37]], align 4
399; CHECK-NEXT:    ret void
400;
401entry:
402  %conv25 = fptrunc double %conv to float
403  %Lengths = getelementptr float, ptr %dst, i64 0
404  store float %conv25, ptr %Lengths, align 4
405  %arrayidx32 = getelementptr float, ptr %dst, i64 1
406  store float %conv25, ptr %arrayidx32, align 4
407  %arrayidx37 = getelementptr float, ptr %dst, i64 2
408  store float %conv25, ptr %arrayidx37, align 4
409  ret void
410}
411
; 3-element integer add reduction. The shared CHECK prefix shows neither
; mode vectorizes it: the scalar load/add chain is left as-is.
412define i32 @reduce_add(ptr %src) {
413; CHECK-LABEL: @reduce_add(
414; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
415; CHECK-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
416; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
417; CHECK-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
418; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
419; CHECK-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
420; CHECK-NEXT:    [[ADD_0:%.*]] = add i32 [[L_SRC_0]], [[L_SRC_1]]
421; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[L_SRC_2]]
422; CHECK-NEXT:    ret i32 [[ADD_1]]
423;
424  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
425  %l.src.0 = load i32, ptr %gep.src.0, align 4
426  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
427  %l.src.1 = load i32, ptr %gep.src.1, align 4
428  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
429  %l.src.2 = load i32, ptr %gep.src.2, align 4
430
431  %add.0 = add i32 %l.src.0, %l.src.1
432  %add.1 = add i32 %add.0, %l.src.2
433  ret i32 %add.1
434}
435
; 3-element fast-math fadd reduction. CHECK lines: NON-POW2 turns it into
; a <3 x float> load + llvm.vector.reduce.fadd; POW2-ONLY stays scalar.
436define float @reduce_fadd(ptr %src) {
437; NON-POW2-LABEL: @reduce_fadd(
438; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
439; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
440; NON-POW2-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP1]])
441; NON-POW2-NEXT:    ret float [[TMP2]]
442;
443; POW2-ONLY-LABEL: @reduce_fadd(
444; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
445; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
446; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
447; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
448; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
449; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
450; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
451; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
452; POW2-ONLY-NEXT:    ret float [[ADD_1]]
453;
454  %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
455  %l.src.0 = load float, ptr %gep.src.0, align 4
456  %gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
457  %l.src.1 = load float, ptr %gep.src.1, align 4
458  %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
459  %l.src.2 = load float, ptr %gep.src.2, align 4
460
461  %add.0 = fadd fast float %l.src.0, %l.src.1
462  %add.1 = fadd fast float %add.0, %l.src.2
463  ret float %add.1
464}
465
; Add reduction fed by per-element mul-by-10. CHECK lines: NON-POW2 uses
; <3 x i32> mul + llvm.vector.reduce.add; POW2-ONLY stays scalar.
466define i32 @reduce_add_after_mul(ptr %src) {
467; NON-POW2-LABEL: @reduce_add_after_mul(
468; NON-POW2-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
469; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
470; NON-POW2-NEXT:    [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP1]], splat (i32 10)
471; NON-POW2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP2]])
472; NON-POW2-NEXT:    ret i32 [[TMP3]]
473;
474; POW2-ONLY-LABEL: @reduce_add_after_mul(
475; POW2-ONLY-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
476; POW2-ONLY-NEXT:    [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
477; POW2-ONLY-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
478; POW2-ONLY-NEXT:    [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
479; POW2-ONLY-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
480; POW2-ONLY-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
481; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
482; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
483; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
484; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
485; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
486; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
487;
488  %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
489  %l.src.0 = load i32, ptr %gep.src.0, align 4
490  %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
491  %l.src.1 = load i32, ptr %gep.src.1, align 4
492  %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
493  %l.src.2 = load i32, ptr %gep.src.2, align 4
494
495  %mul.0 = mul nsw i32 %l.src.0, 10
496  %mul.1 = mul nsw i32 %l.src.1, 10
497  %mul.2 = mul nsw i32 %l.src.2, 10
498
499  %add.0 = add i32 %mul.0, %mul.1
500  %add.1 = add i32 %add.0, %mul.2
501  ret i32 %add.1
502}
503
; 3-element integer dot product (two loads, mul, add reduction).
; CHECK lines: NON-POW2 vectorizes to <3 x i32> mul + vector.reduce.add;
; POW2-ONLY leaves the whole computation scalar.
504define i32 @dot_product_i32(ptr %a, ptr %b) {
505; NON-POW2-LABEL: @dot_product_i32(
506; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
507; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
508; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
509; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
510; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
511; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
512; NON-POW2-NEXT:    ret i32 [[TMP4]]
513;
514; POW2-ONLY-LABEL: @dot_product_i32(
515; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
516; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
517; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
518; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
519; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
520; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
521; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
522; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
523; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
524; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
525; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
526; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
527; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
528; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
529; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
530; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
531; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
532; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
533;
534  %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
535  %l.a.0 = load i32, ptr %gep.a.0, align 4
536  %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
537  %l.a.1 = load i32, ptr %gep.a.1, align 4
538  %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
539  %l.a.2 = load i32, ptr %gep.a.2, align 4
540
541  %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
542  %l.b.0 = load i32, ptr %gep.b.0, align 4
543  %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
544  %l.b.1 = load i32, ptr %gep.b.1, align 4
545  %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
546  %l.b.2 = load i32, ptr %gep.b.2, align 4
547
548  %mul.0 = mul nsw i32 %l.a.0, %l.b.0
549  %mul.1 = mul nsw i32 %l.a.1, %l.b.1
550  %mul.2 = mul nsw i32 %l.a.2, %l.b.2
551
552  %add.0 = add i32 %mul.0, %mul.1
553  %add.1 = add i32 %add.0, %mul.2
554  ret i32 %add.1
555}
556
557; Same as above, except the reduction order has been perturbed.  This
558; is checking for our ability to reorder.
; Same as @dot_product_i32 but with the first add's operands swapped
; (%mul.1 + %mul.0), testing reduction reordering. CHECK lines: NON-POW2
; still forms the <3 x i32> vector.reduce.add; POW2-ONLY stays scalar.
559define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
560; NON-POW2-LABEL: @dot_product_i32_reorder(
561; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
562; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
563; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
564; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
565; NON-POW2-NEXT:    [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
566; NON-POW2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
567; NON-POW2-NEXT:    ret i32 [[TMP4]]
568;
569; POW2-ONLY-LABEL: @dot_product_i32_reorder(
570; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
571; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
572; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
573; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
574; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
575; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
576; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
577; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
578; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
579; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
580; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
581; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
582; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
583; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
584; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
585; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
586; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
587; POW2-ONLY-NEXT:    ret i32 [[ADD_1]]
588;
589  %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
590  %l.a.0 = load i32, ptr %gep.a.0, align 4
591  %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
592  %l.a.1 = load i32, ptr %gep.a.1, align 4
593  %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
594  %l.a.2 = load i32, ptr %gep.a.2, align 4
595
596  %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
597  %l.b.0 = load i32, ptr %gep.b.0, align 4
598  %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
599  %l.b.1 = load i32, ptr %gep.b.1, align 4
600  %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
601  %l.b.2 = load i32, ptr %gep.b.2, align 4
602
603  %mul.0 = mul nsw i32 %l.a.0, %l.b.0
604  %mul.1 = mul nsw i32 %l.a.1, %l.b.1
605  %mul.2 = mul nsw i32 %l.a.2, %l.b.2
606
607  %add.0 = add i32 %mul.1, %mul.0
608  %add.1 = add i32 %add.0, %mul.2
609  ret i32 %add.1
610}
611
; Dot product of two 3-element float arrays: sum(a[i] * b[i]) for i = 0..2.
; With -slp-vectorize-non-power-of-2, SLP widens both load groups to
; <3 x float>, multiplies, and reduces via llvm.vector.reduce.fadd;
; otherwise lanes 0-1 are vectorized as <2 x float> and lane 2 stays scalar.
define float @dot_product_fp32(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT:    ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  ; Three consecutive scalar loads from %a; candidates for one wide load.
  %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
  %l.a.0 = load float, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
  %l.a.1 = load float, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
  %l.a.2 = load float, ptr %gep.a.2, align 4

  ; Three consecutive scalar loads from %b.
  %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
  %l.b.0 = load float, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
  %l.b.1 = load float, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
  %l.b.2 = load float, ptr %gep.b.2, align 4

  ; Elementwise products.
  %mul.0 = fmul fast float %l.a.0, %l.b.0
  %mul.1 = fmul fast float %l.a.1, %l.b.1
  %mul.2 = fmul fast float %l.a.2, %l.b.2

  ; fast-math fadd chain: (mul.0 + mul.1) + mul.2, recognizable as a reduction.
  %add.0 = fadd fast float %mul.0, %mul.1
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}
661
; Same as above, except the reduction order has been perturbed.  This
; is checking for our ability to reorder.
define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp32_reorder(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
; NON-POW2-NEXT:    ret float [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
  %l.a.0 = load float, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
  %l.a.1 = load float, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
  %l.a.2 = load float, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
  %l.b.0 = load float, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
  %l.b.1 = load float, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
  %l.b.2 = load float, ptr %gep.b.2, align 4

  %mul.0 = fmul fast float %l.a.0, %l.b.0
  %mul.1 = fmul fast float %l.a.1, %l.b.1
  %mul.2 = fmul fast float %l.a.2, %l.b.2

  ; Operands of add.0 are swapped relative to @dot_product_fp32
  ; (mul.1 first) to exercise SLP's reordering of reduction operands.
  %add.0 = fadd fast float %mul.1, %mul.0
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}
713
714
; Same dot-product pattern as @dot_product_fp32, but on f64 elements.
; NON-POW2 forms a <3 x double> fmul feeding llvm.vector.reduce.fadd;
; POW2-ONLY vectorizes lanes 0-1 as <2 x double> with lane 2 scalar.
define double @dot_product_fp64(ptr %a, ptr %b) {
; NON-POW2-LABEL: @dot_product_fp64(
; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
; NON-POW2-NEXT:    ret double [[TMP4]]
;
; POW2-ONLY-LABEL: @dot_product_fp64(
; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
; POW2-ONLY-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
; POW2-ONLY-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret double [[ADD_1]]
;
  %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
  %l.a.0 = load double, ptr %gep.a.0, align 4
  %gep.a.1 = getelementptr inbounds double, ptr %a, i32 1
  %l.a.1 = load double, ptr %gep.a.1, align 4
  %gep.a.2 = getelementptr inbounds double, ptr %a, i32 2
  %l.a.2 = load double, ptr %gep.a.2, align 4

  %gep.b.0 = getelementptr inbounds double, ptr %b, i32 0
  %l.b.0 = load double, ptr %gep.b.0, align 4
  %gep.b.1 = getelementptr inbounds double, ptr %b, i32 1
  %l.b.1 = load double, ptr %gep.b.1, align 4
  %gep.b.2 = getelementptr inbounds double, ptr %b, i32 2
  %l.b.2 = load double, ptr %gep.b.2, align 4

  %mul.0 = fmul fast double %l.a.0, %l.b.0
  %mul.1 = fmul fast double %l.a.1, %l.b.1
  %mul.2 = fmul fast double %l.a.2, %l.b.2

  %add.0 = fadd fast double %mul.0, %mul.1
  %add.1 = fadd fast double %add.0, %mul.2
  ret double %add.1
}
764
;; Covers a case where SLP would previously crash due to a
;; missing bailout in TryToFindDuplicates for the case
;; where a VL=3 list was vectorized directly (without
;; a root instruction such as a store or reduce).
define double @no_root_reshuffle(ptr  %ptr) {
; CHECK-LABEL: @no_root_reshuffle(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]]
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16
; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
; CHECK-NEXT:    [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]]
; CHECK-NEXT:    ret double [[ADD]]
;
entry:
  ; Computes ptr[0]^2 + ptr[2]^2 * ptr[1]; the shared CHECK prefix shows
  ; the IR is left scalar under both run lines - the test only requires
  ; that SLP processes it without crashing.
  %0 = load double, ptr %ptr, align 8
  %mul = fmul fast double %0, %0
  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8
  %1 = load double, ptr %arrayidx2, align 8
  %arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16
  %2 = load double, ptr %arrayidx3, align 8
  %3 = fmul fast double %2, %2
  %mul6 = fmul fast double %3, %1
  %add = fadd fast double %mul6, %mul
  ret double %add
}
795
; fadd reduction whose leaves are fmuls of scalar function arguments
; (a buildvector, not loads). NON-POW2 builds a <3 x float> via
; insertelement and reduces it; POW2-ONLY leaves everything scalar.
define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; NON-POW2-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
; NON-POW2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
; NON-POW2-NEXT:    [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
; NON-POW2-NEXT:    [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
; NON-POW2-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
; NON-POW2-NEXT:    ret float [[TMP5]]
;
; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT:    ret float [[ADD_1]]
;
  ; Scale each argument by the same constant ...
  %mul.0 = fmul fast float %a, 10.0
  %mul.1 = fmul fast float %b, 10.0
  %mul.2 = fmul fast float %c, 10.0

  ; ... then sum the three products with a fast-math fadd chain.
  %add.0 = fadd fast float %mul.0, %mul.1
  %add.1 = fadd fast float %add.0, %mul.2
  ret float %add.1
}
821
822
; Declarations of the fmuladd intrinsics (referenced by tests outside
; this excerpt).
declare float @llvm.fmuladd.f32(float, float, float)

declare double @llvm.fmuladd.f64(double, double, double)
826