; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer,dce -slp-threshold=-999 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

declare double @llvm.fabs.f64(double) nounwind readnone

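; The two scalar fabs calls operate on consecutive elements of %a/%b and store
; to consecutive elements of %c, so they are expected to be combined into a
; single @llvm.fabs.v2f64 call (see the CHECK lines below).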
define void @vec_fabs_f64(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @vec_fabs_f64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP4]])
; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[C:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load double, ptr %a, align 8
  %i1 = load double, ptr %b, align 8
  %mul = fmul double %i0, %i1
  %call = tail call double @llvm.fabs.f64(double %mul) nounwind readnone
  %arrayidx3 = getelementptr inbounds double, ptr %a, i64 1
  %i3 = load double, ptr %arrayidx3, align 8
  %arrayidx4 = getelementptr inbounds double, ptr %b, i64 1
  %i4 = load double, ptr %arrayidx4, align 8
  %mul5 = fmul double %i3, %i4
  %call5 = tail call double @llvm.fabs.f64(double %mul5) nounwind readnone
  store double %call, ptr %c, align 8
  %arrayidx5 = getelementptr inbounds double, ptr %c, i64 1
  store double %call5, ptr %arrayidx5, align 8
  ret void
}

declare float @llvm.copysign.f32(float, float) nounwind readnone

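; Four scalar copysign calls over consecutive float elements should be merged
; into one @llvm.copysign.v4f32 call.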
define void @vec_copysign_f32(ptr %a, ptr %b, ptr noalias %c) {
; CHECK-LABEL: @vec_copysign_f32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])
; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %0 = load float, ptr %a, align 4
  %1 = load float, ptr %b, align 4
  %call0 = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
  store float %call0, ptr %c, align 4

  %ix2 = getelementptr inbounds float, ptr %a, i64 1
  %2 = load float, ptr %ix2, align 4
  %ix3 = getelementptr inbounds float, ptr %b, i64 1
  %3 = load float, ptr %ix3, align 4
  %call1 = tail call float @llvm.copysign.f32(float %2, float %3) nounwind readnone
  %c1 = getelementptr inbounds float, ptr %c, i64 1
  store float %call1, ptr %c1, align 4

  %ix4 = getelementptr inbounds float, ptr %a, i64 2
  %4 = load float, ptr %ix4, align 4
  %ix5 = getelementptr inbounds float, ptr %b, i64 2
  %5 = load float, ptr %ix5, align 4
  %call2 = tail call float @llvm.copysign.f32(float %4, float %5) nounwind readnone
  %c2 = getelementptr inbounds float, ptr %c, i64 2
  store float %call2, ptr %c2, align 4

  %ix6 = getelementptr inbounds float, ptr %a, i64 3
  %6 = load float, ptr %ix6, align 4
  %ix7 = getelementptr inbounds float, ptr %b, i64 3
  %7 = load float, ptr %ix7, align 4
  %call3 = tail call float @llvm.copysign.f32(float %6, float %7) nounwind readnone
  %c3 = getelementptr inbounds float, ptr %c, i64 3
  store float %call3, ptr %c3, align 4

  ret void
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone

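; The add + bswap chains over four consecutive i32 elements should be
; vectorized into a single @llvm.bswap.v4i32 call.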
define void @vec_bswap_i32(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @vec_bswap_i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load i32, ptr %a, align 4
  %i1 = load i32, ptr %b, align 4
  %add1 = add i32 %i0, %i1
  %call1 = tail call i32 @llvm.bswap.i32(i32 %add1) nounwind readnone

  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 1
  %i2 = load i32, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds i32, ptr %b, i32 1
  %i3 = load i32, ptr %arrayidx3, align 4
  %add2 = add i32 %i2, %i3
  %call2 = tail call i32 @llvm.bswap.i32(i32 %add2) nounwind readnone

  %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 2
  %i4 = load i32, ptr %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds i32, ptr %b, i32 2
  %i5 = load i32, ptr %arrayidx5, align 4
  %add3 = add i32 %i4, %i5
  %call3 = tail call i32 @llvm.bswap.i32(i32 %add3) nounwind readnone

  %arrayidx6 = getelementptr inbounds i32, ptr %a, i32 3
  %i6 = load i32, ptr %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds i32, ptr %b, i32 3
  %i7 = load i32, ptr %arrayidx7, align 4
  %add4 = add i32 %i6, %i7
  %call4 = tail call i32 @llvm.bswap.i32(i32 %add4) nounwind readnone

  store i32 %call1, ptr %c, align 4
  %arrayidx8 = getelementptr inbounds i32, ptr %c, i32 1
  store i32 %call2, ptr %arrayidx8, align 4
  %arrayidx9 = getelementptr inbounds i32, ptr %c, i32 2
  store i32 %call3, ptr %arrayidx9, align 4
  %arrayidx10 = getelementptr inbounds i32, ptr %c, i32 3
  store i32 %call4, ptr %arrayidx10, align 4
  ret void

}

declare i32 @llvm.ctlz.i32(i32,i1) nounwind readnone

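; All four ctlz calls pass the same i1 flag (true), so they should be
; vectorized into one @llvm.ctlz.v4i32 call.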
define void @vec_ctlz_i32(ptr %a, ptr %b, ptr %c, i1) {
; CHECK-LABEL: @vec_ctlz_i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP5]], i1 true)
; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load i32, ptr %a, align 4
  %i1 = load i32, ptr %b, align 4
  %add1 = add i32 %i0, %i1
  %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone

  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 1
  %i2 = load i32, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds i32, ptr %b, i32 1
  %i3 = load i32, ptr %arrayidx3, align 4
  %add2 = add i32 %i2, %i3
  %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 true) nounwind readnone

  %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 2
  %i4 = load i32, ptr %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds i32, ptr %b, i32 2
  %i5 = load i32, ptr %arrayidx5, align 4
  %add3 = add i32 %i4, %i5
  %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone

  %arrayidx6 = getelementptr inbounds i32, ptr %a, i32 3
  %i6 = load i32, ptr %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds i32, ptr %b, i32 3
  %i7 = load i32, ptr %arrayidx7, align 4
  %add4 = add i32 %i6, %i7
  %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 true) nounwind readnone

  store i32 %call1, ptr %c, align 4
  %arrayidx8 = getelementptr inbounds i32, ptr %c, i32 1
  store i32 %call2, ptr %arrayidx8, align 4
  %arrayidx9 = getelementptr inbounds i32, ptr %c, i32 2
  store i32 %call3, ptr %arrayidx9, align 4
  %arrayidx10 = getelementptr inbounds i32, ptr %c, i32 3
  store i32 %call4, ptr %arrayidx10, align 4
  ret void

}

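; Negative test: the ctlz calls mix i1 flags (true/false), so no vectorization
; is expected and the scalar calls remain.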
define void @vec_ctlz_i32_neg(ptr %a, ptr %b, ptr %c, i1) {
; CHECK-LABEL: @vec_ctlz_i32_neg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I0:%.*]] = load i32, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[I1:%.*]] = load i32, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD1]], i1 true) #[[ATTR3:[0-9]+]]
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; CHECK-NEXT:    [[I3:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD2]], i1 false) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; CHECK-NEXT:    [[I4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; CHECK-NEXT:    [[I5:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD3]], i1 true) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 3
; CHECK-NEXT:    [[I6:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 3
; CHECK-NEXT:    [[I7:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD4]], i1 false) #[[ATTR3]]
; CHECK-NEXT:    store i32 [[CALL1]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 1
; CHECK-NEXT:    store i32 [[CALL2]], ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 2
; CHECK-NEXT:    store i32 [[CALL3]], ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 3
; CHECK-NEXT:    store i32 [[CALL4]], ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load i32, ptr %a, align 4
  %i1 = load i32, ptr %b, align 4
  %add1 = add i32 %i0, %i1
  %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone

  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 1
  %i2 = load i32, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds i32, ptr %b, i32 1
  %i3 = load i32, ptr %arrayidx3, align 4
  %add2 = add i32 %i2, %i3
  %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 false) nounwind readnone

  %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 2
  %i4 = load i32, ptr %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds i32, ptr %b, i32 2
  %i5 = load i32, ptr %arrayidx5, align 4
  %add3 = add i32 %i4, %i5
  %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone

  %arrayidx6 = getelementptr inbounds i32, ptr %a, i32 3
  %i6 = load i32, ptr %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds i32, ptr %b, i32 3
  %i7 = load i32, ptr %arrayidx7, align 4
  %add4 = add i32 %i6, %i7
  %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 false) nounwind readnone

  store i32 %call1, ptr %c, align 4
  %arrayidx8 = getelementptr inbounds i32, ptr %c, i32 1
  store i32 %call2, ptr %arrayidx8, align 4
  %arrayidx9 = getelementptr inbounds i32, ptr %c, i32 2
  store i32 %call3, ptr %arrayidx9, align 4
  %arrayidx10 = getelementptr inbounds i32, ptr %c, i32 3
  store i32 %call4, ptr %arrayidx10, align 4
  ret void


}


declare i32 @llvm.cttz.i32(i32,i1) nounwind readnone

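; Same pattern as the ctlz case: four cttz calls with a uniform i1 flag should
; be vectorized into one @llvm.cttz.v4i32 call.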
define void @vec_cttz_i32(ptr %a, ptr %b, ptr %c, i1) {
; CHECK-LABEL: @vec_cttz_i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP5]], i1 true)
; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load i32, ptr %a, align 4
  %i1 = load i32, ptr %b, align 4
  %add1 = add i32 %i0, %i1
  %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone

  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 1
  %i2 = load i32, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds i32, ptr %b, i32 1
  %i3 = load i32, ptr %arrayidx3, align 4
  %add2 = add i32 %i2, %i3
  %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 true) nounwind readnone

  %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 2
  %i4 = load i32, ptr %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds i32, ptr %b, i32 2
  %i5 = load i32, ptr %arrayidx5, align 4
  %add3 = add i32 %i4, %i5
  %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone

  %arrayidx6 = getelementptr inbounds i32, ptr %a, i32 3
  %i6 = load i32, ptr %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds i32, ptr %b, i32 3
  %i7 = load i32, ptr %arrayidx7, align 4
  %add4 = add i32 %i6, %i7
  %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 true) nounwind readnone

  store i32 %call1, ptr %c, align 4
  %arrayidx8 = getelementptr inbounds i32, ptr %c, i32 1
  store i32 %call2, ptr %arrayidx8, align 4
  %arrayidx9 = getelementptr inbounds i32, ptr %c, i32 2
  store i32 %call3, ptr %arrayidx9, align 4
  %arrayidx10 = getelementptr inbounds i32, ptr %c, i32 3
  store i32 %call4, ptr %arrayidx10, align 4
  ret void

}

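; Negative test: mismatched i1 flags on the cttz calls block vectorization, so
; the scalar calls are expected to remain.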
define void @vec_cttz_i32_neg(ptr %a, ptr %b, ptr %c, i1) {
; CHECK-LABEL: @vec_cttz_i32_neg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I0:%.*]] = load i32, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[I1:%.*]] = load i32, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
; CHECK-NEXT:    [[CALL1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD1]], i1 true) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
; CHECK-NEXT:    [[I3:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD2]], i1 false) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; CHECK-NEXT:    [[I4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; CHECK-NEXT:    [[I5:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD3]], i1 true) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 3
; CHECK-NEXT:    [[I6:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 3
; CHECK-NEXT:    [[I7:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
; CHECK-NEXT:    [[CALL4:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD4]], i1 false) #[[ATTR3]]
; CHECK-NEXT:    store i32 [[CALL1]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 1
; CHECK-NEXT:    store i32 [[CALL2]], ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 2
; CHECK-NEXT:    store i32 [[CALL3]], ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 3
; CHECK-NEXT:    store i32 [[CALL4]], ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load i32, ptr %a, align 4
  %i1 = load i32, ptr %b, align 4
  %add1 = add i32 %i0, %i1
  %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone

  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 1
  %i2 = load i32, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds i32, ptr %b, i32 1
  %i3 = load i32, ptr %arrayidx3, align 4
  %add2 = add i32 %i2, %i3
  %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 false) nounwind readnone

  %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 2
  %i4 = load i32, ptr %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds i32, ptr %b, i32 2
  %i5 = load i32, ptr %arrayidx5, align 4
  %add3 = add i32 %i4, %i5
  %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone

  %arrayidx6 = getelementptr inbounds i32, ptr %a, i32 3
  %i6 = load i32, ptr %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds i32, ptr %b, i32 3
  %i7 = load i32, ptr %arrayidx7, align 4
  %add4 = add i32 %i6, %i7
  %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 false) nounwind readnone

  store i32 %call1, ptr %c, align 4
  %arrayidx8 = getelementptr inbounds i32, ptr %c, i32 1
  store i32 %call2, ptr %arrayidx8, align 4
  %arrayidx9 = getelementptr inbounds i32, ptr %c, i32 2
  store i32 %call3, ptr %arrayidx9, align 4
  %arrayidx10 = getelementptr inbounds i32, ptr %c, i32 3
  store i32 %call4, ptr %arrayidx10, align 4
  ret void

}


declare float @llvm.powi.f32.i32(float, i32)
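; powi calls that all share the same exponent %P should be vectorized into one
; @llvm.powi.v4f32.i32 call.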
define void @vec_powi_f32(ptr %a, ptr %b, ptr %c, i32 %P) {
; CHECK-LABEL: @vec_powi_f32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 [[P:%.*]])
; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load float, ptr %a, align 4
  %i1 = load float, ptr %b, align 4
  %add1 = fadd float %i0, %i1
  %call1 = tail call float @llvm.powi.f32.i32(float %add1,i32 %P) nounwind readnone

  %arrayidx2 = getelementptr inbounds float, ptr %a, i32 1
  %i2 = load float, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds float, ptr %b, i32 1
  %i3 = load float, ptr %arrayidx3, align 4
  %add2 = fadd float %i2, %i3
  %call2 = tail call float @llvm.powi.f32.i32(float %add2,i32 %P) nounwind readnone

  %arrayidx4 = getelementptr inbounds float, ptr %a, i32 2
  %i4 = load float, ptr %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds float, ptr %b, i32 2
  %i5 = load float, ptr %arrayidx5, align 4
  %add3 = fadd float %i4, %i5
  %call3 = tail call float @llvm.powi.f32.i32(float %add3,i32 %P) nounwind readnone

  %arrayidx6 = getelementptr inbounds float, ptr %a, i32 3
  %i6 = load float, ptr %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds float, ptr %b, i32 3
  %i7 = load float, ptr %arrayidx7, align 4
  %add4 = fadd float %i6, %i7
  %call4 = tail call float @llvm.powi.f32.i32(float %add4,i32 %P) nounwind readnone

  store float %call1, ptr %c, align 4
  %arrayidx8 = getelementptr inbounds float, ptr %c, i32 1
  store float %call2, ptr %arrayidx8, align 4
  %arrayidx9 = getelementptr inbounds float, ptr %c, i32 2
  store float %call3, ptr %arrayidx9, align 4
  %arrayidx10 = getelementptr inbounds float, ptr %c, i32 3
  store float %call4, ptr %arrayidx10, align 4
  ret void

}


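; Negative test: the powi calls use different exponents (%P and %Q), so no
; vectorization is expected and the scalar calls remain.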
define void @vec_powi_f32_neg(ptr %a, ptr %b, ptr %c, i32 %P, i32 %Q) {
; CHECK-LABEL: @vec_powi_f32_neg(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[I0:%.*]] = load float, ptr [[A:%.*]], align 4
; CHECK-NEXT:    [[I1:%.*]] = load float, ptr [[B:%.*]], align 4
; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[I0]], [[I1]]
; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD1]], i32 [[P:%.*]]) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
; CHECK-NEXT:    [[I3:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[I2]], [[I3]]
; CHECK-NEXT:    [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD2]], i32 [[Q:%.*]]) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
; CHECK-NEXT:    [[I4:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
; CHECK-NEXT:    [[I5:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[I4]], [[I5]]
; CHECK-NEXT:    [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD3]], i32 [[P]]) #[[ATTR3]]
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
; CHECK-NEXT:    [[I6:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3
; CHECK-NEXT:    [[I7:%.*]] = load float, ptr [[ARRAYIDX7]], align 4
; CHECK-NEXT:    [[ADD4:%.*]] = fadd float [[I6]], [[I7]]
; CHECK-NEXT:    [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[ADD4]], i32 [[Q]]) #[[ATTR3]]
; CHECK-NEXT:    store float [[CALL1]], ptr [[C:%.*]], align 4
; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[C]], i32 1
; CHECK-NEXT:    store float [[CALL2]], ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[C]], i32 2
; CHECK-NEXT:    store float [[CALL3]], ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[C]], i32 3
; CHECK-NEXT:    store float [[CALL4]], ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT:    ret void
;
entry:
  %i0 = load float, ptr %a, align 4
  %i1 = load float, ptr %b, align 4
  %add1 = fadd float %i0, %i1
  %call1 = tail call float @llvm.powi.f32.i32(float %add1,i32 %P) nounwind readnone

  %arrayidx2 = getelementptr inbounds float, ptr %a, i32 1
  %i2 = load float, ptr %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds float, ptr %b, i32 1
  %i3 = load float, ptr %arrayidx3, align 4
  %add2 = fadd float %i2, %i3
  %call2 = tail call float @llvm.powi.f32.i32(float %add2,i32 %Q) nounwind readnone

  %arrayidx4 = getelementptr inbounds float, ptr %a, i32 2
  %i4 = load float, ptr %arrayidx4, align 4
  %arrayidx5 = getelementptr inbounds float, ptr %b, i32 2
  %i5 = load float, ptr %arrayidx5, align 4
  %add3 = fadd float %i4, %i5
  %call3 = tail call float @llvm.powi.f32.i32(float %add3,i32 %P) nounwind readnone

  %arrayidx6 = getelementptr inbounds float, ptr %a, i32 3
  %i6 = load float, ptr %arrayidx6, align 4
  %arrayidx7 = getelementptr inbounds float, ptr %b, i32 3
  %i7 = load float, ptr %arrayidx7, align 4
  %add4 = fadd float %i6, %i7
  %call4 = tail call float @llvm.powi.f32.i32(float %add4,i32 %Q) nounwind readnone

  store float %call1, ptr %c, align 4
  %arrayidx8 = getelementptr inbounds float, ptr %c, i32 1
  store float %call2, ptr %arrayidx8, align 4
  %arrayidx9 = getelementptr inbounds float, ptr %c, i32 2
  store float %call3, ptr %arrayidx9, align 4
  %arrayidx10 = getelementptr inbounds float, ptr %c, i32 3
  store float %call4, ptr %arrayidx10, align 4
  ret void

}