// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
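
// Check that the __builtin_elementwise_* math builtins lower directly to the
// corresponding LLVM intrinsics (or, for fmod, the plain frem instruction)
// for scalar, vector, _BitInt, and bitfield operands.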

typedef _Float16 half;
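
// ext_vector_type(N) declares an OpenCL-style vector of N elements; the
// elementwise builtins apply their operation to every lane, e.g.
// __builtin_elementwise_abs(vf1) takes |x| of each of the four floats in a
// float4.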
typedef half half2 __attribute__((ext_vector_type(2)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));

__attribute__((address_space(1))) int int_as_one;
typedef int bar;
bar b;

struct StructWithBitfield {
  int i : 5;
  short s : 3;
  char c : 2;
  long long int lli : 3;
};
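// The four bitfields total 13 bits and share a single 16-bit storage unit,
// which is why the bitfield tests below load an i16 and extract each field
// from it with shl/ashr pairs.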

void test_builtin_elementwise_abs(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, si8 vi1, si8 vi2,
                                  long long int i1, long long int i2, short si,
                                  _BitInt(31) bi1, _BitInt(31) bi2, int i,
                                  char ci) {
  // CHECK-LABEL: define void @test_builtin_elementwise_abs(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.fabs.f32(float [[F1]])
  f2 = __builtin_elementwise_abs(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.fabs.f64(double [[D1]])
  d2 = __builtin_elementwise_abs(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_abs(vf1);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.abs.i64(i64 [[I1]], i1 false)
  i2 = __builtin_elementwise_abs(i1);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK:      [[S1:%.+]] = trunc i64 [[I1]] to i16
  // CHECK-NEXT: call i16 @llvm.abs.i16(i16 [[S1]], i1 false)
  i1 = __builtin_elementwise_abs((short)i1);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[VI1]], i1 false)
  vi2 = __builtin_elementwise_abs(vi1);

  // CHECK:      [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[CVI2]], i1 false)
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_abs(cvi2);
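
  // A _BitInt(31) lives in a 32-bit container (and a _BitInt(55) in a 64-bit
  // one): the value is loaded at container width, then truncated to the
  // declared width before the intrinsic call.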
  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.abs.i31(i31 [[LOADEDV]], i1 false)
  bi2 = __builtin_elementwise_abs(bi1);

  // CHECK:      [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.abs.i32(i32 [[IA1]], i1 false)
  b = __builtin_elementwise_abs(int_as_one);

  // CHECK:   call i32 @llvm.abs.i32(i32 -10, i1 false)
  b = __builtin_elementwise_abs(-10);

  // CHECK:      [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[RES:%.+]] = call i16 @llvm.abs.i16(i16 [[SI]], i1 false)
  si = __builtin_elementwise_abs(si);

  struct StructWithBitfield t;
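
  // A signed bitfield is extracted by shifting it to the top of the i16
  // storage unit (shl) and arithmetic-shifting it back down (ashr), which
  // sign-extends the field before it is converted to the promoted type.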
  // CHECK:      [[BFLOAD:%.+]] = load i16, ptr %t, align 8
  // CHECK-NEXT: [[BFSHL:%.+]] = shl i16 [[BFLOAD]], 11
  // CHECK-NEXT: [[BFASHR:%.+]] = ashr i16 [[BFSHL]], 11
  // CHECK-NEXT: [[BFCAST:%.+]] = sext i16 [[BFASHR]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.abs.i32(i32 [[BFCAST]], i1 false)
  i = __builtin_elementwise_abs(t.i);

  // CHECK:      [[BFLOAD:%.+]] = load i16, ptr %t, align 8
  // CHECK-NEXT: [[BFSHL:%.+]] = shl i16 [[BFLOAD]], 8
  // CHECK-NEXT: [[BFASHR:%.+]] = ashr i16 [[BFSHL]], 13
  // CHECK-NEXT: [[RES:%.+]] = call i16 @llvm.abs.i16(i16 [[BFASHR]], i1 false)
  si = __builtin_elementwise_abs(t.s);

  // CHECK:      [[BFLOAD:%.+]] = load i16, ptr %t, align 8
  // CHECK-NEXT: [[BFSHL:%.+]] = shl i16 [[BFLOAD]], 6
  // CHECK-NEXT: [[BFASHR:%.+]] = ashr i16 [[BFSHL]], 14
  // CHECK-NEXT: [[BFCAST:%.+]] = trunc i16 [[BFASHR]] to i8
  // CHECK-NEXT: [[RES:%.+]] = call i8 @llvm.abs.i8(i8 [[BFCAST]], i1 false)
  ci = __builtin_elementwise_abs(t.c);

  // CHECK:      [[BFLOAD:%.+]] = load i16, ptr %t, align 8
  // CHECK-NEXT: [[BFSHL:%.+]] = shl i16 [[BFLOAD]], 3
  // CHECK-NEXT: [[BFASHR:%.+]] = ashr i16 [[BFSHL]], 13
  // CHECK-NEXT: [[BFCAST:%.+]] = sext i16 [[BFASHR]] to i64
  // CHECK-NEXT: [[RES:%.+]] = call i64 @llvm.abs.i64(i64 [[BFCAST]], i1 false)
  i1 = __builtin_elementwise_abs(t.lli);
}

void test_builtin_elementwise_add_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2,
                                      char c1, char c2, unsigned char uc1,
                                      unsigned char uc2, short s1, short s2,
                                      unsigned short us1, unsigned short us2) {
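  // The saturating builtins pick the signed or unsigned intrinsic
  // (llvm.sadd.sat vs. llvm.uadd.sat) based on the operands' signedness.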
  // CHECK-LABEL: define void @test_builtin_elementwise_add_sat(
  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_add_sat(i1, i2);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_add_sat(i1, 10ll);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_add_sat(vi1, vi2);

  // CHECK:      [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.uadd.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_add_sat(u1, u2);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_add_sat(vu1, vu2);

  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.sadd.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_add_sat(bi1, bi2);

  // CHECK:      [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.uadd.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_add_sat(bu1, bu2);

  // CHECK:      [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.sadd.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_add_sat(int_as_one, b);
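
  // With two constant operands the call folds in the front end:
  // 1 + 'a' (97) == 98, stored directly.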
  // CHECK: store i64 98, ptr %i1.addr, align 8
  i1 = __builtin_elementwise_add_sat(1, 'a');

  // CHECK:      [[C1:%.+]] = load i8, ptr %c1.addr, align 1
  // CHECK-NEXT: [[C2:%.+]] = load i8, ptr %c2.addr, align 1
  // CHECK-NEXT: call i8 @llvm.sadd.sat.i8(i8 [[C1]], i8 [[C2]])
  c1 = __builtin_elementwise_add_sat(c1, c2);

  // CHECK:      [[UC1:%.+]] = load i8, ptr %uc1.addr, align 1
  // CHECK-NEXT: [[UC2:%.+]] = load i8, ptr %uc2.addr, align 1
  // CHECK-NEXT: call i8 @llvm.uadd.sat.i8(i8 [[UC1]], i8 [[UC2]])
  uc1 = __builtin_elementwise_add_sat(uc1, uc2);

  // CHECK:      [[S1:%.+]] = load i16, ptr %s1.addr, align 2
  // CHECK-NEXT: [[S2:%.+]] = load i16, ptr %s2.addr, align 2
  // CHECK-NEXT: call i16 @llvm.sadd.sat.i16(i16 [[S1]], i16 [[S2]])
  s1 = __builtin_elementwise_add_sat(s1, s2);

  // CHECK:      [[S1:%.+]] = load i16, ptr %s1.addr, align 2
  // CHECK:      [[I1:%.+]] = sext i16 [[S1]] to i32
  // CHECK-NEXT: [[S2:%.+]] = load i16, ptr %s2.addr, align 2
  // CHECK:      [[I2:%.+]] = sext i16 [[S2]] to i32
  // CHECK-NEXT: call i32 @llvm.sadd.sat.i32(i32 [[I1]], i32 [[I2]])
  s1 = __builtin_elementwise_add_sat((int)s1, (int)s2);

  // CHECK:      [[US1:%.+]] = load i16, ptr %us1.addr, align 2
  // CHECK-NEXT: [[US2:%.+]] = load i16, ptr %us2.addr, align 2
  // CHECK-NEXT: call i16 @llvm.uadd.sat.i16(i16 [[US1]], i16 [[US2]])
  us1 = __builtin_elementwise_add_sat(us1, us2);
}

void test_builtin_elementwise_sub_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2,
                                      char c1, char c2, unsigned char uc1,
                                      unsigned char uc2, short s1, short s2,
                                      unsigned short us1, unsigned short us2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sub_sat(
  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_sub_sat(i1, i2);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_sub_sat(i1, 10ll);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_sub_sat(vi1, vi2);

  // CHECK:      [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.usub.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_sub_sat(u1, u2);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_sub_sat(vu1, vu2);

  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.ssub.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_sub_sat(bi1, bi2);

  // CHECK:      [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.usub.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_sub_sat(bu1, bu2);

  // CHECK:      [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.ssub.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_sub_sat(int_as_one, b);
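
  // Folds like the add_sat case above: 1 - 'a' (97) == -96.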
  // CHECK: store i64 -96, ptr %i1.addr, align 8
  i1 = __builtin_elementwise_sub_sat(1, 'a');

  // CHECK:      [[C1:%.+]] = load i8, ptr %c1.addr, align 1
  // CHECK-NEXT: [[C2:%.+]] = load i8, ptr %c2.addr, align 1
  // CHECK-NEXT: call i8 @llvm.ssub.sat.i8(i8 [[C1]], i8 [[C2]])
  c1 = __builtin_elementwise_sub_sat(c1, c2);

  // CHECK:      [[UC1:%.+]] = load i8, ptr %uc1.addr, align 1
  // CHECK-NEXT: [[UC2:%.+]] = load i8, ptr %uc2.addr, align 1
  // CHECK-NEXT: call i8 @llvm.usub.sat.i8(i8 [[UC1]], i8 [[UC2]])
  uc1 = __builtin_elementwise_sub_sat(uc1, uc2);

  // CHECK:      [[S1:%.+]] = load i16, ptr %s1.addr, align 2
  // CHECK-NEXT: [[S2:%.+]] = load i16, ptr %s2.addr, align 2
  // CHECK-NEXT: call i16 @llvm.ssub.sat.i16(i16 [[S1]], i16 [[S2]])
  s1 = __builtin_elementwise_sub_sat(s1, s2);

  // CHECK:      [[US1:%.+]] = load i16, ptr %us1.addr, align 2
  // CHECK-NEXT: [[US2:%.+]] = load i16, ptr %us2.addr, align 2
  // CHECK-NEXT: call i16 @llvm.usub.sat.i16(i16 [[US1]], i16 [[US2]])
  us1 = __builtin_elementwise_sub_sat(us1, us2);
}
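
// llvm.maximum/llvm.minimum implement IEEE-754-2019 maximum/minimum and
// propagate NaNs; llvm.maxnum/llvm.minnum (used by __builtin_elementwise_max
// and _min further down) instead return the non-NaN operand.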
void test_builtin_elementwise_maximum(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_maximum(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maximum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_maximum(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maximum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_maximum(d1, d2);

  // CHECK:      [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maximum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_maximum(20.0, d2);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_maximum(vf1, vf2);

  // CHECK:      [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_maximum(cvf1, vf2);

  // CHECK:      [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_maximum(vf2, cvf1);
}

void test_builtin_elementwise_minimum(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_minimum(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minimum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_minimum(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minimum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_minimum(d1, d2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minimum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_minimum(d1, 2.0);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_minimum(vf1, vf2);

  // CHECK:      [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_minimum(cvf1, vf2);

  // CHECK:      [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_minimum(vf2, cvf1);
}

void test_builtin_elementwise_max(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_max(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maxnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_max(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_max(d1, d2);

  // CHECK:      [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_max(20.0, d2);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_max(vf1, vf2);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_max(i1, i2);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_max(i1, 10ll);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_max(vi1, vi2);

  // CHECK:      [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umax.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_max(u1, u2);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_max(vu1, vu2);

  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.smax.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_max(bi1, bi2);

  // CHECK:      [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.umax.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_max(bu1, bu2);

  // CHECK:      [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_max(cvf1, vf2);

  // CHECK:      [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_max(vf2, cvf1);

  // CHECK:      [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smax.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_max(int_as_one, b);

  // CHECK: call i32 @llvm.smax.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_max(1, 'a');
}

void test_builtin_elementwise_min(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_min(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_min(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_min(d1, d2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_min(d1, 2.0);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_min(vf1, vf2);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_min(i1, i2);

  // CHECK:      [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 -11, i64 [[I2]])
  i1 = __builtin_elementwise_min(-11ll, i2);

  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK:      [[S1:%.+]] = trunc i64 [[I1]] to i16
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK:      [[S2:%.+]] = trunc i64 [[I2]] to i16
  // CHECK-NEXT: call i16 @llvm.smin.i16(i16 [[S1]], i16 [[S2]])
  i1 = __builtin_elementwise_min((short)i1, (short)i2);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_min(vi1, vi2);

  // CHECK:      [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_min(u1, u2);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_min(vu1, vu2);

  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.smin.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_min(bi1, bi2);

  // CHECK:      [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.umin.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_min(bu1, bu2);

  // CHECK:      [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_min(cvf1, vf2);

  // CHECK:      [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_min(vf2, cvf1);

  // CHECK:      [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smin.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_min(int_as_one, b);
}

void test_builtin_elementwise_bitreverse(si8 vi1, si8 vi2,
                                  long long int i1, long long int i2, short si,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  char ci) {
  // CHECK-LABEL: define void @test_builtin_elementwise_bitreverse(
  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.bitreverse.i64(i64 [[I1]])
  i2 = __builtin_elementwise_bitreverse(i1);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_bitreverse(vi1);

  // CHECK:      [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_bitreverse(cvi2);

  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.bitreverse.i31(i31 [[LOADEDV]])
  bi2 = __builtin_elementwise_bitreverse(bi1);

  // CHECK:      [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.bitreverse.i32(i32 [[IA1]])
  b = __builtin_elementwise_bitreverse(int_as_one);
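
  // Constant operands fold at compile time: -10 is 0xFFFFFFF6, which
  // bit-reverses to 0x6FFFFFFF == 1879048191; the 16- and 8-bit cases below
  // fold to 28671 (0x6FFF) and 111 (0x6F) the same way.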
  // CHECK:      store i32 1879048191, ptr @b, align 4
  b = __builtin_elementwise_bitreverse(-10);

  // CHECK:      [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[RES:%.+]] = call i16 @llvm.bitreverse.i16(i16 [[SI]])
  si = __builtin_elementwise_bitreverse(si);

  // CHECK:      store i16 28671, ptr %si.addr, align 2
  si = __builtin_elementwise_bitreverse((short)-10);

  // CHECK:      store i16 28671, ptr %si.addr, align 2
  si = __builtin_elementwise_bitreverse((unsigned short)-10);

  // CHECK:      [[CI:%.+]] = load i8, ptr %ci.addr, align 1
  // CHECK-NEXT: [[RES:%.+]] = call i8 @llvm.bitreverse.i8(i8 [[CI]])
  ci = __builtin_elementwise_bitreverse(ci);

  // CHECK:      store i8 111, ptr %ci.addr, align 1
  ci = __builtin_elementwise_bitreverse((unsigned char)-10);

  // CHECK:      store i8 111, ptr %ci.addr, align 1
  ci = __builtin_elementwise_bitreverse((char)-10);
}

void test_builtin_elementwise_ceil(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_ceil(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.ceil.f32(float [[F1]])
  f2 = __builtin_elementwise_ceil(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.ceil.f64(double [[D1]])
  d2 = __builtin_elementwise_ceil(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_ceil(vf1);
}

void test_builtin_elementwise_acos(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_acos(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.acos.f32(float [[F1]])
  f2 = __builtin_elementwise_acos(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.acos.f64(double [[D1]])
  d2 = __builtin_elementwise_acos(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.acos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_acos(vf1);
}

void test_builtin_elementwise_asin(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_asin(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.asin.f32(float [[F1]])
  f2 = __builtin_elementwise_asin(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.asin.f64(double [[D1]])
  d2 = __builtin_elementwise_asin(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.asin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_asin(vf1);
}

void test_builtin_elementwise_atan(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_atan(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.atan.f32(float [[F1]])
  f2 = __builtin_elementwise_atan(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.atan.f64(double [[D1]])
  d2 = __builtin_elementwise_atan(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.atan.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_atan(vf1);
}

void test_builtin_elementwise_atan2(float f1, float f2, float f3, double d1,
                                    double d2, double d3, float4 vf1,
                                    float4 vf2, float4 vf3) {
  // CHECK-LABEL: define void @test_builtin_elementwise_atan2(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.atan2.f32(float [[F1]], float [[F2]])
  f3 = __builtin_elementwise_atan2(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.atan2.f64(double [[D1]], double [[D2]])
  d3 = __builtin_elementwise_atan2(d1, d2);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.atan2.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf3 = __builtin_elementwise_atan2(vf1, vf2);
}

void test_builtin_elementwise_cos(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cos(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cos.f32(float [[F1]])
  f2 = __builtin_elementwise_cos(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cos.f64(double [[D1]])
  d2 = __builtin_elementwise_cos(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cos(vf1);
}

void test_builtin_elementwise_cosh(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cosh(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cosh.f32(float [[F1]])
  f2 = __builtin_elementwise_cosh(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cosh.f64(double [[D1]])
  d2 = __builtin_elementwise_cosh(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cosh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cosh(vf1);
}

void test_builtin_elementwise_exp(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp.f32(float [[F1]])
  f2 = __builtin_elementwise_exp(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp.f64(double [[D1]])
  d2 = __builtin_elementwise_exp(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp(vf1);
}

void test_builtin_elementwise_exp2(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp2(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp2.f32(float [[F1]])
  f2 = __builtin_elementwise_exp2(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp2.f64(double [[D1]])
  d2 = __builtin_elementwise_exp2(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp2(vf1);
}

void test_builtin_elementwise_floor(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_floor(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.floor.f32(float [[F1]])
  f2 = __builtin_elementwise_floor(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.floor.f64(double [[D1]])
  d2 = __builtin_elementwise_floor(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.floor.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_floor(vf1);
}

void test_builtin_elementwise_log(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log.f32(float [[F1]])
  f2 = __builtin_elementwise_log(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log.f64(double [[D1]])
  d2 = __builtin_elementwise_log(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log(vf1);
}

void test_builtin_elementwise_log10(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log10(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log10.f32(float [[F1]])
  f2 = __builtin_elementwise_log10(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log10.f64(double [[D1]])
  d2 = __builtin_elementwise_log10(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log10.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log10(vf1);
}

void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log2(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log2.f32(float [[F1]])
  f2 = __builtin_elementwise_log2(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log2.f64(double [[D1]])
  d2 = __builtin_elementwise_log2(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log2(vf1);
}

void test_builtin_elementwise_popcount(si8 vi1, si8 vi2, long long int i1,
                                       long long int i2, short si,
                                       _BitInt(31) bi1, _BitInt(31) bi2,
                                       char ci) {
  // CHECK-LABEL: define void @test_builtin_elementwise_popcount(
  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ctpop.i64(i64 [[I1]])
  i2 = __builtin_elementwise_popcount(i1);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_popcount(vi1);

  // CHECK:      [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_popcount(cvi2);

  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.ctpop.i31(i31 [[LOADEDV]])
  bi2 = __builtin_elementwise_popcount(bi1);

  // CHECK:      [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.ctpop.i32(i32 [[IA1]])
  b = __builtin_elementwise_popcount(int_as_one);
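
  // Constant operands fold: -10 as i32 is 0xFFFFFFF6 with 30 set bits;
  // 32771 (0x8003) has 3 and 192 (0xC0) has 2.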
  // CHECK:      store i32 30, ptr @b, align 4
  b = __builtin_elementwise_popcount(-10);

  // CHECK:      [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[RES:%.+]] = call i16 @llvm.ctpop.i16(i16 [[SI]])
  si = __builtin_elementwise_popcount(si);

  // CHECK:      store i16 3, ptr %si.addr, align 2
  si = __builtin_elementwise_popcount((unsigned short)32771);

  // CHECK:      store i16 3, ptr %si.addr, align 2
  si = __builtin_elementwise_popcount((short)32771);

  // CHECK:      [[CI:%.+]] = load i8, ptr %ci.addr, align 1
  // CHECK-NEXT: [[RES:%.+]] = call i8 @llvm.ctpop.i8(i8 [[CI]])
  ci = __builtin_elementwise_popcount(ci);

  // CHECK:      store i8 2, ptr %ci.addr, align 1
  ci = __builtin_elementwise_popcount((unsigned char)192);

  // CHECK:      store i8 2, ptr %ci.addr, align 1
  ci = __builtin_elementwise_popcount((char)192);
}
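
// Unlike the intrinsic-based builtins above, fmod lowers to the plain IR
// frem instruction.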
void test_builtin_elementwise_fmod(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2) {

  // CHECK-LABEL: define void @test_builtin_elementwise_fmod(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK:      [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: frem float [[F1]], [[F2]]
  f2 = __builtin_elementwise_fmod(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK:      [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: frem double [[D1]], [[D2]]
  d2 = __builtin_elementwise_fmod(d1, d2);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK:      [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: frem <4 x float> [[VF1]], [[VF2]]
  vf2 = __builtin_elementwise_fmod(vf1, vf2);
}

void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2) {

  // CHECK-LABEL: define void @test_builtin_elementwise_pow(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK:      [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.pow.f32(float [[F1]], float [[F2]])
  f2 = __builtin_elementwise_pow(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK:      [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.pow.f64(double [[D1]], double [[D2]])
  d2 = __builtin_elementwise_pow(d1, d2);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK:      [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.pow.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf2 = __builtin_elementwise_pow(vf1, vf2);
}

void test_builtin_elementwise_roundeven(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_roundeven(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.roundeven.f32(float [[F1]])
  f2 = __builtin_elementwise_roundeven(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.roundeven.f64(double [[D1]])
  d2 = __builtin_elementwise_roundeven(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_roundeven(vf1);
}

void test_builtin_elementwise_round(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_round(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.round.f32(float [[F1]])
  f2 = __builtin_elementwise_round(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.round.f64(double [[D1]])
  d2 = __builtin_elementwise_round(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.round.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_round(vf1);
}

void test_builtin_elementwise_rint(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_rint(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.rint.f32(float [[F1]])
  f2 = __builtin_elementwise_rint(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.rint.f64(double [[D1]])
  d2 = __builtin_elementwise_rint(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.rint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_rint(vf1);
}

void test_builtin_elementwise_nearbyint(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_nearbyint(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.nearbyint.f32(float [[F1]])
  f2 = __builtin_elementwise_nearbyint(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.nearbyint.f64(double [[D1]])
  d2 = __builtin_elementwise_nearbyint(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_nearbyint(vf1);
}

void test_builtin_elementwise_sin(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sin(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sin.f32(float [[F1]])
  f2 = __builtin_elementwise_sin(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sin.f64(double [[D1]])
  d2 = __builtin_elementwise_sin(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sin(vf1);
}

void test_builtin_elementwise_sinh(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sinh(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sinh.f32(float [[F1]])
  f2 = __builtin_elementwise_sinh(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sinh.f64(double [[D1]])
  d2 = __builtin_elementwise_sinh(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sinh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sinh(vf1);
}

void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sqrt(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sqrt.f32(float [[F1]])
  f2 = __builtin_elementwise_sqrt(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sqrt.f64(double [[D1]])
  d2 = __builtin_elementwise_sqrt(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sqrt(vf1);
}

void test_builtin_elementwise_tan(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_tan(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.tan.f32(float [[F1]])
  f2 = __builtin_elementwise_tan(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.tan.f64(double [[D1]])
  d2 = __builtin_elementwise_tan(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.tan.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_tan(vf1);
}

void test_builtin_elementwise_tanh(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_tanh(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.tanh.f32(float [[F1]])
  f2 = __builtin_elementwise_tanh(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.tanh.f64(double [[D1]])
  d2 = __builtin_elementwise_tanh(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.tanh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_tanh(vf1);
}

void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.trunc.f32(float [[F1]])
  f2 = __builtin_elementwise_trunc(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.trunc.f64(double [[D1]])
  d2 = __builtin_elementwise_trunc(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_trunc(vf1);
}

void test_builtin_elementwise_canonicalize(float f1, float f2, double d1, double d2,
                                           float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_canonicalize(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.canonicalize.f32(float [[F1]])
  f2 = __builtin_elementwise_canonicalize(f1);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.canonicalize.f64(double [[D1]])
  d2 = __builtin_elementwise_canonicalize(d1);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.canonicalize.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_canonicalize(vf1);
}

void test_builtin_elementwise_copysign(float f1, float f2, double d1, double d2,
                                       float4 vf1, float4 vf2, double2 v2f64) {
  // CHECK-LABEL: define void @test_builtin_elementwise_copysign(
  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.copysign.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_copysign(f1, f2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_copysign(d1, d2);

  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_copysign(d1, 2.0);

  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_copysign(vf1, vf2);

  // CHECK:      [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_copysign(cvf1, vf2);

  // CHECK:      [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_copysign(vf2, cvf1);

  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float [[F1]], float 2.000000e+00)
  f1 = __builtin_elementwise_copysign(f1, 2.0f);

  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float 2.000000e+00, float [[F1]])
  f1 = __builtin_elementwise_copysign(2.0f, f1);

  // CHECK:      [[V2F64:%.+]] = load <2 x double>, ptr %v2f64.addr, align 16
  // CHECK-NEXT: call <2 x double> @llvm.copysign.v2f64(<2 x double> splat (double 1.000000e+00), <2 x double> [[V2F64]])
  v2f64 = __builtin_elementwise_copysign((double2)1.0, v2f64);
}

void test_builtin_elementwise_fma(float f32, double f64,
                                  float2 v2f32, float4 v4f32,
                                  double2 v2f64, double3 v3f64,
                                  const float4 c_v4f32,
                                  half f16, half2 v2f16) {
  // CHECK-LABEL: define void @test_builtin_elementwise_fma(
  // CHECK:      [[F32_0:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_1:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_2:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: call float @llvm.fma.f32(float [[F32_0]], float [[F32_1]], float [[F32_2]])
  float f2 = __builtin_elementwise_fma(f32, f32, f32);

  // CHECK:      [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  double d2 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK:      [[V4F32_0:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  float4 tmp_v4f32 = __builtin_elementwise_fma(v4f32, v4f32, v4f32);

  // FIXME: Are we really still doing the 3-element vector load workaround?
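  // A <3 x double> occupies the storage of <4 x double>, so each operand is
  // loaded as a padded <4 x double> and narrowed back with a shufflevector.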
  // CHECK:      [[V3F64_LOAD_0:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_0:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_1:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_1:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_2:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_2:%.+]] = shufflevector
  // CHECK-NEXT: call <3 x double> @llvm.fma.v3f64(<3 x double> [[V3F64_0]], <3 x double> [[V3F64_1]], <3 x double> [[V3F64_2]])
  v3f64 = __builtin_elementwise_fma(v3f64, v3f64, v3f64);

  // CHECK:      [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  v2f64 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK:      [[V4F32_0:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  v4f32 = __builtin_elementwise_fma(c_v4f32, c_v4f32, c_v4f32);

  // CHECK:      [[F16_0:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_1:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: call half @llvm.fma.f16(half [[F16_0]], half [[F16_1]], half [[F16_2]])
  half tmp_f16 = __builtin_elementwise_fma(f16, f16, f16);

  // CHECK:      [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_2:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp0_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, v2f16);
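
  // Casting a scalar to half2 splats it: an insertelement followed by a
  // shufflevector with an all-zero mask.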
  // CHECK:      [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[V2F16_2_INSERT:%.+]] = insertelement
  // CHECK-NEXT: [[V2F16_2:%.+]] = shufflevector <2 x half> [[V2F16_2_INSERT]], <2 x half> poison, <2 x i32> zeroinitializer
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp1_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)f16);

  // CHECK:      [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> splat (half 0xH4400))
  half2 tmp2_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)4.0);
}