xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll (revision 818d715989a82a54bac038b9c293e34dbea45f5c)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=Accelerate -S %s | FileCheck %s
3; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -S %s | FileCheck --check-prefix NOACCELERATE %s
4
5target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
6target triple = "arm64-apple-ios14.0.0"
7
8declare float @llvm.sin.f32(float)
9
10; Accelerate provides sin() for <4 x float>
; Loads a <4 x float> from %a and applies the scalar @llvm.sin.f32 intrinsic to
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vsinf. Expected output without a vector library (second RUN line): lanes 0
; and 1 keep their scalar intrinsic calls, lanes 2 and 3 are combined into one
; @llvm.sin.v2f32 call.
11define <4 x float> @int_sin_4x(ptr %a) {
12; CHECK-LABEL: @int_sin_4x(
13; CHECK-NEXT:  entry:
14; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
15; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
16; CHECK-NEXT:    ret <4 x float> [[TMP1]]
17;
18; NOACCELERATE-LABEL: @int_sin_4x(
19; NOACCELERATE-NEXT:  entry:
20; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
21; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
22; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
23; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
24; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
25; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
26; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
27; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
28; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
29; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
30; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
31; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
32;
33entry:
; Scalar input pattern: one extractelement / call / insertelement triple per
; lane, chained through %vecins .. %vecins.3.
34  %0 = load <4 x float>, ptr %a, align 16
35  %vecext = extractelement <4 x float> %0, i32 0
36  %1 = tail call fast float @llvm.sin.f32(float %vecext)
37  %vecins = insertelement <4 x float> undef, float %1, i32 0
38  %vecext.1 = extractelement <4 x float> %0, i32 1
39  %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
40  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
41  %vecext.2 = extractelement <4 x float> %0, i32 2
42  %3 = tail call fast float @llvm.sin.f32(float %vecext.2)
43  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
44  %vecext.3 = extractelement <4 x float> %0, i32 3
45  %4 = tail call fast float @llvm.sin.f32(float %vecext.3)
46  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
47  ret <4 x float> %vecins.3
48}
49
50declare float @ceilf(float) readonly nounwind willreturn
51
; Loads a <4 x float> from %a and calls the scalar library function @ceilf on
; each of the four lanes, rebuilding the result with insertelement.
; Both RUN configurations (with and without -vector-library=Accelerate) expect
; the same output: a single call to the @llvm.ceil.v4f32 intrinsic.
52define <4 x float> @ceil_4x(ptr %a) {
53; CHECK-LABEL: @ceil_4x(
54; CHECK-NEXT:  entry:
55; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
56; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
57; CHECK-NEXT:    ret <4 x float> [[TMP1]]
58;
59; NOACCELERATE-LABEL: @ceil_4x(
60; NOACCELERATE-NEXT:  entry:
61; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
62; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
63; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
64;
65entry:
; Scalar input pattern: one extractelement / @ceilf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
66  %0 = load <4 x float>, ptr %a, align 16
67  %vecext = extractelement <4 x float> %0, i32 0
68  %1 = tail call fast float @ceilf(float %vecext)
69  %vecins = insertelement <4 x float> undef, float %1, i32 0
70  %vecext.1 = extractelement <4 x float> %0, i32 1
71  %2 = tail call fast float @ceilf(float %vecext.1)
72  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
73  %vecext.2 = extractelement <4 x float> %0, i32 2
74  %3 = tail call fast float @ceilf(float %vecext.2)
75  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
76  %vecext.3 = extractelement <4 x float> %0, i32 3
77  %4 = tail call fast float @ceilf(float %vecext.3)
78  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
79  ret <4 x float> %vecins.3
80}
81
82declare float @fabsf(float) readonly nounwind willreturn
83
; Loads a <4 x float> from %a and calls the scalar library function @fabsf on
; each of the four lanes, rebuilding the result with insertelement.
; Both RUN configurations (with and without -vector-library=Accelerate) expect
; the same output: a single call to the @llvm.fabs.v4f32 intrinsic.
84define <4 x float> @fabs_4x(ptr %a) {
85; CHECK-LABEL: @fabs_4x(
86; CHECK-NEXT:  entry:
87; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
88; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
89; CHECK-NEXT:    ret <4 x float> [[TMP1]]
90;
91; NOACCELERATE-LABEL: @fabs_4x(
92; NOACCELERATE-NEXT:  entry:
93; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
94; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
95; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
96;
97entry:
; Scalar input pattern: one extractelement / @fabsf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
98  %0 = load <4 x float>, ptr %a, align 16
99  %vecext = extractelement <4 x float> %0, i32 0
100  %1 = tail call fast float @fabsf(float %vecext)
101  %vecins = insertelement <4 x float> undef, float %1, i32 0
102  %vecext.1 = extractelement <4 x float> %0, i32 1
103  %2 = tail call fast float @fabsf(float %vecext.1)
104  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
105  %vecext.2 = extractelement <4 x float> %0, i32 2
106  %3 = tail call fast float @fabsf(float %vecext.2)
107  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
108  %vecext.3 = extractelement <4 x float> %0, i32 3
109  %4 = tail call fast float @fabsf(float %vecext.3)
110  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
111  ret <4 x float> %vecins.3
112}
113declare float @llvm.fabs.f32(float)
; Same shape as @fabs_4x, but the scalar calls use the @llvm.fabs.f32 intrinsic
; instead of the @fabsf library function.
; Both RUN configurations (with and without -vector-library=Accelerate) expect
; the same output: a single call to the @llvm.fabs.v4f32 intrinsic.
114define <4 x float> @int_fabs_4x(ptr %a) {
115; CHECK-LABEL: @int_fabs_4x(
116; CHECK-NEXT:  entry:
117; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
118; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
119; CHECK-NEXT:    ret <4 x float> [[TMP1]]
120;
121; NOACCELERATE-LABEL: @int_fabs_4x(
122; NOACCELERATE-NEXT:  entry:
123; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
124; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
125; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
126;
127entry:
; Scalar input pattern: one extractelement / @llvm.fabs.f32 call /
; insertelement triple per lane, chained through %vecins .. %vecins.3.
128  %0 = load <4 x float>, ptr %a, align 16
129  %vecext = extractelement <4 x float> %0, i32 0
130  %1 = tail call fast float @llvm.fabs.f32(float %vecext)
131  %vecins = insertelement <4 x float> undef, float %1, i32 0
132  %vecext.1 = extractelement <4 x float> %0, i32 1
133  %2 = tail call fast float @llvm.fabs.f32(float %vecext.1)
134  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
135  %vecext.2 = extractelement <4 x float> %0, i32 2
136  %3 = tail call fast float @llvm.fabs.f32(float %vecext.2)
137  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
138  %vecext.3 = extractelement <4 x float> %0, i32 3
139  %4 = tail call fast float @llvm.fabs.f32(float %vecext.3)
140  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
141  ret <4 x float> %vecins.3
142}
143declare float @floorf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @floorf on
; each of the four lanes, rebuilding the result with insertelement.
; Both RUN configurations (with and without -vector-library=Accelerate) expect
; the same output: a single call to the @llvm.floor.v4f32 intrinsic.
144define <4 x float> @floor_4x(ptr %a) {
145; CHECK-LABEL: @floor_4x(
146; CHECK-NEXT:  entry:
147; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
148; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
149; CHECK-NEXT:    ret <4 x float> [[TMP1]]
150;
151; NOACCELERATE-LABEL: @floor_4x(
152; NOACCELERATE-NEXT:  entry:
153; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
154; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
155; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
156;
157entry:
; Scalar input pattern: one extractelement / @floorf call / insertelement
; triple per lane, chained through %vecins .. %vecins.3.
158  %0 = load <4 x float>, ptr %a, align 16
159  %vecext = extractelement <4 x float> %0, i32 0
160  %1 = tail call fast float @floorf(float %vecext)
161  %vecins = insertelement <4 x float> undef, float %1, i32 0
162  %vecext.1 = extractelement <4 x float> %0, i32 1
163  %2 = tail call fast float @floorf(float %vecext.1)
164  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
165  %vecext.2 = extractelement <4 x float> %0, i32 2
166  %3 = tail call fast float @floorf(float %vecext.2)
167  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
168  %vecext.3 = extractelement <4 x float> %0, i32 3
169  %4 = tail call fast float @floorf(float %vecext.3)
170  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
171  ret <4 x float> %vecins.3
172}
173declare float @sqrtf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @sqrtf on
; each of the four lanes, rebuilding the result with insertelement.
; Both RUN configurations (with and without -vector-library=Accelerate) expect
; the same output: a single call to the @llvm.sqrt.v4f32 intrinsic.
174define <4 x float> @sqrt_4x(ptr %a) {
175; CHECK-LABEL: @sqrt_4x(
176; CHECK-NEXT:  entry:
177; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
178; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
179; CHECK-NEXT:    ret <4 x float> [[TMP1]]
180;
181; NOACCELERATE-LABEL: @sqrt_4x(
182; NOACCELERATE-NEXT:  entry:
183; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
184; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
185; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
186;
187entry:
; Scalar input pattern: one extractelement / @sqrtf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
188  %0 = load <4 x float>, ptr %a, align 16
189  %vecext = extractelement <4 x float> %0, i32 0
190  %1 = tail call fast float @sqrtf(float %vecext)
191  %vecins = insertelement <4 x float> undef, float %1, i32 0
192  %vecext.1 = extractelement <4 x float> %0, i32 1
193  %2 = tail call fast float @sqrtf(float %vecext.1)
194  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
195  %vecext.2 = extractelement <4 x float> %0, i32 2
196  %3 = tail call fast float @sqrtf(float %vecext.2)
197  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
198  %vecext.3 = extractelement <4 x float> %0, i32 3
199  %4 = tail call fast float @sqrtf(float %vecext.3)
200  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
201  ret <4 x float> %vecins.3
202}
203declare float @expf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @expf on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vexpf. Expected output without a vector library (second RUN line): lanes 0
; and 1 keep their scalar @expf calls, lanes 2 and 3 are combined into one
; @llvm.exp.v2f32 call.
204define <4 x float> @exp_4x(ptr %a) {
205; CHECK-LABEL: @exp_4x(
206; CHECK-NEXT:  entry:
207; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
208; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
209; CHECK-NEXT:    ret <4 x float> [[TMP1]]
210;
211; NOACCELERATE-LABEL: @exp_4x(
212; NOACCELERATE-NEXT:  entry:
213; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
214; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
215; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
216; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
217; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
218; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
219; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
220; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
221; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
222; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
223; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
224; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
225;
226entry:
; Scalar input pattern: one extractelement / @expf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
227  %0 = load <4 x float>, ptr %a, align 16
228  %vecext = extractelement <4 x float> %0, i32 0
229  %1 = tail call fast float @expf(float %vecext)
230  %vecins = insertelement <4 x float> undef, float %1, i32 0
231  %vecext.1 = extractelement <4 x float> %0, i32 1
232  %2 = tail call fast float @expf(float %vecext.1)
233  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
234  %vecext.2 = extractelement <4 x float> %0, i32 2
235  %3 = tail call fast float @expf(float %vecext.2)
236  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
237  %vecext.3 = extractelement <4 x float> %0, i32 3
238  %4 = tail call fast float @expf(float %vecext.3)
239  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
240  ret <4 x float> %vecins.3
241}
242declare float @expm1f(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @expm1f on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vexpm1f. Expected output without a vector library (second RUN line): the
; four scalar @expm1f calls are left as they are.
243define <4 x float> @expm1_4x(ptr %a) {
244; CHECK-LABEL: @expm1_4x(
245; CHECK-NEXT:  entry:
246; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
247; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
248; CHECK-NEXT:    ret <4 x float> [[TMP1]]
249;
250; NOACCELERATE-LABEL: @expm1_4x(
251; NOACCELERATE-NEXT:  entry:
252; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
253; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
254; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @expm1f(float [[VECEXT]])
255; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
256; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
257; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expm1f(float [[VECEXT_1]])
258; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
259; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
260; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @expm1f(float [[VECEXT_2]])
261; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
262; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
263; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @expm1f(float [[VECEXT_3]])
264; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
265; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
266;
267entry:
; Scalar input pattern: one extractelement / @expm1f call / insertelement
; triple per lane, chained through %vecins .. %vecins.3.
268  %0 = load <4 x float>, ptr %a, align 16
269  %vecext = extractelement <4 x float> %0, i32 0
270  %1 = tail call fast float @expm1f(float %vecext)
271  %vecins = insertelement <4 x float> undef, float %1, i32 0
272  %vecext.1 = extractelement <4 x float> %0, i32 1
273  %2 = tail call fast float @expm1f(float %vecext.1)
274  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
275  %vecext.2 = extractelement <4 x float> %0, i32 2
276  %3 = tail call fast float @expm1f(float %vecext.2)
277  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
278  %vecext.3 = extractelement <4 x float> %0, i32 3
279  %4 = tail call fast float @expm1f(float %vecext.3)
280  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
281  ret <4 x float> %vecins.3
282}
283declare float @logf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @logf on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vlogf. Expected output without a vector library (second RUN line): lanes 0
; and 1 keep their scalar @logf calls, lanes 2 and 3 are combined into one
; @llvm.log.v2f32 call.
284define <4 x float> @log_4x(ptr %a) {
285; CHECK-LABEL: @log_4x(
286; CHECK-NEXT:  entry:
287; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
288; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
289; CHECK-NEXT:    ret <4 x float> [[TMP1]]
290;
291; NOACCELERATE-LABEL: @log_4x(
292; NOACCELERATE-NEXT:  entry:
293; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
294; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
295; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
296; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
297; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
298; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
299; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
300; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
301; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
302; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
303; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
304; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
305;
306entry:
; Scalar input pattern: one extractelement / @logf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
307  %0 = load <4 x float>, ptr %a, align 16
308  %vecext = extractelement <4 x float> %0, i32 0
309  %1 = tail call fast float @logf(float %vecext)
310  %vecins = insertelement <4 x float> undef, float %1, i32 0
311  %vecext.1 = extractelement <4 x float> %0, i32 1
312  %2 = tail call fast float @logf(float %vecext.1)
313  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
314  %vecext.2 = extractelement <4 x float> %0, i32 2
315  %3 = tail call fast float @logf(float %vecext.2)
316  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
317  %vecext.3 = extractelement <4 x float> %0, i32 3
318  %4 = tail call fast float @logf(float %vecext.3)
319  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
320  ret <4 x float> %vecins.3
321}
322declare float @log1pf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @log1pf on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vlog1pf. Expected output without a vector library (second RUN line): the
; four scalar @log1pf calls are left as they are.
323define <4 x float> @log1p_4x(ptr %a) {
324; CHECK-LABEL: @log1p_4x(
325; CHECK-NEXT:  entry:
326; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
327; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
328; CHECK-NEXT:    ret <4 x float> [[TMP1]]
329;
330; NOACCELERATE-LABEL: @log1p_4x(
331; NOACCELERATE-NEXT:  entry:
332; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
333; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
334; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @log1pf(float [[VECEXT]])
335; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
336; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
337; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @log1pf(float [[VECEXT_1]])
338; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
339; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
340; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @log1pf(float [[VECEXT_2]])
341; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
342; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
343; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @log1pf(float [[VECEXT_3]])
344; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
345; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
346;
347entry:
; Scalar input pattern: one extractelement / @log1pf call / insertelement
; triple per lane, chained through %vecins .. %vecins.3.
348  %0 = load <4 x float>, ptr %a, align 16
349  %vecext = extractelement <4 x float> %0, i32 0
350  %1 = tail call fast float @log1pf(float %vecext)
351  %vecins = insertelement <4 x float> undef, float %1, i32 0
352  %vecext.1 = extractelement <4 x float> %0, i32 1
353  %2 = tail call fast float @log1pf(float %vecext.1)
354  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
355  %vecext.2 = extractelement <4 x float> %0, i32 2
356  %3 = tail call fast float @log1pf(float %vecext.2)
357  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
358  %vecext.3 = extractelement <4 x float> %0, i32 3
359  %4 = tail call fast float @log1pf(float %vecext.3)
360  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
361  ret <4 x float> %vecins.3
362}
363declare float @log10pf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar function @log10pf on each of
; the four lanes, rebuilding the result with insertelement.
; Both RUN configurations (with and without -vector-library=Accelerate) expect
; the four scalar @log10pf calls to be left as they are, i.e. this acts as a
; negative case.
; Review note (unconfirmed): @log10pf does not look like a standard libm name;
; presumably it was chosen, or is a typo for log10f, so that no vector mapping
; applies. Confirm before changing the callee.
364define <4 x float> @log10p_4x(ptr %a) {
365; CHECK-LABEL: @log10p_4x(
366; CHECK-NEXT:  entry:
367; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
368; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
369; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
370; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
371; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
372; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
373; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
374; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
375; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
376; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
377; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
378; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
379; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
380; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
381;
382; NOACCELERATE-LABEL: @log10p_4x(
383; NOACCELERATE-NEXT:  entry:
384; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
385; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
386; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
387; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
388; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
389; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
390; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
391; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
392; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
393; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
394; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
395; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
396; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
397; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
398;
399entry:
; Scalar input pattern: one extractelement / @log10pf call / insertelement
; triple per lane, chained through %vecins .. %vecins.3.
400  %0 = load <4 x float>, ptr %a, align 16
401  %vecext = extractelement <4 x float> %0, i32 0
402  %1 = tail call fast float @log10pf(float %vecext)
403  %vecins = insertelement <4 x float> undef, float %1, i32 0
404  %vecext.1 = extractelement <4 x float> %0, i32 1
405  %2 = tail call fast float @log10pf(float %vecext.1)
406  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
407  %vecext.2 = extractelement <4 x float> %0, i32 2
408  %3 = tail call fast float @log10pf(float %vecext.2)
409  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
410  %vecext.3 = extractelement <4 x float> %0, i32 3
411  %4 = tail call fast float @log10pf(float %vecext.3)
412  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
413  ret <4 x float> %vecins.3
414}
415declare float @logbf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @logbf on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vlogbf. Expected output without a vector library (second RUN line): the
; four scalar @logbf calls are left as they are.
416define <4 x float> @logb_4x(ptr %a) {
417; CHECK-LABEL: @logb_4x(
418; CHECK-NEXT:  entry:
419; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
420; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
421; CHECK-NEXT:    ret <4 x float> [[TMP1]]
422;
423; NOACCELERATE-LABEL: @logb_4x(
424; NOACCELERATE-NEXT:  entry:
425; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
426; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
427; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @logbf(float [[VECEXT]])
428; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
429; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
430; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logbf(float [[VECEXT_1]])
431; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
432; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
433; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @logbf(float [[VECEXT_2]])
434; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
435; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
436; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @logbf(float [[VECEXT_3]])
437; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
438; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
439;
440entry:
; Scalar input pattern: one extractelement / @logbf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
441  %0 = load <4 x float>, ptr %a, align 16
442  %vecext = extractelement <4 x float> %0, i32 0
443  %1 = tail call fast float @logbf(float %vecext)
444  %vecins = insertelement <4 x float> undef, float %1, i32 0
445  %vecext.1 = extractelement <4 x float> %0, i32 1
446  %2 = tail call fast float @logbf(float %vecext.1)
447  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
448  %vecext.2 = extractelement <4 x float> %0, i32 2
449  %3 = tail call fast float @logbf(float %vecext.2)
450  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
451  %vecext.3 = extractelement <4 x float> %0, i32 3
452  %4 = tail call fast float @logbf(float %vecext.3)
453  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
454  ret <4 x float> %vecins.3
455}
456declare float @sinf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @sinf on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vsinf. Expected output without a vector library (second RUN line): lanes 0
; and 1 keep their scalar @sinf calls, lanes 2 and 3 are combined into one
; @llvm.sin.v2f32 call.
457define <4 x float> @sin_4x(ptr %a) {
458; CHECK-LABEL: @sin_4x(
459; CHECK-NEXT:  entry:
460; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
461; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
462; CHECK-NEXT:    ret <4 x float> [[TMP1]]
463;
464; NOACCELERATE-LABEL: @sin_4x(
465; NOACCELERATE-NEXT:  entry:
466; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
467; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
468; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
469; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
470; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
471; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
472; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
473; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
474; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
475; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
476; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
477; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
478;
479entry:
; Scalar input pattern: one extractelement / @sinf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
480  %0 = load <4 x float>, ptr %a, align 16
481  %vecext = extractelement <4 x float> %0, i32 0
482  %1 = tail call fast float @sinf(float %vecext)
483  %vecins = insertelement <4 x float> undef, float %1, i32 0
484  %vecext.1 = extractelement <4 x float> %0, i32 1
485  %2 = tail call fast float @sinf(float %vecext.1)
486  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
487  %vecext.2 = extractelement <4 x float> %0, i32 2
488  %3 = tail call fast float @sinf(float %vecext.2)
489  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
490  %vecext.3 = extractelement <4 x float> %0, i32 3
491  %4 = tail call fast float @sinf(float %vecext.3)
492  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
493  ret <4 x float> %vecins.3
494}
495declare float @cosf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @cosf on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vcosf. Expected output without a vector library (second RUN line): lanes 0
; and 1 keep their scalar @cosf calls, lanes 2 and 3 are combined into one
; @llvm.cos.v2f32 call.
496define <4 x float> @cos_4x(ptr %a) {
497; CHECK-LABEL: @cos_4x(
498; CHECK-NEXT:  entry:
499; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
500; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
501; CHECK-NEXT:    ret <4 x float> [[TMP1]]
502;
503; NOACCELERATE-LABEL: @cos_4x(
504; NOACCELERATE-NEXT:  entry:
505; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
506; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
507; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
508; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
509; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
510; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
511; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
512; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
513; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
514; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
515; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
516; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
517;
518entry:
; Scalar input pattern: one extractelement / @cosf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
519  %0 = load <4 x float>, ptr %a, align 16
520  %vecext = extractelement <4 x float> %0, i32 0
521  %1 = tail call fast float @cosf(float %vecext)
522  %vecins = insertelement <4 x float> undef, float %1, i32 0
523  %vecext.1 = extractelement <4 x float> %0, i32 1
524  %2 = tail call fast float @cosf(float %vecext.1)
525  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
526  %vecext.2 = extractelement <4 x float> %0, i32 2
527  %3 = tail call fast float @cosf(float %vecext.2)
528  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
529  %vecext.3 = extractelement <4 x float> %0, i32 3
530  %4 = tail call fast float @cosf(float %vecext.3)
531  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
532  ret <4 x float> %vecins.3
533}
534declare float @tanf(float) readonly nounwind willreturn
; Loads a <4 x float> from %a and calls the scalar library function @tanf on
; each of the four lanes, rebuilding the result with insertelement.
; Expected output with -vector-library=Accelerate (first RUN line): one call to
; @vtanf. Expected output without a vector library (second RUN line): lanes 0
; and 1 keep their scalar @tanf calls, lanes 2 and 3 are combined into one
; @llvm.tan.v2f32 call.
535define <4 x float> @tan_4x(ptr %a) {
536; CHECK-LABEL: @tan_4x(
537; CHECK-NEXT:  entry:
538; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
539; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
540; CHECK-NEXT:    ret <4 x float> [[TMP1]]
541;
542; NOACCELERATE-LABEL: @tan_4x(
543; NOACCELERATE-NEXT:  entry:
544; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
545; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
546; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
547; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
548; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
549; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
550; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
551; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
552; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
553; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
554; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
555; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
556;
557entry:
; Scalar input pattern: one extractelement / @tanf call / insertelement triple
; per lane, chained through %vecins .. %vecins.3.
558  %0 = load <4 x float>, ptr %a, align 16
559  %vecext = extractelement <4 x float> %0, i32 0
560  %1 = tail call fast float @tanf(float %vecext)
561  %vecins = insertelement <4 x float> undef, float %1, i32 0
562  %vecext.1 = extractelement <4 x float> %0, i32 1
563  %2 = tail call fast float @tanf(float %vecext.1)
564  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
565  %vecext.2 = extractelement <4 x float> %0, i32 2
566  %3 = tail call fast float @tanf(float %vecext.2)
567  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
568  %vecext.3 = extractelement <4 x float> %0, i32 3
569  %4 = tail call fast float @tanf(float %vecext.3)
570  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
571  ret <4 x float> %vecins.3
572}
573declare float @asinf(float) readonly nounwind willreturn
; Four scalar calls to the libm function @asinf, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vasinf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar @asinf calls and lanes 2-3 become
; one @llvm.asin.v2f32 call.
574define <4 x float> @asin_4x(ptr %a) {
575; CHECK-LABEL: @asin_4x(
576; CHECK-NEXT:  entry:
577; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
578; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
579; CHECK-NEXT:    ret <4 x float> [[TMP1]]
580;
581; NOACCELERATE-LABEL: @asin_4x(
582; NOACCELERATE-NEXT:  entry:
583; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
584; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
585; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @asinf(float [[VECEXT]])
586; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
587; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
588; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
589; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
590; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
591; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
592; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
593; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
594; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
595;
596entry:
597  %0 = load <4 x float>, ptr %a, align 16
598  %vecext = extractelement <4 x float> %0, i32 0
599  %1 = tail call fast float @asinf(float %vecext)
600  %vecins = insertelement <4 x float> undef, float %1, i32 0
601  %vecext.1 = extractelement <4 x float> %0, i32 1
602  %2 = tail call fast float @asinf(float %vecext.1)
603  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
604  %vecext.2 = extractelement <4 x float> %0, i32 2
605  %3 = tail call fast float @asinf(float %vecext.2)
606  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
607  %vecext.3 = extractelement <4 x float> %0, i32 3
608  %4 = tail call fast float @asinf(float %vecext.3)
609  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
610  ret <4 x float> %vecins.3
611}
; Four scalar calls to the @llvm.asin.f32 intrinsic, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vasinf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar intrinsic calls and lanes 2-3 become
; one @llvm.asin.v2f32 call.
612define <4 x float> @int_asin_4x(ptr %a) {
613; CHECK-LABEL: @int_asin_4x(
614; CHECK-NEXT:  entry:
615; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
616; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
617; CHECK-NEXT:    ret <4 x float> [[TMP1]]
618;
619; NOACCELERATE-LABEL: @int_asin_4x(
620; NOACCELERATE-NEXT:  entry:
621; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
622; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
623; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]])
624; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
625; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
626; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
627; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
628; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
629; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
630; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
631; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
632; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
633;
634entry:
635  %0 = load <4 x float>, ptr %a, align 16
636  %vecext = extractelement <4 x float> %0, i32 0
637  %1 = tail call fast float @llvm.asin.f32(float %vecext)
638  %vecins = insertelement <4 x float> undef, float %1, i32 0
639  %vecext.1 = extractelement <4 x float> %0, i32 1
640  %2 = tail call fast float @llvm.asin.f32(float %vecext.1)
641  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
642  %vecext.2 = extractelement <4 x float> %0, i32 2
643  %3 = tail call fast float @llvm.asin.f32(float %vecext.2)
644  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
645  %vecext.3 = extractelement <4 x float> %0, i32 3
646  %4 = tail call fast float @llvm.asin.f32(float %vecext.3)
647  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
648  ret <4 x float> %vecins.3
649}
650declare float @acosf(float) readonly nounwind willreturn
; Four scalar calls to the libm function @acosf, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vacosf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar @acosf calls and lanes 2-3 become
; one @llvm.acos.v2f32 call.
651define <4 x float> @acos_4x(ptr %a) {
652; CHECK-LABEL: @acos_4x(
653; CHECK-NEXT:  entry:
654; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
655; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
656; CHECK-NEXT:    ret <4 x float> [[TMP1]]
657;
658; NOACCELERATE-LABEL: @acos_4x(
659; NOACCELERATE-NEXT:  entry:
660; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
661; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
662; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
663; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
664; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
665; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
666; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
667; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
668; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
669; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
670; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
671; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
672;
673entry:
674  %0 = load <4 x float>, ptr %a, align 16
675  %vecext = extractelement <4 x float> %0, i32 0
676  %1 = tail call fast float @acosf(float %vecext)
677  %vecins = insertelement <4 x float> undef, float %1, i32 0
678  %vecext.1 = extractelement <4 x float> %0, i32 1
679  %2 = tail call fast float @acosf(float %vecext.1)
680  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
681  %vecext.2 = extractelement <4 x float> %0, i32 2
682  %3 = tail call fast float @acosf(float %vecext.2)
683  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
684  %vecext.3 = extractelement <4 x float> %0, i32 3
685  %4 = tail call fast float @acosf(float %vecext.3)
686  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
687  ret <4 x float> %vecins.3
688}
; Four scalar calls to the @llvm.acos.f32 intrinsic, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vacosf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar intrinsic calls and lanes 2-3 become
; one @llvm.acos.v2f32 call.
689define <4 x float> @int_acos_4x(ptr %a) {
690; CHECK-LABEL: @int_acos_4x(
691; CHECK-NEXT:  entry:
692; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
693; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
694; CHECK-NEXT:    ret <4 x float> [[TMP1]]
695;
696; NOACCELERATE-LABEL: @int_acos_4x(
697; NOACCELERATE-NEXT:  entry:
698; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
699; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
700; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
701; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
702; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
703; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
704; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
705; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
706; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
707; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
708; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
709; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
710;
711entry:
712  %0 = load <4 x float>, ptr %a, align 16
713  %vecext = extractelement <4 x float> %0, i32 0
714  %1 = tail call fast float @llvm.acos.f32(float %vecext)
715  %vecins = insertelement <4 x float> undef, float %1, i32 0
716  %vecext.1 = extractelement <4 x float> %0, i32 1
717  %2 = tail call fast float @llvm.acos.f32(float %vecext.1)
718  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
719  %vecext.2 = extractelement <4 x float> %0, i32 2
720  %3 = tail call fast float @llvm.acos.f32(float %vecext.2)
721  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
722  %vecext.3 = extractelement <4 x float> %0, i32 3
723  %4 = tail call fast float @llvm.acos.f32(float %vecext.3)
724  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
725  ret <4 x float> %vecins.3
726}
727declare float @atanf(float) readonly nounwind willreturn
; Four scalar calls to the libm function @atanf, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vatanf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar @atanf calls and lanes 2-3 become
; one @llvm.atan.v2f32 call.
728define <4 x float> @atan_4x(ptr %a) {
729; CHECK-LABEL: @atan_4x(
730; CHECK-NEXT:  entry:
731; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
732; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
733; CHECK-NEXT:    ret <4 x float> [[TMP1]]
734;
735; NOACCELERATE-LABEL: @atan_4x(
736; NOACCELERATE-NEXT:  entry:
737; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
738; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
739; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
740; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
741; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
742; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
743; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
744; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
745; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
746; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
747; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
748; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
749;
750entry:
751  %0 = load <4 x float>, ptr %a, align 16
752  %vecext = extractelement <4 x float> %0, i32 0
753  %1 = tail call fast float @atanf(float %vecext)
754  %vecins = insertelement <4 x float> undef, float %1, i32 0
755  %vecext.1 = extractelement <4 x float> %0, i32 1
756  %2 = tail call fast float @atanf(float %vecext.1)
757  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
758  %vecext.2 = extractelement <4 x float> %0, i32 2
759  %3 = tail call fast float @atanf(float %vecext.2)
760  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
761  %vecext.3 = extractelement <4 x float> %0, i32 3
762  %4 = tail call fast float @atanf(float %vecext.3)
763  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
764  ret <4 x float> %vecins.3
765}
; Four scalar calls to the @llvm.atan.f32 intrinsic, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vatanf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar intrinsic calls and lanes 2-3 become
; one @llvm.atan.v2f32 call.
766define <4 x float> @int_atan_4x(ptr %a) {
767; CHECK-LABEL: @int_atan_4x(
768; CHECK-NEXT:  entry:
769; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
770; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
771; CHECK-NEXT:    ret <4 x float> [[TMP1]]
772;
773; NOACCELERATE-LABEL: @int_atan_4x(
774; NOACCELERATE-NEXT:  entry:
775; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
776; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
777; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]])
778; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
779; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
780; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
781; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
782; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
783; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
784; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
785; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
786; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
787;
788entry:
789  %0 = load <4 x float>, ptr %a, align 16
790  %vecext = extractelement <4 x float> %0, i32 0
791  %1 = tail call fast float @llvm.atan.f32(float %vecext)
792  %vecins = insertelement <4 x float> undef, float %1, i32 0
793  %vecext.1 = extractelement <4 x float> %0, i32 1
794  %2 = tail call fast float @llvm.atan.f32(float %vecext.1)
795  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
796  %vecext.2 = extractelement <4 x float> %0, i32 2
797  %3 = tail call fast float @llvm.atan.f32(float %vecext.2)
798  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
799  %vecext.3 = extractelement <4 x float> %0, i32 3
800  %4 = tail call fast float @llvm.atan.f32(float %vecext.3)
801  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
802  ret <4 x float> %vecins.3
803}
804declare float @atan2f(float,float) readonly nounwind willreturn
; Four scalar calls to the two-operand libm function @atan2f, pairing lane i
; of the vector loaded from %a with lane i of the vector loaded from %b. With
; -vector-library=Accelerate (CHECK) the expected output is a single call to
; @vatan2f taking both whole vectors. Without it (NOACCELERATE) lanes 0-1 stay
; as scalar @atan2f calls and lanes 2-3 become one @llvm.atan2.v2f32 call.
805define <4 x float> @atan2_4x(ptr %a, ptr %b) {
806; CHECK-LABEL: @atan2_4x(
807; CHECK-NEXT:  entry:
808; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
809; CHECK-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
810; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
811; CHECK-NEXT:    ret <4 x float> [[TMP1]]
812;
813; NOACCELERATE-LABEL: @atan2_4x(
814; NOACCELERATE-NEXT:  entry:
815; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
816; NOACCELERATE-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
817; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
818; NOACCELERATE-NEXT:    [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
819; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]])
820; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
821; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
822; NOACCELERATE-NEXT:    [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
823; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
824; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
825; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
826; NOACCELERATE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
827; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
828; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
829; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
830; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
831;
832entry:
833  %0 = load <4 x float>, ptr %a, align 16
834  %bb = load <4 x float>, ptr %b, align 16
835  %vecext = extractelement <4 x float> %0, i32 0
836  %vecextb = extractelement <4 x float> %bb, i32 0
837  %1 = tail call fast float @atan2f(float %vecext, float %vecextb)
838  %vecins = insertelement <4 x float> undef, float %1, i32 0
839  %vecext.1 = extractelement <4 x float> %0, i32 1
840  %vecextb.1 = extractelement <4 x float> %bb, i32 1
841  %2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1)
842  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
843  %vecext.2 = extractelement <4 x float> %0, i32 2
844  %vecextb.2 = extractelement <4 x float> %bb, i32 2
845  %3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2)
846  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
847  %vecext.3 = extractelement <4 x float> %0, i32 3
848  %vecextb.3 = extractelement <4 x float> %bb, i32 3
849  %4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3)
850  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
851  ret <4 x float> %vecins.3
852}
; Four scalar calls to the two-operand @llvm.atan2.f32 intrinsic, pairing lane
; i of the vector loaded from %a with lane i of the vector loaded from %b.
; With -vector-library=Accelerate (CHECK) the expected output is a single call
; to @vatan2f taking both whole vectors. Without it (NOACCELERATE) lanes 0-1
; stay as scalar intrinsic calls and lanes 2-3 become one @llvm.atan2.v2f32
; call.
853define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
854; CHECK-LABEL: @int_atan2_4x(
855; CHECK-NEXT:  entry:
856; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
857; CHECK-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
858; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
859; CHECK-NEXT:    ret <4 x float> [[TMP1]]
860;
861; NOACCELERATE-LABEL: @int_atan2_4x(
862; NOACCELERATE-NEXT:  entry:
863; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
864; NOACCELERATE-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
865; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
866; NOACCELERATE-NEXT:    [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
867; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]])
868; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
869; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
870; NOACCELERATE-NEXT:    [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
871; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
872; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
873; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
874; NOACCELERATE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
875; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
876; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
877; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
878; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
879;
880entry:
881  %0 = load <4 x float>, ptr %a, align 16
882  %bb = load <4 x float>, ptr %b, align 16
883  %vecext = extractelement <4 x float> %0, i32 0
884  %vecextb = extractelement <4 x float> %bb, i32 0
885  %1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb)
886  %vecins = insertelement <4 x float> undef, float %1, i32 0
887  %vecext.1 = extractelement <4 x float> %0, i32 1
888  %vecextb.1 = extractelement <4 x float> %bb, i32 1
889  %2 = tail call fast float @llvm.atan2.f32(float %vecext.1, float %vecextb.1)
890  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
891  %vecext.2 = extractelement <4 x float> %0, i32 2
892  %vecextb.2 = extractelement <4 x float> %bb, i32 2
893  %3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2)
894  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
895  %vecext.3 = extractelement <4 x float> %0, i32 3
896  %vecextb.3 = extractelement <4 x float> %bb, i32 3
897  %4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3)
898  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
899  ret <4 x float> %vecins.3
900}
901declare float @sinhf(float) readonly nounwind willreturn
; Four scalar calls to the libm function @sinhf, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vsinhf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar @sinhf calls and lanes 2-3 become
; one @llvm.sinh.v2f32 call.
902define <4 x float> @sinh_4x(ptr %a) {
903; CHECK-LABEL: @sinh_4x(
904; CHECK-NEXT:  entry:
905; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
906; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
907; CHECK-NEXT:    ret <4 x float> [[TMP1]]
908;
909; NOACCELERATE-LABEL: @sinh_4x(
910; NOACCELERATE-NEXT:  entry:
911; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
912; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
913; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]])
914; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
915; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
916; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
917; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
918; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
919; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
920; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
921; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
922; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
923;
924entry:
925  %0 = load <4 x float>, ptr %a, align 16
926  %vecext = extractelement <4 x float> %0, i32 0
927  %1 = tail call fast float @sinhf(float %vecext)
928  %vecins = insertelement <4 x float> undef, float %1, i32 0
929  %vecext.1 = extractelement <4 x float> %0, i32 1
930  %2 = tail call fast float @sinhf(float %vecext.1)
931  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
932  %vecext.2 = extractelement <4 x float> %0, i32 2
933  %3 = tail call fast float @sinhf(float %vecext.2)
934  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
935  %vecext.3 = extractelement <4 x float> %0, i32 3
936  %4 = tail call fast float @sinhf(float %vecext.3)
937  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
938  ret <4 x float> %vecins.3
939}
; Four scalar calls to the @llvm.sinh.f32 intrinsic, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vsinhf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar intrinsic calls and lanes 2-3 become
; one @llvm.sinh.v2f32 call.
940define <4 x float> @int_sinh_4x(ptr %a) {
941; CHECK-LABEL: @int_sinh_4x(
942; CHECK-NEXT:  entry:
943; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
944; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
945; CHECK-NEXT:    ret <4 x float> [[TMP1]]
946;
947; NOACCELERATE-LABEL: @int_sinh_4x(
948; NOACCELERATE-NEXT:  entry:
949; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
950; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
951; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]])
952; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
953; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
954; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
955; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
956; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
957; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
958; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
959; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
960; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
961;
962entry:
963  %0 = load <4 x float>, ptr %a, align 16
964  %vecext = extractelement <4 x float> %0, i32 0
965  %1 = tail call fast float @llvm.sinh.f32(float %vecext)
966  %vecins = insertelement <4 x float> undef, float %1, i32 0
967  %vecext.1 = extractelement <4 x float> %0, i32 1
968  %2 = tail call fast float @llvm.sinh.f32(float %vecext.1)
969  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
970  %vecext.2 = extractelement <4 x float> %0, i32 2
971  %3 = tail call fast float @llvm.sinh.f32(float %vecext.2)
972  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
973  %vecext.3 = extractelement <4 x float> %0, i32 3
974  %4 = tail call fast float @llvm.sinh.f32(float %vecext.3)
975  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
976  ret <4 x float> %vecins.3
977}
978declare float @coshf(float) readonly nounwind willreturn
; Four scalar calls to the libm function @coshf, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vcoshf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar @coshf calls and lanes 2-3 become
; one @llvm.cosh.v2f32 call.
979define <4 x float> @cosh_4x(ptr %a) {
980; CHECK-LABEL: @cosh_4x(
981; CHECK-NEXT:  entry:
982; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
983; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
984; CHECK-NEXT:    ret <4 x float> [[TMP1]]
985;
986; NOACCELERATE-LABEL: @cosh_4x(
987; NOACCELERATE-NEXT:  entry:
988; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
989; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
990; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @coshf(float [[VECEXT]])
991; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
992; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
993; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
994; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
995; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
996; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
997; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
998; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
999; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1000;
1001entry:
1002  %0 = load <4 x float>, ptr %a, align 16
1003  %vecext = extractelement <4 x float> %0, i32 0
1004  %1 = tail call fast float @coshf(float %vecext)
1005  %vecins = insertelement <4 x float> undef, float %1, i32 0
1006  %vecext.1 = extractelement <4 x float> %0, i32 1
1007  %2 = tail call fast float @coshf(float %vecext.1)
1008  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1009  %vecext.2 = extractelement <4 x float> %0, i32 2
1010  %3 = tail call fast float @coshf(float %vecext.2)
1011  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1012  %vecext.3 = extractelement <4 x float> %0, i32 3
1013  %4 = tail call fast float @coshf(float %vecext.3)
1014  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1015  ret <4 x float> %vecins.3
1016}
; Four scalar calls to the @llvm.cosh.f32 intrinsic, one per lane of the
; <4 x float> loaded from %a. With -vector-library=Accelerate (CHECK) the
; expected output is a single call to @vcoshf on the whole vector. Without it
; (NOACCELERATE) lanes 0-1 stay as scalar intrinsic calls and lanes 2-3 become
; one @llvm.cosh.v2f32 call.
1017define <4 x float> @int_cosh_4x(ptr %a) {
1018; CHECK-LABEL: @int_cosh_4x(
1019; CHECK-NEXT:  entry:
1020; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1021; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
1022; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1023;
1024; NOACCELERATE-LABEL: @int_cosh_4x(
1025; NOACCELERATE-NEXT:  entry:
1026; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1027; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1028; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]])
1029; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1030; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1031; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
1032; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1033; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1034; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
1035; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1036; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1037; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1038;
1039entry:
1040  %0 = load <4 x float>, ptr %a, align 16
1041  %vecext = extractelement <4 x float> %0, i32 0
1042  %1 = tail call fast float @llvm.cosh.f32(float %vecext)
1043  %vecins = insertelement <4 x float> undef, float %1, i32 0
1044  %vecext.1 = extractelement <4 x float> %0, i32 1
1045  %2 = tail call fast float @llvm.cosh.f32(float %vecext.1)
1046  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1047  %vecext.2 = extractelement <4 x float> %0, i32 2
1048  %3 = tail call fast float @llvm.cosh.f32(float %vecext.2)
1049  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1050  %vecext.3 = extractelement <4 x float> %0, i32 3
1051  %4 = tail call fast float @llvm.cosh.f32(float %vecext.3)
1052  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1053  ret <4 x float> %vecins.3
1054}
; Accelerate provides tanh() for <4 x float>: with the Accelerate vector
; library the four scalar @tanhf calls in @tanh_4x are expected to be replaced
; by one call to @vtanhf on the loaded vector.
; Without a vector library, lanes 0 and 1 are expected to keep their scalar
; @tanhf calls while lanes 2 and 3 are expected to use a single
; @llvm.tanh.v2f32 intrinsic call, even though the input calls @tanhf.
1055declare float @tanhf(float) readonly nounwind willreturn
1056define <4 x float> @tanh_4x(ptr %a) {
1057; CHECK-LABEL: @tanh_4x(
1058; CHECK-NEXT:  entry:
1059; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1060; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
1061; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1062;
1063; NOACCELERATE-LABEL: @tanh_4x(
1064; NOACCELERATE-NEXT:  entry:
1065; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1066; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1067; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]])
1068; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1069; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1070; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
1071; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1072; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1073; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
1074; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1075; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1076; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1077;
; Input: load one <4 x float>, call @tanhf on each lane, and rebuild the
; result vector lane by lane with insertelement.
1078entry:
1079  %0 = load <4 x float>, ptr %a, align 16
1080  %vecext = extractelement <4 x float> %0, i32 0
1081  %1 = tail call fast float @tanhf(float %vecext)
1082  %vecins = insertelement <4 x float> undef, float %1, i32 0
1083  %vecext.1 = extractelement <4 x float> %0, i32 1
1084  %2 = tail call fast float @tanhf(float %vecext.1)
1085  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1086  %vecext.2 = extractelement <4 x float> %0, i32 2
1087  %3 = tail call fast float @tanhf(float %vecext.2)
1088  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1089  %vecext.3 = extractelement <4 x float> %0, i32 3
1090  %4 = tail call fast float @tanhf(float %vecext.3)
1091  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1092  ret <4 x float> %vecins.3
1093}
; Intrinsic flavour of the tanh test: with the Accelerate vector library the
; four scalar @llvm.tanh.f32 calls in this function are expected to be
; replaced by one call to @vtanhf on the loaded vector.
; Without a vector library, lanes 0 and 1 are expected to stay scalar while
; lanes 2 and 3 are handled by a single @llvm.tanh.v2f32 call.
1094define <4 x float> @int_tanh_4x(ptr %a) {
1095; CHECK-LABEL: @int_tanh_4x(
1096; CHECK-NEXT:  entry:
1097; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1098; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
1099; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1100;
1101; NOACCELERATE-LABEL: @int_tanh_4x(
1102; NOACCELERATE-NEXT:  entry:
1103; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1104; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1105; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]])
1106; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1107; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1108; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
1109; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1110; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1111; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
1112; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1113; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1114; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1115;
; Input: load one <4 x float>, apply the scalar intrinsic to each lane, and
; rebuild the result vector lane by lane with insertelement.
1116entry:
1117  %0 = load <4 x float>, ptr %a, align 16
1118  %vecext = extractelement <4 x float> %0, i32 0
1119  %1 = tail call fast float @llvm.tanh.f32(float %vecext)
1120  %vecins = insertelement <4 x float> undef, float %1, i32 0
1121  %vecext.1 = extractelement <4 x float> %0, i32 1
1122  %2 = tail call fast float @llvm.tanh.f32(float %vecext.1)
1123  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1124  %vecext.2 = extractelement <4 x float> %0, i32 2
1125  %3 = tail call fast float @llvm.tanh.f32(float %vecext.2)
1126  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1127  %vecext.3 = extractelement <4 x float> %0, i32 3
1128  %4 = tail call fast float @llvm.tanh.f32(float %vecext.3)
1129  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1130  ret <4 x float> %vecins.3
1131}
; Accelerate provides asinh() for <4 x float>: with the Accelerate vector
; library the four scalar @asinhf calls in @asinh_4x are expected to be
; replaced by one call to @vasinhf on the loaded vector.
; Without a vector library, the expected output is the input unchanged: all
; four lanes keep their scalar @asinhf calls.
1132declare float @asinhf(float) readonly nounwind willreturn
1133define <4 x float> @asinh_4x(ptr %a) {
1134; CHECK-LABEL: @asinh_4x(
1135; CHECK-NEXT:  entry:
1136; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1137; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
1138; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1139;
1140; NOACCELERATE-LABEL: @asinh_4x(
1141; NOACCELERATE-NEXT:  entry:
1142; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1143; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1144; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]])
1145; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1146; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1147; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]])
1148; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1149; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1150; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]])
1151; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1152; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1153; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]])
1154; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1155; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
1156;
; Input: load one <4 x float>, call @asinhf on each lane, and rebuild the
; result vector lane by lane with insertelement.
1157entry:
1158  %0 = load <4 x float>, ptr %a, align 16
1159  %vecext = extractelement <4 x float> %0, i32 0
1160  %1 = tail call fast float @asinhf(float %vecext)
1161  %vecins = insertelement <4 x float> undef, float %1, i32 0
1162  %vecext.1 = extractelement <4 x float> %0, i32 1
1163  %2 = tail call fast float @asinhf(float %vecext.1)
1164  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1165  %vecext.2 = extractelement <4 x float> %0, i32 2
1166  %3 = tail call fast float @asinhf(float %vecext.2)
1167  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1168  %vecext.3 = extractelement <4 x float> %0, i32 3
1169  %4 = tail call fast float @asinhf(float %vecext.3)
1170  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1171  ret <4 x float> %vecins.3
1172}
; Accelerate provides acosh() for <4 x float>: with the Accelerate vector
; library the four scalar @acoshf calls in @acosh_4x are expected to be
; replaced by one call to @vacoshf on the loaded vector.
; Without a vector library, the expected output is the input unchanged: all
; four lanes keep their scalar @acoshf calls.
1173declare float @acoshf(float) readonly nounwind willreturn
1174define <4 x float> @acosh_4x(ptr %a) {
1175; CHECK-LABEL: @acosh_4x(
1176; CHECK-NEXT:  entry:
1177; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1178; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
1179; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1180;
1181; NOACCELERATE-LABEL: @acosh_4x(
1182; NOACCELERATE-NEXT:  entry:
1183; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1184; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1185; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]])
1186; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1187; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1188; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]])
1189; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1190; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1191; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]])
1192; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1193; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1194; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]])
1195; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1196; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
1197;
; Input: load one <4 x float>, call @acoshf on each lane, and rebuild the
; result vector lane by lane with insertelement.
1198entry:
1199  %0 = load <4 x float>, ptr %a, align 16
1200  %vecext = extractelement <4 x float> %0, i32 0
1201  %1 = tail call fast float @acoshf(float %vecext)
1202  %vecins = insertelement <4 x float> undef, float %1, i32 0
1203  %vecext.1 = extractelement <4 x float> %0, i32 1
1204  %2 = tail call fast float @acoshf(float %vecext.1)
1205  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1206  %vecext.2 = extractelement <4 x float> %0, i32 2
1207  %3 = tail call fast float @acoshf(float %vecext.2)
1208  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1209  %vecext.3 = extractelement <4 x float> %0, i32 3
1210  %4 = tail call fast float @acoshf(float %vecext.3)
1211  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1212  ret <4 x float> %vecins.3
1213}
; Accelerate provides atanh() for <4 x float>: with the Accelerate vector
; library the four scalar @atanhf calls in @atanh_4x are expected to be
; replaced by one call to @vatanhf on the loaded vector.
; Without a vector library, the expected output is the input unchanged: all
; four lanes keep their scalar @atanhf calls.
1214declare float @atanhf(float) readonly nounwind willreturn
1215define <4 x float> @atanh_4x(ptr %a) {
1216; CHECK-LABEL: @atanh_4x(
1217; CHECK-NEXT:  entry:
1218; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1219; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
1220; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1221;
1222; NOACCELERATE-LABEL: @atanh_4x(
1223; NOACCELERATE-NEXT:  entry:
1224; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1225; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1226; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @atanhf(float [[VECEXT]])
1227; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1228; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1229; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @atanhf(float [[VECEXT_1]])
1230; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1231; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1232; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @atanhf(float [[VECEXT_2]])
1233; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1234; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1235; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @atanhf(float [[VECEXT_3]])
1236; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1237; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
1238;
; Input: load one <4 x float>, call @atanhf on each lane, and rebuild the
; result vector lane by lane with insertelement.
1239entry:
1240  %0 = load <4 x float>, ptr %a, align 16
1241  %vecext = extractelement <4 x float> %0, i32 0
1242  %1 = tail call fast float @atanhf(float %vecext)
1243  %vecins = insertelement <4 x float> undef, float %1, i32 0
1244  %vecext.1 = extractelement <4 x float> %0, i32 1
1245  %2 = tail call fast float @atanhf(float %vecext.1)
1246  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1247  %vecext.2 = extractelement <4 x float> %0, i32 2
1248  %3 = tail call fast float @atanhf(float %vecext.2)
1249  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1250  %vecext.3 = extractelement <4 x float> %0, i32 3
1251  %4 = tail call fast float @atanhf(float %vecext.3)
1252  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1253  ret <4 x float> %vecins.3
1254}
1255
1256; Accelerate *does not* provide sin() for <2 x float>.
; Negative test: with and without the Accelerate vector library, the two
; scalar @llvm.sin.f32 calls are expected to stay scalar. The only expected
; difference is that the Accelerate output carries an attribute-group
; reference on each call.
1257define <2 x float> @sin_2x(ptr %a) {
1258; CHECK-LABEL: @sin_2x(
1259; CHECK-NEXT:  entry:
1260; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1261; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1262; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
1263; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1264; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1265; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
1266; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1267; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
1268;
1269; NOACCELERATE-LABEL: @sin_2x(
1270; NOACCELERATE-NEXT:  entry:
1271; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1272; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1273; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
1274; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1275; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1276; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
1277; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1278; NOACCELERATE-NEXT:    ret <2 x float> [[VECINS_1]]
1279;
; Input: load one <2 x float>, apply the scalar intrinsic to each lane, and
; rebuild the result vector with insertelement.
1280entry:
1281  %0 = load <2 x float>, ptr %a, align 16
1282  %vecext = extractelement <2 x float> %0, i32 0
1283  %1 = tail call fast float @llvm.sin.f32(float %vecext)
1284  %vecins = insertelement <2 x float> undef, float %1, i32 0
1285  %vecext.1 = extractelement <2 x float> %0, i32 1
1286  %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
1287  %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
1288  ret <2 x float> %vecins.1
1289}
1290
1291
1292declare float @llvm.cos.f32(float)
1293
1294; Accelerate provides cos() for <4 x float>
; With the Accelerate vector library the four scalar @llvm.cos.f32 calls are
; expected to be replaced by one call to @vcosf on the loaded vector.
; Without a vector library, lanes 0 and 1 are expected to stay scalar while
; lanes 2 and 3 are handled by a single @llvm.cos.v2f32 call.
1295define <4 x float> @int_cos_4x(ptr %a) {
1296; CHECK-LABEL: @int_cos_4x(
1297; CHECK-NEXT:  entry:
1298; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1299; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
1300; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1301;
1302; NOACCELERATE-LABEL: @int_cos_4x(
1303; NOACCELERATE-NEXT:  entry:
1304; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1305; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1306; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1307; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1308; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1309; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1310; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1311; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1312; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
1313; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1314; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1315; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1316;
; Input: load one <4 x float>, apply the scalar intrinsic to each lane, and
; rebuild the result vector lane by lane with insertelement.
1317entry:
1318  %0 = load <4 x float>, ptr %a, align 16
1319  %vecext = extractelement <4 x float> %0, i32 0
1320  %1 = tail call fast float @llvm.cos.f32(float %vecext)
1321  %vecins = insertelement <4 x float> undef, float %1, i32 0
1322  %vecext.1 = extractelement <4 x float> %0, i32 1
1323  %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1324  %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1325  %vecext.2 = extractelement <4 x float> %0, i32 2
1326  %3 = tail call fast float @llvm.cos.f32(float %vecext.2)
1327  %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1328  %vecext.3 = extractelement <4 x float> %0, i32 3
1329  %4 = tail call fast float @llvm.cos.f32(float %vecext.3)
1330  %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1331  ret <4 x float> %vecins.3
1332}
1333
1334; Accelerate *does not* provide cos() for <2 x float>.
; Negative test: with and without the Accelerate vector library, the two
; scalar @llvm.cos.f32 calls are expected to stay scalar. The only expected
; difference is that the Accelerate output carries an attribute-group
; reference on each call.
1335define <2 x float> @cos_2x(ptr %a) {
1336; CHECK-LABEL: @cos_2x(
1337; CHECK-NEXT:  entry:
1338; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1339; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1340; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
1341; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1342; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1343; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
1344; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1345; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
1346;
1347; NOACCELERATE-LABEL: @cos_2x(
1348; NOACCELERATE-NEXT:  entry:
1349; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1350; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1351; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1352; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1353; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1354; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1355; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1356; NOACCELERATE-NEXT:    ret <2 x float> [[VECINS_1]]
1357;
; Input: load one <2 x float>, apply the scalar intrinsic to each lane, and
; rebuild the result vector with insertelement.
1358entry:
1359  %0 = load <2 x float>, ptr %a, align 16
1360  %vecext = extractelement <2 x float> %0, i32 0
1361  %1 = tail call fast float @llvm.cos.f32(float %vecext)
1362  %vecins = insertelement <2 x float> undef, float %1, i32 0
1363  %vecext.1 = extractelement <2 x float> %0, i32 1
1364  %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1365  %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
1366  ret <2 x float> %vecins.1
1367}
1368