; xref: /llvm-project/llvm/test/CodeGen/X86/extractelement-fp.ll (revision bfd8f7ee4a85ae8873db14fa6e7e31223a1df169)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X64
3; RUN: llc < %s -mtriple=i686-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X86
4
; fneg + extractelement(0) narrows to one scalar-width vxorps with the -0.0
; sign-bit mask; the i686 target returns the float through the stack + x87 (flds).
5define float @fneg_v4f32(<4 x float> %x) nounwind {
6; X64-LABEL: fneg_v4f32:
7; X64:       # %bb.0:
8; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
9; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
10; X64-NEXT:    retq
11;
12; X86-LABEL: fneg_v4f32:
13; X86:       # %bb.0:
14; X86-NEXT:    pushl %eax
15; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
16; X86-NEXT:    vxorps %xmm1, %xmm0, %xmm0
17; X86-NEXT:    vmovss %xmm0, (%esp)
18; X86-NEXT:    flds (%esp)
19; X86-NEXT:    popl %eax
20; X86-NEXT:    retl
21  %v = fneg <4 x float> %x
22  %r = extractelement <4 x float> %v, i32 0
23  ret float %r
24}
25
; f64 variant: sign mask loaded via vmovddup; the 256-bit argument forces
; vzeroupper before ret, and i686 8-byte-aligns the stack to spill for fldl.
26define double @fneg_v4f64(<4 x double> %x) nounwind {
27; X64-LABEL: fneg_v4f64:
28; X64:       # %bb.0:
29; X64-NEXT:    vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
30; X64-NEXT:    # xmm1 = mem[0,0]
31; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
32; X64-NEXT:    vzeroupper
33; X64-NEXT:    retq
34;
35; X86-LABEL: fneg_v4f64:
36; X86:       # %bb.0:
37; X86-NEXT:    pushl %ebp
38; X86-NEXT:    movl %esp, %ebp
39; X86-NEXT:    andl $-8, %esp
40; X86-NEXT:    subl $8, %esp
41; X86-NEXT:    vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
42; X86-NEXT:    # xmm1 = mem[0,0]
43; X86-NEXT:    vxorps %xmm1, %xmm0, %xmm0
44; X86-NEXT:    vmovlps %xmm0, (%esp)
45; X86-NEXT:    fldl (%esp)
46; X86-NEXT:    movl %ebp, %esp
47; X86-NEXT:    popl %ebp
48; X86-NEXT:    vzeroupper
49; X86-NEXT:    retl
50  %v = fneg <4 x double> %x
51  %r = extractelement <4 x double> %v, i32 0
52  ret double %r
53}
54
; Vector fadd feeding extractelement(0) narrows to a single scalar vaddss.
55define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
56; X64-LABEL: fadd_v4f32:
57; X64:       # %bb.0:
58; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
59; X64-NEXT:    retq
60;
61; X86-LABEL: fadd_v4f32:
62; X86:       # %bb.0:
63; X86-NEXT:    pushl %eax
64; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
65; X86-NEXT:    vmovss %xmm0, (%esp)
66; X86-NEXT:    flds (%esp)
67; X86-NEXT:    popl %eax
68; X86-NEXT:    retl
69  %v = fadd <4 x float> %x, %y
70  %r = extractelement <4 x float> %v, i32 0
71  ret float %r
72}
73
; v4f64 fadd + extract(0) narrows to vaddsd; ymm inputs require vzeroupper.
74define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
75; X64-LABEL: fadd_v4f64:
76; X64:       # %bb.0:
77; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
78; X64-NEXT:    vzeroupper
79; X64-NEXT:    retq
80;
81; X86-LABEL: fadd_v4f64:
82; X86:       # %bb.0:
83; X86-NEXT:    pushl %ebp
84; X86-NEXT:    movl %esp, %ebp
85; X86-NEXT:    andl $-8, %esp
86; X86-NEXT:    subl $8, %esp
87; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
88; X86-NEXT:    vmovsd %xmm0, (%esp)
89; X86-NEXT:    fldl (%esp)
90; X86-NEXT:    movl %ebp, %esp
91; X86-NEXT:    popl %ebp
92; X86-NEXT:    vzeroupper
93; X86-NEXT:    retl
94  %v = fadd <4 x double> %x, %y
95  %r = extractelement <4 x double> %v, i32 0
96  ret double %r
97}
98
; Vector fsub + extract(0) narrows to scalar vsubss.
99define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
100; X64-LABEL: fsub_v4f32:
101; X64:       # %bb.0:
102; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0
103; X64-NEXT:    retq
104;
105; X86-LABEL: fsub_v4f32:
106; X86:       # %bb.0:
107; X86-NEXT:    pushl %eax
108; X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0
109; X86-NEXT:    vmovss %xmm0, (%esp)
110; X86-NEXT:    flds (%esp)
111; X86-NEXT:    popl %eax
112; X86-NEXT:    retl
113  %v = fsub <4 x float> %x, %y
114  %r = extractelement <4 x float> %v, i32 0
115  ret float %r
116}
117
; v4f64 fsub + extract(0) narrows to vsubsd; ymm inputs require vzeroupper.
118define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
119; X64-LABEL: fsub_v4f64:
120; X64:       # %bb.0:
121; X64-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
122; X64-NEXT:    vzeroupper
123; X64-NEXT:    retq
124;
125; X86-LABEL: fsub_v4f64:
126; X86:       # %bb.0:
127; X86-NEXT:    pushl %ebp
128; X86-NEXT:    movl %esp, %ebp
129; X86-NEXT:    andl $-8, %esp
130; X86-NEXT:    subl $8, %esp
131; X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
132; X86-NEXT:    vmovsd %xmm0, (%esp)
133; X86-NEXT:    fldl (%esp)
134; X86-NEXT:    movl %ebp, %esp
135; X86-NEXT:    popl %ebp
136; X86-NEXT:    vzeroupper
137; X86-NEXT:    retl
138  %v = fsub <4 x double> %x, %y
139  %r = extractelement <4 x double> %v, i32 0
140  ret double %r
141}
142
; Vector fmul + extract(0) narrows to scalar vmulss.
143define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
144; X64-LABEL: fmul_v4f32:
145; X64:       # %bb.0:
146; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0
147; X64-NEXT:    retq
148;
149; X86-LABEL: fmul_v4f32:
150; X86:       # %bb.0:
151; X86-NEXT:    pushl %eax
152; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0
153; X86-NEXT:    vmovss %xmm0, (%esp)
154; X86-NEXT:    flds (%esp)
155; X86-NEXT:    popl %eax
156; X86-NEXT:    retl
157  %v = fmul <4 x float> %x, %y
158  %r = extractelement <4 x float> %v, i32 0
159  ret float %r
160}
161
; v4f64 fmul + extract(0) narrows to vmulsd; ymm inputs require vzeroupper.
162define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
163; X64-LABEL: fmul_v4f64:
164; X64:       # %bb.0:
165; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
166; X64-NEXT:    vzeroupper
167; X64-NEXT:    retq
168;
169; X86-LABEL: fmul_v4f64:
170; X86:       # %bb.0:
171; X86-NEXT:    pushl %ebp
172; X86-NEXT:    movl %esp, %ebp
173; X86-NEXT:    andl $-8, %esp
174; X86-NEXT:    subl $8, %esp
175; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
176; X86-NEXT:    vmovsd %xmm0, (%esp)
177; X86-NEXT:    fldl (%esp)
178; X86-NEXT:    movl %ebp, %esp
179; X86-NEXT:    popl %ebp
180; X86-NEXT:    vzeroupper
181; X86-NEXT:    retl
182  %v = fmul <4 x double> %x, %y
183  %r = extractelement <4 x double> %v, i32 0
184  ret double %r
185}
186
; Vector fdiv + extract(0) narrows to scalar vdivss (avoids the full-width divide).
187define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
188; X64-LABEL: fdiv_v4f32:
189; X64:       # %bb.0:
190; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0
191; X64-NEXT:    retq
192;
193; X86-LABEL: fdiv_v4f32:
194; X86:       # %bb.0:
195; X86-NEXT:    pushl %eax
196; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0
197; X86-NEXT:    vmovss %xmm0, (%esp)
198; X86-NEXT:    flds (%esp)
199; X86-NEXT:    popl %eax
200; X86-NEXT:    retl
201  %v = fdiv <4 x float> %x, %y
202  %r = extractelement <4 x float> %v, i32 0
203  ret float %r
204}
205
; v4f64 fdiv + extract(0) narrows to vdivsd; ymm inputs require vzeroupper.
206define double @fdiv_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
207; X64-LABEL: fdiv_v4f64:
208; X64:       # %bb.0:
209; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
210; X64-NEXT:    vzeroupper
211; X64-NEXT:    retq
212;
213; X86-LABEL: fdiv_v4f64:
214; X86:       # %bb.0:
215; X86-NEXT:    pushl %ebp
216; X86-NEXT:    movl %esp, %ebp
217; X86-NEXT:    andl $-8, %esp
218; X86-NEXT:    subl $8, %esp
219; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
220; X86-NEXT:    vmovsd %xmm0, (%esp)
221; X86-NEXT:    fldl (%esp)
222; X86-NEXT:    movl %ebp, %esp
223; X86-NEXT:    popl %ebp
224; X86-NEXT:    vzeroupper
225; X86-NEXT:    retl
226  %v = fdiv <4 x double> %x, %y
227  %r = extractelement <4 x double> %v, i32 0
228  ret double %r
229}
230
; frem has no x86 instruction: the scalarized lane 0 lowers to the fmodf
; libcall (a tail call on x86-64; a stack-argument calll on i686).
231define float @frem_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
232; X64-LABEL: frem_v4f32:
233; X64:       # %bb.0:
234; X64-NEXT:    jmp fmodf@PLT # TAILCALL
235;
236; X86-LABEL: frem_v4f32:
237; X86:       # %bb.0:
238; X86-NEXT:    subl $8, %esp
239; X86-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
240; X86-NEXT:    vmovss %xmm0, (%esp)
241; X86-NEXT:    calll fmodf
242; X86-NEXT:    addl $8, %esp
243; X86-NEXT:    retl
244  %v = frem <4 x float> %x, %y
245  %r = extractelement <4 x float> %v, i32 0
246  ret float %r
247}
248
; v4f64 frem lowers to the fmod libcall; the ymm args are killed down to xmm
; and vzeroupper is emitted before transferring control to the library.
249define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
250; X64-LABEL: frem_v4f64:
251; X64:       # %bb.0:
252; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
253; X64-NEXT:    # kill: def $xmm1 killed $xmm1 killed $ymm1
254; X64-NEXT:    vzeroupper
255; X64-NEXT:    jmp fmod@PLT # TAILCALL
256;
257; X86-LABEL: frem_v4f64:
258; X86:       # %bb.0:
259; X86-NEXT:    subl $16, %esp
260; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
261; X86-NEXT:    vmovups %xmm0, (%esp)
262; X86-NEXT:    vzeroupper
263; X86-NEXT:    calll fmod
264; X86-NEXT:    addl $16, %esp
265; X86-NEXT:    retl
266  %v = frem <4 x double> %x, %y
267  %r = extractelement <4 x double> %v, i32 0
268  ret double %r
269}
270
; extract(0) of a vector fcmp ogt becomes a scalar vucomiss + seta;
; identical code on both targets, hence the shared CHECK prefix.
271define i1 @fcmp_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
272; CHECK-LABEL: fcmp_v4f32:
273; CHECK:       # %bb.0:
274; CHECK-NEXT:    vucomiss %xmm1, %xmm0
275; CHECK-NEXT:    seta %al
276; CHECK-NEXT:    ret{{[l|q]}}
277  %v = fcmp ogt <4 x float> %x, %y
278  %r = extractelement <4 x i1> %v, i32 0
279  ret i1 %r
280}
281
; ugt is implemented as vucomisd with swapped operands + setb;
; the ymm inputs force a vzeroupper before return.
282define i1 @fcmp_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
283; CHECK-LABEL: fcmp_v4f64:
284; CHECK:       # %bb.0:
285; CHECK-NEXT:    vucomisd %xmm0, %xmm1
286; CHECK-NEXT:    setb %al
287; CHECK-NEXT:    vzeroupper
288; CHECK-NEXT:    ret{{[l|q]}}
289  %v = fcmp ugt <4 x double> %x, %y
290  %r = extractelement <4 x i1> %v, i32 0
291  ret i1 %r
292}
293
294; If we do the fcmp transform late, make sure we have the right types.
295; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=13700
296
; Regression test for the oss-fuzz report referenced in the comment above:
; the extracted i1 must keep a legal setcc type when the fcmp transform
; runs late; stores through undef, so only the setb pattern matters.
297define void @extsetcc(<4 x float> %x) {
298; X64-LABEL: extsetcc:
299; X64:       # %bb.0:
300; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
301; X64-NEXT:    vucomiss %xmm1, %xmm0
302; X64-NEXT:    setb (%rax)
303; X64-NEXT:    retq
304;
305; X86-LABEL: extsetcc:
306; X86:       # %bb.0:
307; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
308; X86-NEXT:    vucomiss %xmm1, %xmm0
309; X86-NEXT:    setb (%eax)
310; X86-NEXT:    retl
311  %cmp = fcmp ult <4 x float> %x, zeroinitializer
312  %sext = sext <4 x i1> %cmp to <4 x i32>
313  %e = extractelement <4 x i1> %cmp, i1 0
314  store i1 %e, ptr undef
315  ret void
316}
317
318; This used to crash by creating a setcc with an i64 condition on a 32-bit target.
; Crash regression (see preceding comment): select-of-fcmp used to create a
; setcc with an i64 condition on the 32-bit target; the test only requires
; that both targets compile to the cmp/and/insert/permute sequence below.
319define <3 x double> @extvselectsetcc_crash(<2 x double> %x) {
320; X64-LABEL: extvselectsetcc_crash:
321; X64:       # %bb.0:
322; X64-NEXT:    vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
323; X64-NEXT:    vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
324; X64-NEXT:    vandpd %xmm2, %xmm1, %xmm1
325; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
326; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
327; X64-NEXT:    retq
328;
329; X86-LABEL: extvselectsetcc_crash:
330; X86:       # %bb.0:
331; X86-NEXT:    vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
332; X86-NEXT:    vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
333; X86-NEXT:    vandpd %xmm2, %xmm1, %xmm1
334; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
335; X86-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
336; X86-NEXT:    retl
337  %cmp = fcmp oeq <2 x double> %x, <double 5.0, double 5.0>
338  %s = select <2 x i1> %cmp, <2 x double> <double 1.0, double undef>, <2 x double> <double 0.0, double undef>
339  %r = shufflevector <2 x double> %s, <2 x double> %x, <3 x i32> <i32 0, i32 2, i32 3>
340  ret <3 x double> %r
341}
342
; cmp+select feeding extract(0) narrows to a scalar vcmpneq_oqss (the AVX
; "one" predicate) plus vblendvps; i686 also loads the 4th stack-passed arg.
343define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) nounwind {
344; X64-LABEL: select_fcmp_v4f32:
345; X64:       # %bb.0:
346; X64-NEXT:    vcmpneq_oqss %xmm1, %xmm0, %xmm0
347; X64-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
348; X64-NEXT:    retq
349;
350; X86-LABEL: select_fcmp_v4f32:
351; X86:       # %bb.0:
352; X86-NEXT:    pushl %ebp
353; X86-NEXT:    movl %esp, %ebp
354; X86-NEXT:    andl $-16, %esp
355; X86-NEXT:    subl $16, %esp
356; X86-NEXT:    vmovaps 8(%ebp), %xmm3
357; X86-NEXT:    vcmpneq_oqss %xmm1, %xmm0, %xmm0
358; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
359; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
360; X86-NEXT:    flds {{[0-9]+}}(%esp)
361; X86-NEXT:    movl %ebp, %esp
362; X86-NEXT:    popl %ebp
363; X86-NEXT:    retl
364  %c = fcmp one <4 x float> %x, %y
365  %s = select <4 x i1> %c, <4 x float> %z, <4 x float> %w
366  %r = extractelement <4 x float> %s, i32 0
367  ret float %r
368}
369
; ule is realized as vcmpnltsd with swapped operands, then vblendvpd selects
; between the scalarized z/w lanes; ymm inputs force vzeroupper.
370define double @select_fcmp_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, <4 x double> %w) nounwind {
371; X64-LABEL: select_fcmp_v4f64:
372; X64:       # %bb.0:
373; X64-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm0
374; X64-NEXT:    vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
375; X64-NEXT:    vzeroupper
376; X64-NEXT:    retq
377;
378; X86-LABEL: select_fcmp_v4f64:
379; X86:       # %bb.0:
380; X86-NEXT:    pushl %ebp
381; X86-NEXT:    movl %esp, %ebp
382; X86-NEXT:    andl $-32, %esp
383; X86-NEXT:    subl $32, %esp
384; X86-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm0
385; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
386; X86-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
387; X86-NEXT:    vmovlpd %xmm0, {{[0-9]+}}(%esp)
388; X86-NEXT:    fldl {{[0-9]+}}(%esp)
389; X86-NEXT:    movl %ebp, %esp
390; X86-NEXT:    popl %ebp
391; X86-NEXT:    vzeroupper
392; X86-NEXT:    retl
393  %c = fcmp ule <4 x double> %x, %y
394  %s = select <4 x i1> %c, <4 x double> %z, <4 x double> %w
395  %r = extractelement <4 x double> %s, i32 0
396  ret double %r
397}
398
; llvm.sqrt + extract(0) narrows to a single scalar vsqrtss.
399define float @fsqrt_v4f32(<4 x float> %x) nounwind {
400; X64-LABEL: fsqrt_v4f32:
401; X64:       # %bb.0:
402; X64-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
403; X64-NEXT:    retq
404;
405; X86-LABEL: fsqrt_v4f32:
406; X86:       # %bb.0:
407; X86-NEXT:    pushl %eax
408; X86-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
409; X86-NEXT:    vmovss %xmm0, (%esp)
410; X86-NEXT:    flds (%esp)
411; X86-NEXT:    popl %eax
412; X86-NEXT:    retl
413  %v = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
414  %r = extractelement <4 x float> %v, i32 0
415  ret float %r
416}
417
; v4f64 llvm.sqrt + extract(0) narrows to vsqrtsd; ymm input -> vzeroupper.
418define double @fsqrt_v4f64(<4 x double> %x) nounwind {
419; X64-LABEL: fsqrt_v4f64:
420; X64:       # %bb.0:
421; X64-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
422; X64-NEXT:    vzeroupper
423; X64-NEXT:    retq
424;
425; X86-LABEL: fsqrt_v4f64:
426; X86:       # %bb.0:
427; X86-NEXT:    pushl %ebp
428; X86-NEXT:    movl %esp, %ebp
429; X86-NEXT:    andl $-8, %esp
430; X86-NEXT:    subl $8, %esp
431; X86-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
432; X86-NEXT:    vmovsd %xmm0, (%esp)
433; X86-NEXT:    fldl (%esp)
434; X86-NEXT:    movl %ebp, %esp
435; X86-NEXT:    popl %ebp
436; X86-NEXT:    vzeroupper
437; X86-NEXT:    retl
438  %v = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x)
439  %r = extractelement <4 x double> %v, i32 0
440  ret double %r
441}
442
; llvm.sin has no instruction: the scalarized lane lowers to the sinf libcall
; (tail call on x86-64; stack-argument calll on i686).
443define float @fsin_v4f32(<4 x float> %x) nounwind {
444; X64-LABEL: fsin_v4f32:
445; X64:       # %bb.0:
446; X64-NEXT:    jmp sinf@PLT # TAILCALL
447;
448; X86-LABEL: fsin_v4f32:
449; X86:       # %bb.0:
450; X86-NEXT:    pushl %eax
451; X86-NEXT:    vmovss %xmm0, (%esp)
452; X86-NEXT:    calll sinf
453; X86-NEXT:    popl %eax
454; X86-NEXT:    retl
455  %v = call <4 x float> @llvm.sin.v4f32(<4 x float> %x)
456  %r = extractelement <4 x float> %v, i32 0
457  ret float %r
458}
459
; v4f64 llvm.sin lowers to the sin libcall; ymm arg is killed to xmm0 and
; vzeroupper is emitted before control leaves AVX code.
460define double @fsin_v4f64(<4 x double> %x) nounwind {
461; X64-LABEL: fsin_v4f64:
462; X64:       # %bb.0:
463; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
464; X64-NEXT:    vzeroupper
465; X64-NEXT:    jmp sin@PLT # TAILCALL
466;
467; X86-LABEL: fsin_v4f64:
468; X86:       # %bb.0:
469; X86-NEXT:    subl $8, %esp
470; X86-NEXT:    vmovlps %xmm0, (%esp)
471; X86-NEXT:    vzeroupper
472; X86-NEXT:    calll sin
473; X86-NEXT:    addl $8, %esp
474; X86-NEXT:    retl
475  %v = call <4 x double> @llvm.sin.v4f64(<4 x double> %x)
476  %r = extractelement <4 x double> %v, i32 0
477  ret double %r
478}
479
; llvm.fma + extract(0) narrows to a single scalar vfmadd213ss.
480define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind {
481; X64-LABEL: fma_v4f32:
482; X64:       # %bb.0:
483; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
484; X64-NEXT:    retq
485;
486; X86-LABEL: fma_v4f32:
487; X86:       # %bb.0:
488; X86-NEXT:    pushl %eax
489; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
490; X86-NEXT:    vmovss %xmm0, (%esp)
491; X86-NEXT:    flds (%esp)
492; X86-NEXT:    popl %eax
493; X86-NEXT:    retl
494  %v = call <4 x float> @llvm.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z)
495  %r = extractelement <4 x float> %v, i32 0
496  ret float %r
497}
498
; v4f64 llvm.fma narrows to scalar vfmadd213sd; note the i686 path computes
; into xmm1 (different multiplicand order, same product) before the fldl spill.
499define double @fma_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind {
500; X64-LABEL: fma_v4f64:
501; X64:       # %bb.0:
502; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
503; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
504; X64-NEXT:    vzeroupper
505; X64-NEXT:    retq
506;
507; X86-LABEL: fma_v4f64:
508; X86:       # %bb.0:
509; X86-NEXT:    pushl %ebp
510; X86-NEXT:    movl %esp, %ebp
511; X86-NEXT:    andl $-8, %esp
512; X86-NEXT:    subl $8, %esp
513; X86-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm2
514; X86-NEXT:    vmovsd %xmm1, (%esp)
515; X86-NEXT:    fldl (%esp)
516; X86-NEXT:    movl %ebp, %esp
517; X86-NEXT:    popl %ebp
518; X86-NEXT:    vzeroupper
519; X86-NEXT:    retl
520  %v = call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z)
521  %r = extractelement <4 x double> %v, i32 0
522  ret double %r
523}
524
; fabs is an and with the clear-sign-bit mask (0x7fffffff, which the asm
; printer renders as NaN), broadcast then applied with vandps.
525define float @fabs_v4f32(<4 x float> %x) nounwind {
526; X64-LABEL: fabs_v4f32:
527; X64:       # %bb.0:
528; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
529; X64-NEXT:    vandps %xmm1, %xmm0, %xmm0
530; X64-NEXT:    retq
531;
532; X86-LABEL: fabs_v4f32:
533; X86:       # %bb.0:
534; X86-NEXT:    pushl %eax
535; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
536; X86-NEXT:    vandps %xmm1, %xmm0, %xmm0
537; X86-NEXT:    vmovss %xmm0, (%esp)
538; X86-NEXT:    flds (%esp)
539; X86-NEXT:    popl %eax
540; X86-NEXT:    retl
541  %v = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
542  %r = extractelement <4 x float> %v, i32 0
543  ret float %r
544}
545
; f64 fabs uses the clear-sign mask from the constant pool (LCPI reference)
; rather than a broadcast; ymm input forces vzeroupper.
546define double @fabs_v4f64(<4 x double> %x) nounwind {
547; X64-LABEL: fabs_v4f64:
548; X64:       # %bb.0:
549; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
550; X64-NEXT:    vzeroupper
551; X64-NEXT:    retq
552;
553; X86-LABEL: fabs_v4f64:
554; X86:       # %bb.0:
555; X86-NEXT:    pushl %ebp
556; X86-NEXT:    movl %esp, %ebp
557; X86-NEXT:    andl $-8, %esp
558; X86-NEXT:    subl $8, %esp
559; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
560; X86-NEXT:    vmovlps %xmm0, (%esp)
561; X86-NEXT:    fldl (%esp)
562; X86-NEXT:    movl %ebp, %esp
563; X86-NEXT:    popl %ebp
564; X86-NEXT:    vzeroupper
565; X86-NEXT:    retl
566  %v = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x)
567  %r = extractelement <4 x double> %v, i32 0
568  ret double %r
569}
570
; maxnum semantics (return the non-NaN operand): vmaxss plus a
; vcmpunordss/vblendvps fixup selecting y when x is NaN.
571define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
572; X64-LABEL: fmaxnum_v4f32:
573; X64:       # %bb.0:
574; X64-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
575; X64-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
576; X64-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
577; X64-NEXT:    retq
578;
579; X86-LABEL: fmaxnum_v4f32:
580; X86:       # %bb.0:
581; X86-NEXT:    pushl %eax
582; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
583; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
584; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
585; X86-NEXT:    vmovss %xmm0, (%esp)
586; X86-NEXT:    flds (%esp)
587; X86-NEXT:    popl %eax
588; X86-NEXT:    retl
589  %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
590  %r = extractelement <4 x float> %v, i32 0
591  ret float %r
592}
593
; f64 maxnum: vmaxsd with the same unord-compare/blend NaN fixup;
; ymm inputs force vzeroupper.
594define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
595; X64-LABEL: fmaxnum_v4f64:
596; X64:       # %bb.0:
597; X64-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
598; X64-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
599; X64-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
600; X64-NEXT:    vzeroupper
601; X64-NEXT:    retq
602;
603; X86-LABEL: fmaxnum_v4f64:
604; X86:       # %bb.0:
605; X86-NEXT:    pushl %ebp
606; X86-NEXT:    movl %esp, %ebp
607; X86-NEXT:    andl $-8, %esp
608; X86-NEXT:    subl $8, %esp
609; X86-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
610; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
611; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
612; X86-NEXT:    vmovlpd %xmm0, (%esp)
613; X86-NEXT:    fldl (%esp)
614; X86-NEXT:    movl %ebp, %esp
615; X86-NEXT:    popl %ebp
616; X86-NEXT:    vzeroupper
617; X86-NEXT:    retl
618  %v = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)
619  %r = extractelement <4 x double> %v, i32 0
620  ret double %r
621}
622
; minnum mirrors maxnum: vminss plus the unord-compare/blend NaN fixup.
623define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
624; X64-LABEL: fminnum_v4f32:
625; X64:       # %bb.0:
626; X64-NEXT:    vminss %xmm0, %xmm1, %xmm2
627; X64-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
628; X64-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
629; X64-NEXT:    retq
630;
631; X86-LABEL: fminnum_v4f32:
632; X86:       # %bb.0:
633; X86-NEXT:    pushl %eax
634; X86-NEXT:    vminss %xmm0, %xmm1, %xmm2
635; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
636; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
637; X86-NEXT:    vmovss %xmm0, (%esp)
638; X86-NEXT:    flds (%esp)
639; X86-NEXT:    popl %eax
640; X86-NEXT:    retl
641  %v = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
642  %r = extractelement <4 x float> %v, i32 0
643  ret float %r
644}
645
; f64 minnum: vminsd + unord-compare/blend NaN fixup; ymm inputs -> vzeroupper.
646define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
647; X64-LABEL: fminnum_v4f64:
648; X64:       # %bb.0:
649; X64-NEXT:    vminsd %xmm0, %xmm1, %xmm2
650; X64-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
651; X64-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
652; X64-NEXT:    vzeroupper
653; X64-NEXT:    retq
654;
655; X86-LABEL: fminnum_v4f64:
656; X86:       # %bb.0:
657; X86-NEXT:    pushl %ebp
658; X86-NEXT:    movl %esp, %ebp
659; X86-NEXT:    andl $-8, %esp
660; X86-NEXT:    subl $8, %esp
661; X86-NEXT:    vminsd %xmm0, %xmm1, %xmm2
662; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
663; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
664; X86-NEXT:    vmovlpd %xmm0, (%esp)
665; X86-NEXT:    fldl (%esp)
666; X86-NEXT:    movl %ebp, %esp
667; X86-NEXT:    popl %ebp
668; X86-NEXT:    vzeroupper
669; X86-NEXT:    retl
670  %v = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)
671  %r = extractelement <4 x double> %v, i32 0
672  ret double %r
673}
674
; llvm.maximum (IEEE-754 2019): unlike maxnum it must order -0.0 < +0.0, so
; codegen tests the sign bit of lane 0 and branches to pick the operand order
; before the vmaxss + unord-compare/blend NaN handling.
675define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
676; X64-LABEL: fmaximum_v4f32:
677; X64:       # %bb.0:
678; X64-NEXT:    vmovd %xmm0, %eax
679; X64-NEXT:    testl %eax, %eax
680; X64-NEXT:    js .LBB30_1
681; X64-NEXT:  # %bb.2:
682; X64-NEXT:    vmovdqa %xmm0, %xmm2
683; X64-NEXT:    jmp .LBB30_3
684; X64-NEXT:  .LBB30_1:
685; X64-NEXT:    vmovdqa %xmm1, %xmm2
686; X64-NEXT:    vmovdqa %xmm0, %xmm1
687; X64-NEXT:  .LBB30_3:
688; X64-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
689; X64-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
690; X64-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
691; X64-NEXT:    retq
692;
693; X86-LABEL: fmaximum_v4f32:
694; X86:       # %bb.0:
695; X86-NEXT:    vmovd %xmm0, %eax
696; X86-NEXT:    testl %eax, %eax
697; X86-NEXT:    js .LBB30_1
698; X86-NEXT:  # %bb.2:
699; X86-NEXT:    vmovdqa %xmm0, %xmm2
700; X86-NEXT:    jmp .LBB30_3
701; X86-NEXT:  .LBB30_1:
702; X86-NEXT:    vmovdqa %xmm1, %xmm2
703; X86-NEXT:    vmovdqa %xmm0, %xmm1
704; X86-NEXT:  .LBB30_3:
705; X86-NEXT:    pushl %eax
706; X86-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
707; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
708; X86-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
709; X86-NEXT:    vmovss %xmm0, (%esp)
710; X86-NEXT:    flds (%esp)
711; X86-NEXT:    popl %eax
712; X86-NEXT:    retl
713  %v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
714  %r = extractelement <4 x float> %v, i32 0
715  ret float %r
716}
717
; f64 maximum: sign test of lane 0 via vmovq on x86-64, but via
; vextractps $1 (high 32 bits of the f64) on i686, then vmaxsd + NaN blend.
718define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
719; X64-LABEL: fmaximum_v4f64:
720; X64:       # %bb.0:
721; X64-NEXT:    vmovq %xmm0, %rax
722; X64-NEXT:    testq %rax, %rax
723; X64-NEXT:    js .LBB31_1
724; X64-NEXT:  # %bb.2:
725; X64-NEXT:    vmovdqa %xmm0, %xmm2
726; X64-NEXT:    jmp .LBB31_3
727; X64-NEXT:  .LBB31_1:
728; X64-NEXT:    vmovdqa %xmm1, %xmm2
729; X64-NEXT:    vmovdqa %xmm0, %xmm1
730; X64-NEXT:  .LBB31_3:
731; X64-NEXT:    vmaxsd %xmm2, %xmm1, %xmm0
732; X64-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
733; X64-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
734; X64-NEXT:    vzeroupper
735; X64-NEXT:    retq
736;
737; X86-LABEL: fmaximum_v4f64:
738; X86:       # %bb.0:
739; X86-NEXT:    vextractps $1, %xmm0, %eax
740; X86-NEXT:    testl %eax, %eax
741; X86-NEXT:    js .LBB31_1
742; X86-NEXT:  # %bb.2:
743; X86-NEXT:    vmovapd %xmm0, %xmm2
744; X86-NEXT:    jmp .LBB31_3
745; X86-NEXT:  .LBB31_1:
746; X86-NEXT:    vmovapd %xmm1, %xmm2
747; X86-NEXT:    vmovapd %xmm0, %xmm1
748; X86-NEXT:  .LBB31_3:
749; X86-NEXT:    pushl %ebp
750; X86-NEXT:    movl %esp, %ebp
751; X86-NEXT:    andl $-8, %esp
752; X86-NEXT:    subl $8, %esp
753; X86-NEXT:    vmaxsd %xmm2, %xmm1, %xmm0
754; X86-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
755; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
756; X86-NEXT:    vmovlpd %xmm0, (%esp)
757; X86-NEXT:    fldl (%esp)
758; X86-NEXT:    movl %ebp, %esp
759; X86-NEXT:    popl %ebp
760; X86-NEXT:    vzeroupper
761; X86-NEXT:    retl
762  %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %x, <4 x double> %y)
763  %r = extractelement <4 x double> %v, i32 0
764  ret double %r
765}
766
; llvm.minimum mirrors maximum with the operand-swap branch taken on the
; opposite sign (negative lane 0 selects x as the vminss second operand).
767define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
768; X64-LABEL: fminimum_v4f32:
769; X64:       # %bb.0:
770; X64-NEXT:    vmovd %xmm0, %eax
771; X64-NEXT:    testl %eax, %eax
772; X64-NEXT:    js .LBB32_1
773; X64-NEXT:  # %bb.2:
774; X64-NEXT:    vmovdqa %xmm1, %xmm2
775; X64-NEXT:    jmp .LBB32_3
776; X64-NEXT:  .LBB32_1:
777; X64-NEXT:    vmovdqa %xmm0, %xmm2
778; X64-NEXT:    vmovdqa %xmm1, %xmm0
779; X64-NEXT:  .LBB32_3:
780; X64-NEXT:    vminss %xmm2, %xmm0, %xmm1
781; X64-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
782; X64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
783; X64-NEXT:    retq
784;
785; X86-LABEL: fminimum_v4f32:
786; X86:       # %bb.0:
787; X86-NEXT:    vmovd %xmm0, %eax
788; X86-NEXT:    testl %eax, %eax
789; X86-NEXT:    js .LBB32_1
790; X86-NEXT:  # %bb.2:
791; X86-NEXT:    vmovdqa %xmm1, %xmm2
792; X86-NEXT:    jmp .LBB32_3
793; X86-NEXT:  .LBB32_1:
794; X86-NEXT:    vmovdqa %xmm0, %xmm2
795; X86-NEXT:    vmovdqa %xmm1, %xmm0
796; X86-NEXT:  .LBB32_3:
797; X86-NEXT:    pushl %eax
798; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
799; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
800; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
801; X86-NEXT:    vmovss %xmm0, (%esp)
802; X86-NEXT:    flds (%esp)
803; X86-NEXT:    popl %eax
804; X86-NEXT:    retl
805  %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
806  %r = extractelement <4 x float> %v, i32 0
807  ret float %r
808}
809
; f64 minimum: sign test via vmovq (x86-64) / vextractps $1 (i686), then
; vminsd plus the unord-compare/blend NaN fixup; ymm inputs -> vzeroupper.
810define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
811; X64-LABEL: fminimum_v4f64:
812; X64:       # %bb.0:
813; X64-NEXT:    vmovq %xmm0, %rax
814; X64-NEXT:    testq %rax, %rax
815; X64-NEXT:    js .LBB33_1
816; X64-NEXT:  # %bb.2:
817; X64-NEXT:    vmovdqa %xmm1, %xmm2
818; X64-NEXT:    jmp .LBB33_3
819; X64-NEXT:  .LBB33_1:
820; X64-NEXT:    vmovdqa %xmm0, %xmm2
821; X64-NEXT:    vmovdqa %xmm1, %xmm0
822; X64-NEXT:  .LBB33_3:
823; X64-NEXT:    vminsd %xmm2, %xmm0, %xmm1
824; X64-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
825; X64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
826; X64-NEXT:    vzeroupper
827; X64-NEXT:    retq
828;
829; X86-LABEL: fminimum_v4f64:
830; X86:       # %bb.0:
831; X86-NEXT:    vextractps $1, %xmm0, %eax
832; X86-NEXT:    testl %eax, %eax
833; X86-NEXT:    js .LBB33_1
834; X86-NEXT:  # %bb.2:
835; X86-NEXT:    vmovapd %xmm1, %xmm2
836; X86-NEXT:    jmp .LBB33_3
837; X86-NEXT:  .LBB33_1:
838; X86-NEXT:    vmovapd %xmm0, %xmm2
839; X86-NEXT:    vmovapd %xmm1, %xmm0
840; X86-NEXT:  .LBB33_3:
841; X86-NEXT:    pushl %ebp
842; X86-NEXT:    movl %esp, %ebp
843; X86-NEXT:    andl $-8, %esp
844; X86-NEXT:    subl $8, %esp
845; X86-NEXT:    vminsd %xmm2, %xmm0, %xmm1
846; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
847; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
848; X86-NEXT:    vmovlpd %xmm0, (%esp)
849; X86-NEXT:    fldl (%esp)
850; X86-NEXT:    movl %ebp, %esp
851; X86-NEXT:    popl %ebp
852; X86-NEXT:    vzeroupper
853; X86-NEXT:    retl
854  %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %x, <4 x double> %y)
855  %r = extractelement <4 x double> %v, i32 0
856  ret double %r
857}
858
; The ogt-compare + select max pattern folds directly to a single vmaxss
; (no NaN fixup needed, since the select already has maxps semantics).
859define float @maxps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
860; X64-LABEL: maxps_v4f32:
861; X64:       # %bb.0:
862; X64-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
863; X64-NEXT:    retq
864;
865; X86-LABEL: maxps_v4f32:
866; X86:       # %bb.0:
867; X86-NEXT:    pushl %eax
868; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
869; X86-NEXT:    vmovss %xmm0, (%esp)
870; X86-NEXT:    flds (%esp)
871; X86-NEXT:    popl %eax
872; X86-NEXT:    retl
873  %cmp = fcmp ogt <4 x float> %x, %y
874  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
875  %r = extractelement <4 x float> %v, i32 0
876  ret float %r
877}
878
; f64 ogt-compare + select folds to a single vmaxsd; ymm inputs -> vzeroupper.
879define double @maxpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
880; X64-LABEL: maxpd_v4f64:
881; X64:       # %bb.0:
882; X64-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
883; X64-NEXT:    vzeroupper
884; X64-NEXT:    retq
885;
886; X86-LABEL: maxpd_v4f64:
887; X86:       # %bb.0:
888; X86-NEXT:    pushl %ebp
889; X86-NEXT:    movl %esp, %ebp
890; X86-NEXT:    andl $-8, %esp
891; X86-NEXT:    subl $8, %esp
892; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
893; X86-NEXT:    vmovsd %xmm0, (%esp)
894; X86-NEXT:    fldl (%esp)
895; X86-NEXT:    movl %ebp, %esp
896; X86-NEXT:    popl %ebp
897; X86-NEXT:    vzeroupper
898; X86-NEXT:    retl
899  %cmp = fcmp ogt <4 x double> %x, %y
900  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
901  %r = extractelement <4 x double> %v, i32 0
902  ret double %r
903}
904
; The olt-compare + select min pattern folds directly to a single vminss.
905define float @minps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
906; X64-LABEL: minps_v4f32:
907; X64:       # %bb.0:
908; X64-NEXT:    vminss %xmm1, %xmm0, %xmm0
909; X64-NEXT:    retq
910;
911; X86-LABEL: minps_v4f32:
912; X86:       # %bb.0:
913; X86-NEXT:    pushl %eax
914; X86-NEXT:    vminss %xmm1, %xmm0, %xmm0
915; X86-NEXT:    vmovss %xmm0, (%esp)
916; X86-NEXT:    flds (%esp)
917; X86-NEXT:    popl %eax
918; X86-NEXT:    retl
919  %cmp = fcmp olt <4 x float> %x, %y
920  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
921  %r = extractelement <4 x float> %v, i32 0
922  ret float %r
923}
924
; f64 olt-compare + select folds to a single vminsd; ymm inputs -> vzeroupper.
925define double @minpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
926; X64-LABEL: minpd_v4f64:
927; X64:       # %bb.0:
928; X64-NEXT:    vminsd %xmm1, %xmm0, %xmm0
929; X64-NEXT:    vzeroupper
930; X64-NEXT:    retq
931;
932; X86-LABEL: minpd_v4f64:
933; X86:       # %bb.0:
934; X86-NEXT:    pushl %ebp
935; X86-NEXT:    movl %esp, %ebp
936; X86-NEXT:    andl $-8, %esp
937; X86-NEXT:    subl $8, %esp
938; X86-NEXT:    vminsd %xmm1, %xmm0, %xmm0
939; X86-NEXT:    vmovsd %xmm0, (%esp)
940; X86-NEXT:    fldl (%esp)
941; X86-NEXT:    movl %ebp, %esp
942; X86-NEXT:    popl %ebp
943; X86-NEXT:    vzeroupper
944; X86-NEXT:    retl
945  %cmp = fcmp olt <4 x double> %x, %y
946  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
947  %r = extractelement <4 x double> %v, i32 0
948  ret double %r
949}
950
; copysign = (y & sign-mask) | (x & magnitude-mask): two vandps with the
; broadcast -0.0 and 0x7fffffff ("NaN") masks, then vorps.
951define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
952; X64-LABEL: copysign_v4f32:
953; X64:       # %bb.0:
954; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
955; X64-NEXT:    vandps %xmm2, %xmm1, %xmm1
956; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
957; X64-NEXT:    vandps %xmm2, %xmm0, %xmm0
958; X64-NEXT:    vorps %xmm1, %xmm0, %xmm0
959; X64-NEXT:    retq
960;
961; X86-LABEL: copysign_v4f32:
962; X86:       # %bb.0:
963; X86-NEXT:    pushl %eax
964; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
965; X86-NEXT:    vandps %xmm2, %xmm1, %xmm1
966; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
967; X86-NEXT:    vandps %xmm2, %xmm0, %xmm0
968; X86-NEXT:    vorps %xmm1, %xmm0, %xmm0
969; X86-NEXT:    vmovss %xmm0, (%esp)
970; X86-NEXT:    flds (%esp)
971; X86-NEXT:    popl %eax
972; X86-NEXT:    retl
973  %v = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %y)
974  %r = extractelement <4 x float> %v, i32 0
975  ret float %r
976}
977
; Double-precision copysign: same and/and/or bit-mask expansion as the f32
; case, but the sign and magnitude masks are loaded from the constant pool
; (LCPI labels) instead of being broadcast immediates.
define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: copysign_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vorps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: copysign_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vorps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.copysign.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1006
; Vector floor of lane 0 should narrow to scalar vroundss; immediate 9 =
; round toward -inf with precision exceptions suppressed.
define float @floor_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: floor_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: floor_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1025
; Double-precision floor: narrows to scalar vroundsd $9 (round toward -inf),
; with vzeroupper because the source was a 256-bit vector.
define double @floor_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: floor_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: floor_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1050
; Vector ceil of lane 0 narrows to scalar vroundss; immediate 10 = round
; toward +inf with precision exceptions suppressed.
define float @ceil_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: ceil_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: ceil_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1069
; Double-precision ceil: narrows to scalar vroundsd $10 (round toward +inf).
define double @ceil_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: ceil_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: ceil_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1094
; Vector trunc of lane 0 narrows to scalar vroundss; immediate 11 = round
; toward zero with precision exceptions suppressed.
define float @trunc_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: trunc_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: trunc_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1113
; Double-precision trunc: narrows to scalar vroundsd $11 (round toward zero).
define double @trunc_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: trunc_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: trunc_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1138
; Vector rint of lane 0 narrows to scalar vroundss; immediate 4 = use the
; current MXCSR rounding mode (precision exceptions not suppressed, matching
; rint's ability to raise inexact).
define float @rint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rint_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rint_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.rint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1157
; Double-precision rint: narrows to scalar vroundsd $4 (current rounding mode).
define double @rint_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: rint_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rint_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.rint.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1182
; Vector nearbyint of lane 0 narrows to scalar vroundss; immediate 12 =
; current MXCSR rounding mode with precision exceptions suppressed (the
; difference from rint's $4).
define float @nearbyint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: nearbyint_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: nearbyint_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1201
; Double-precision nearbyint: narrows to scalar vroundsd $12 (current mode,
; exceptions suppressed).
define double @nearbyint_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: nearbyint_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: nearbyint_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1226
; llvm.round (ties away from zero) has no single x86 instruction, so after
; scalarization it expands to: add copysign(0.49999997, x) to x, then
; truncate with vroundss $11. The and/or pair builds the magic constant with
; x's sign bit.
define float @round_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: round_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT:    vandps %xmm1, %xmm0, %xmm1
; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; X64-NEXT:    vorps %xmm2, %xmm1, %xmm1
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT:    vandps %xmm1, %xmm0, %xmm1
; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; X86-NEXT:    vorps %xmm2, %xmm1, %xmm1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1255
; Double-precision llvm.round: same copysign(0.49999999999999994, x) + add +
; trunc (vroundsd $11) expansion as the f32 case; the sign mask comes from
; the constant pool and the 0.5-epsilon constant is a splatted vmovddup.
define double @round_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: round_v4f64:
; X64:       # %bb.0:
; X64-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X64-NEXT:    # xmm2 = mem[0,0]
; X64-NEXT:    vorpd %xmm2, %xmm1, %xmm1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: round_v4f64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; X86-NEXT:    # xmm2 = mem[0,0]
; X86-NEXT:    vorpd %xmm2, %xmm1, %xmm1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}
1290
; Target-specific vector reciprocal-estimate intrinsic: extracting lane 0 of
; llvm.x86.sse.rcp.ps should narrow to the scalar vrcpss form.
define float @rcp_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rcp_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rcp_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1309
; 256-bit reciprocal-estimate intrinsic: lane 0 extraction still narrows all
; the way down to scalar vrcpss (plus vzeroupper for the YMM source).
define float @rcp_v8f32(<8 x float> %x) nounwind {
; X64-LABEL: rcp_v8f32:
; X64:       # %bb.0:
; X64-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rcp_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %x)
  %r = extractelement <8 x float> %v, i32 0
  ret float %r
}
1330
; Reciprocal-square-root estimate: extracting lane 0 of llvm.x86.sse.rsqrt.ps
; should narrow to the scalar vrsqrtss form.
define float @rsqrt_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rsqrt_v4f32:
; X64:       # %bb.0:
; X64-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: rsqrt_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %v = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}
1349
; 256-bit rsqrt estimate: lane 0 extraction narrows to scalar vrsqrtss, with
; vzeroupper because the source was a YMM vector.
define float @rsqrt_v8f32(<8 x float> %x) nounwind {
; X64-LABEL: rsqrt_v8f32:
; X64:       # %bb.0:
; X64-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
;
; X86-LABEL: rsqrt_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
  %v = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %x)
  %r = extractelement <8 x float> %v, i32 0
  ret float %r
}
1370
; Declarations of the generic FP intrinsics exercised above.
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
declare <4 x float> @llvm.sin.v4f32(<4 x float>)
declare <4 x double> @llvm.sin.v4f64(<4 x double>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>)
declare <4 x float> @llvm.floor.v4f32(<4 x float>)
declare <4 x double> @llvm.floor.v4f64(<4 x double>)
declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
declare <4 x double> @llvm.ceil.v4f64(<4 x double>)
declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
declare <4 x double> @llvm.trunc.v4f64(<4 x double>)
declare <4 x float> @llvm.rint.v4f32(<4 x float>)
declare <4 x double> @llvm.rint.v4f64(<4 x double>)
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
declare <4 x double> @llvm.round.v4f64(<4 x double>)

; X86 target-specific estimate intrinsics (rcp/rsqrt).
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>)
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>)
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>)
1406