; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2  -O3 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c  -O3 | FileCheck %s --check-prefixes=AVX,F16C
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  -O3 | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64

declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata)
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata)
declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)

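; Strict (constrained) scalar fadd on half. Without native FP16 arithmetic, SSE2
; goes through the __extendhfsf2/__truncsfhf2 libcalls and F16C/AVX512F through
; vcvtph2ps/vcvtps2ph around an f32 addss; AVX512FP16 (X86/X64) selects vaddsh.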
define half @fadd_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fadd_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fadd_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fadd_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vaddsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fadd_f16:
; X64:       # %bb.0:
; X64-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fadd.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

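; Strict scalar fsub on half: same lowering pattern as fadd, with subss on the
; extended values for the non-FP16 targets and vsubsh for AVX512FP16.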
define half @fsub_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fsub_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fsub_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fsub_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vsubsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fsub_f16:
; X64:       # %bb.0:
; X64-NEXT:    vsubsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fsub.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

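; Strict scalar fmul on half: mulss on the extended values for the non-FP16
; targets, vmulsh for AVX512FP16.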
define half @fmul_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fmul_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    mulss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fmul_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fmul_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmulsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fmul_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmulsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fmul.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

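; Strict scalar fdiv on half: divss on the extended values for the non-FP16
; targets, vdivsh for AVX512FP16.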
define half @fdiv_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fdiv_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    divss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fdiv_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fdiv_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vdivsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fdiv_f16:
; X64:       # %bb.0:
; X64-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fdiv.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

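; Strict fpext of a half loaded from memory to float. SSE2 calls __extendhfsf2,
; F16C/AVX512F use vcvtph2ps, and AVX512FP16 converts with vcvtsh2ss.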
define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fpext_f16_to_f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fpext_f16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vmovss %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fpext_f16_to_f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fpext_f16_to_f32:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load half, ptr %val, align 4
  %res = call float @llvm.experimental.constrained.fpext.f32.f16(half %1,
                                                                 metadata !"fpexcept.strict") #0
  store float %res, ptr %ret, align 8
  ret void
}

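; Strict fpext of half to double. SSE2 extends to float via __extendhfsf2 and then
; cvtss2sd; F16C/AVX512F chain vcvtph2ps and vcvtss2sd; AVX512FP16 uses vcvtsh2sd.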
define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fpext_f16_to_f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    movsd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fpext_f16_to_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fpext_f16_to_f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vcvtsh2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fpext_f16_to_f64:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vcvtsh2sd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsd %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load half, ptr %val, align 4
  %res = call double @llvm.experimental.constrained.fpext.f64.f16(half %1,
                                                                  metadata !"fpexcept.strict") #0
  store double %res, ptr %ret, align 8
  ret void
}

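; Strict fptrunc of float to half. SSE2 calls __truncsfhf2, F16C/AVX512F use
; vcvtps2ph, and AVX512FP16 converts with vcvtss2sh.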
define void @fptrunc_float_to_f16(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fptrunc_float_to_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fptrunc_float_to_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fptrunc_float_to_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fptrunc_float_to_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load float, ptr %val, align 8
  %res = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %1,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  store half %res, ptr %ret, align 4
  ret void
}

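; Strict fptrunc of double to half. Neither SSE2 nor F16C/AVX512F has a direct
; instruction for this, so both call __truncdfhf2; AVX512FP16 uses vcvtsd2sh.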
define void @fptrunc_double_to_f16(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fptrunc_double_to_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fptrunc_double_to_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movq %rsi, %rbx
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; X86-LABEL: fptrunc_double_to_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fptrunc_double_to_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load double, ptr %val, align 8
  %res = call half @llvm.experimental.constrained.fptrunc.f16.f64(double %1,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  store half %res, ptr %ret, align 4
  ret void
}

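; Strict sqrt of a half loaded from and stored back to memory. The non-FP16
; targets compute sqrtss/vsqrtss on the extended value; AVX512FP16 selects vsqrtsh.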
define void @fsqrt_f16(ptr %a) nounwind strictfp {
; SSE2-LABEL: fsqrt_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    sqrtss %xmm0, %xmm0
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fsqrt_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; X86-LABEL: fsqrt_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vsqrtsh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fsqrt_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vsqrtsh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = load half, ptr %a, align 4
  %res = call half @llvm.experimental.constrained.sqrt.f16(half %1,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  store half %res, ptr %a, align 4
  ret void
}

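; Strict fma on half. SSE2 and F16C fall back to the fmaf libcall, AVX512F uses
; vfmadd213ss on the extended values, and AVX512FP16 selects vfmadd213sh.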
define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; SSE2-LABEL: fma_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $24, %rsp
; SSE2-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; SSE2-NEXT:    # xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq fmaf@PLT
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    addq $24, %rsp
; SSE2-NEXT:    retq
;
; F16C-LABEL: fma_f16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rax
; F16C-NEXT:    vpextrw $0, %xmm0, %eax
; F16C-NEXT:    vpextrw $0, %xmm1, %ecx
; F16C-NEXT:    vpextrw $0, %xmm2, %edx
; F16C-NEXT:    movzwl %dx, %edx
; F16C-NEXT:    vmovd %edx, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm2
; F16C-NEXT:    movzwl %cx, %ecx
; F16C-NEXT:    vmovd %ecx, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm1
; F16C-NEXT:    movzwl %ax, %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    callq fmaf@PLT
; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovd %xmm0, %eax
; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-NEXT:    popq %rax
; F16C-NEXT:    retq
;
; AVX512-LABEL: fma_f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpextrw $0, %xmm1, %eax
; AVX512-NEXT:    vpextrw $0, %xmm0, %ecx
; AVX512-NEXT:    vpextrw $0, %xmm2, %edx
; AVX512-NEXT:    movzwl %dx, %edx
; AVX512-NEXT:    vmovd %edx, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movzwl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movzwl %ax, %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X86-LABEL: fma_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vfmadd213sh {{[0-9]+}}(%esp), %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fma_f16:
; X64:       # %bb.0:
; X64-NEXT:    vfmadd213sh %xmm2, %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call half @llvm.experimental.constrained.fma.f16(half %a, half %b, half %c,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  ret half %res
}

attributes #0 = { strictfp }