; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64

declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata)
declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata)
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata)
declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)

define half @fadd_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fadd_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fadd_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fadd_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vaddsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fadd_f16:
; X64:       # %bb.0:
; X64-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fadd.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define half @fsub_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fsub_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fsub_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fsub_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vsubsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fsub_f16:
; X64:       # %bb.0:
; X64-NEXT:    vsubsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fsub.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define half @fmul_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fmul_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    mulss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fmul_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fmul_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmulsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fmul_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmulsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fmul.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define half @fdiv_f16(half %a, half %b) nounwind strictfp {
; SSE2-LABEL: fdiv_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    divss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; AVX-LABEL: fdiv_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
; AVX-NEXT:    movzwl %cx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    movzwl %ax, %eax
; AVX-NEXT:    vmovd %eax, %xmm1
; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X86-LABEL: fdiv_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vdivsh {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fdiv_f16:
; X64:       # %bb.0:
; X64-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %ret = call half @llvm.experimental.constrained.fdiv.f16(half %a, half %b,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret half %ret
}

define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fpext_f16_to_f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fpext_f16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vmovss %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fpext_f16_to_f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fpext_f16_to_f32:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load half, ptr %val, align 4
  %res = call float @llvm.experimental.constrained.fpext.f32.f16(half %1,
                                                                 metadata !"fpexcept.strict") #0
  store float %res, ptr %ret, align 8
  ret void
}

define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fpext_f16_to_f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    movsd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fpext_f16_to_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fpext_f16_to_f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vcvtsh2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fpext_f16_to_f64:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vcvtsh2sd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsd %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load half, ptr %val, align 4
  %res = call double @llvm.experimental.constrained.fpext.f64.f16(half %1,
                                                                  metadata !"fpexcept.strict") #0
  store double %res, ptr %ret, align 8
  ret void
}

define void @fptrunc_float_to_f16(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fptrunc_float_to_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fptrunc_float_to_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; X86-LABEL: fptrunc_float_to_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fptrunc_float_to_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load float, ptr %val, align 8
  %res = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %1,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  store half %res, ptr %ret, align 4
  ret void
}

define void @fptrunc_double_to_f16(ptr %val, ptr %ret) nounwind strictfp {
; SSE2-LABEL: fptrunc_double_to_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fptrunc_double_to_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movq %rsi, %rbx
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; X86-LABEL: fptrunc_double_to_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fptrunc_double_to_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load double, ptr %val, align 8
  %res = call half @llvm.experimental.constrained.fptrunc.f16.f64(double %1,
                                                                  metadata !"round.dynamic",
                                                                  metadata !"fpexcept.strict") #0
  store half %res, ptr %ret, align 4
  ret void
}

define void @fsqrt_f16(ptr %a) nounwind strictfp {
; SSE2-LABEL: fsqrt_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdi, %rbx
; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    sqrtss %xmm0, %xmm0
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; AVX-LABEL: fsqrt_f16:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl (%rdi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; X86-LABEL: fsqrt_f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vsqrtsh %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: fsqrt_f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vsqrtsh %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovsh %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = load half, ptr %a, align 4
  %res = call half @llvm.experimental.constrained.sqrt.f16(half %1,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  store half %res, ptr %a, align 4
  ret void
}

define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; SSE2-LABEL: fma_f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $24, %rsp
; SSE2-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; SSE2-NEXT:    # xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq fmaf@PLT
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    addq $24, %rsp
; SSE2-NEXT:    retq
;
; F16C-LABEL: fma_f16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rax
; F16C-NEXT:    vpextrw $0, %xmm0, %eax
; F16C-NEXT:    vpextrw $0, %xmm1, %ecx
; F16C-NEXT:    vpextrw $0, %xmm2, %edx
; F16C-NEXT:    movzwl %dx, %edx
; F16C-NEXT:    vmovd %edx, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm2
; F16C-NEXT:    movzwl %cx, %ecx
; F16C-NEXT:    vmovd %ecx, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm1
; F16C-NEXT:    movzwl %ax, %eax
; F16C-NEXT:    vmovd %eax, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    callq fmaf@PLT
; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovd %xmm0, %eax
; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-NEXT:    popq %rax
; F16C-NEXT:    retq
;
; AVX512-LABEL: fma_f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpextrw $0, %xmm1, %eax
; AVX512-NEXT:    vpextrw $0, %xmm0, %ecx
; AVX512-NEXT:    vpextrw $0, %xmm2, %edx
; AVX512-NEXT:    movzwl %dx, %edx
; AVX512-NEXT:    vmovd %edx, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movzwl %cx, %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movzwl %ax, %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X86-LABEL: fma_f16:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vfmadd213sh {{[0-9]+}}(%esp), %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: fma_f16:
; X64:       # %bb.0:
; X64-NEXT:    vfmadd213sh %xmm2, %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call half @llvm.experimental.constrained.fma.f16(half %a, half %b, half %c,
                                                          metadata !"round.dynamic",
                                                          metadata !"fpexcept.strict") #0
  ret half %res
}

attributes #0 = { strictfp }