1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 2; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE 3; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX1 4; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX2 5; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512,AVX512F 6; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512,AVX512BW 7 8define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind { 9; SSE-LABEL: v_test_canonicalize__half: 10; SSE: # %bb.0: # %entry 11; SSE-NEXT: pushq %rbx 12; SSE-NEXT: subq $16, %rsp 13; SSE-NEXT: movq %rdi, %rbx 14; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 15; SSE-NEXT: callq __extendhfsf2@PLT 16; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 17; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 18; SSE-NEXT: callq __extendhfsf2@PLT 19; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 20; SSE-NEXT: callq __truncsfhf2@PLT 21; SSE-NEXT: pextrw $0, %xmm0, %eax 22; SSE-NEXT: movw %ax, (%rbx) 23; SSE-NEXT: addq $16, %rsp 24; SSE-NEXT: popq %rbx 25; SSE-NEXT: retq 26; 27; AVX-LABEL: v_test_canonicalize__half: 28; AVX: # %bb.0: # %entry 29; AVX-NEXT: pushq %rbx 30; AVX-NEXT: subq $16, %rsp 31; AVX-NEXT: movq %rdi, %rbx 32; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 33; AVX-NEXT: callq __extendhfsf2@PLT 34; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 35; AVX-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 36; AVX-NEXT: callq __extendhfsf2@PLT 37; AVX-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 38; AVX-NEXT: callq __truncsfhf2@PLT 39; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) 40; AVX-NEXT: addq $16, %rsp 41; AVX-NEXT: popq %rbx 42; AVX-NEXT: retq 43; 44; AVX512-LABEL: v_test_canonicalize__half: 45; AVX512: # %bb.0: # %entry 46; AVX512-NEXT: movzwl (%rdi), %eax 47; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx 48; AVX512-NEXT: vmovd %ecx, %xmm0 49; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 50; AVX512-NEXT: vmovd %eax, %xmm1 51; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 52; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 53; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 54; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 55; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 56; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi) 57; AVX512-NEXT: retq 58entry: 59 %val = load half, half addrspace(1)* %out 60 %canonicalized = call half @llvm.canonicalize.f16(half %val) 61 store half %canonicalized, half addrspace(1)* %out 62 ret void 63} 64 65define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind { 66; SSE-LABEL: complex_canonicalize_fmul_half: 67; SSE: # %bb.0: # %entry 68; SSE-NEXT: pushq %rax 69; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 70; SSE-NEXT: callq __extendhfsf2@PLT 71; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill 72; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload 73; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero 74; SSE-NEXT: callq __extendhfsf2@PLT 75; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 76; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload 77; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero 78; SSE-NEXT: subss %xmm0, %xmm1 79; SSE-NEXT: movaps %xmm1, %xmm0 80; SSE-NEXT: callq __truncsfhf2@PLT 81; SSE-NEXT: callq __extendhfsf2@PLT 82; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill 83; SSE-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 84; SSE-NEXT: callq __truncsfhf2@PLT 85; SSE-NEXT: callq __extendhfsf2@PLT 86; SSE-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload 87; SSE-NEXT: callq __truncsfhf2@PLT 88; SSE-NEXT: callq __extendhfsf2@PLT 89; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill 90; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 91; SSE-NEXT: callq __extendhfsf2@PLT 92; SSE-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload 93; SSE-NEXT: callq __truncsfhf2@PLT 94; SSE-NEXT: callq __extendhfsf2@PLT 95; SSE-NEXT: subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 96; SSE-NEXT: callq __truncsfhf2@PLT 97; SSE-NEXT: popq %rax 98; SSE-NEXT: retq 99; 100; AVX-LABEL: complex_canonicalize_fmul_half: 101; AVX: # %bb.0: # %entry 102; AVX-NEXT: pushq %rax 103; AVX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 104; AVX-NEXT: callq __extendhfsf2@PLT 105; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill 106; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload 107; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero 108; AVX-NEXT: callq __extendhfsf2@PLT 109; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 110; AVX-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload 111; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero 112; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 113; AVX-NEXT: callq __truncsfhf2@PLT 114; AVX-NEXT: callq __extendhfsf2@PLT 115; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill 116; AVX-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 117; AVX-NEXT: callq __truncsfhf2@PLT 118; AVX-NEXT: callq __extendhfsf2@PLT 119; AVX-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload 120; AVX-NEXT: callq __truncsfhf2@PLT 121; AVX-NEXT: callq __extendhfsf2@PLT 122; AVX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill 123; AVX-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 124; AVX-NEXT: callq __extendhfsf2@PLT 125; AVX-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload 126; AVX-NEXT: callq __truncsfhf2@PLT 127; AVX-NEXT: callq __extendhfsf2@PLT 128; AVX-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 129; AVX-NEXT: callq __truncsfhf2@PLT 130; AVX-NEXT: popq %rax 131; AVX-NEXT: retq 132; 133; AVX512-LABEL: complex_canonicalize_fmul_half: 134; AVX512: # %bb.0: # %entry 135; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 136; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 137; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 138; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 139; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 140; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 141; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 142; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 143; AVX512-NEXT: vsubss %xmm0, %xmm2, %xmm0 144; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 145; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 146; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 147; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax 148; AVX512-NEXT: vmovd %eax, %xmm2 149; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 150; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 151; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 152; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] 153; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 154; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 155; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 156; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 157; AVX512-NEXT: vmovd %xmm0, %eax 158; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 159; AVX512-NEXT: retq 160entry: 161 162 %mul1 = fsub half %a, %b 163 %add = fadd half %mul1, %b 164 %mul2 = fsub half %add, %mul1 165 %canonicalized = call half @llvm.canonicalize.f16(half %mul2) 166 %result = fsub half %canonicalized, %b 167 ret half %result 168} 169 170define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind { 171; SSE-LABEL: v_test_canonicalize_v2half: 172; SSE: # %bb.0: # %entry 173; SSE-NEXT: pushq %rbx 174; SSE-NEXT: subq $48, %rsp 175; SSE-NEXT: movq %rdi, %rbx 176; SSE-NEXT: pinsrw $0, 2(%rdi), %xmm0 177; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 178; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 179; SSE-NEXT: callq __extendhfsf2@PLT 180; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 181; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 182; SSE-NEXT: callq __extendhfsf2@PLT 183; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 184; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload 185; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero 186; SSE-NEXT: mulss %xmm0, %xmm1 187; SSE-NEXT: movaps %xmm1, %xmm0 188; SSE-NEXT: callq __truncsfhf2@PLT 189; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 190; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 191; SSE-NEXT: callq __extendhfsf2@PLT 192; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 193; SSE-NEXT: callq __truncsfhf2@PLT 194; SSE-NEXT: pextrw $0, %xmm0, %eax 195; SSE-NEXT: movw %ax, 2(%rbx) 196; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 197; SSE-NEXT: pextrw $0, %xmm0, %eax 198; SSE-NEXT: movw %ax, (%rbx) 199; SSE-NEXT: addq $48, %rsp 200; SSE-NEXT: popq %rbx 201; SSE-NEXT: retq 202; 203; AVX-LABEL: v_test_canonicalize_v2half: 204; AVX: # %bb.0: # %entry 205; AVX-NEXT: pushq %rbx 206; AVX-NEXT: subq $48, %rsp 207; AVX-NEXT: movq %rdi, %rbx 208; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 209; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 210; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 211; AVX-NEXT: callq __extendhfsf2@PLT 212; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 213; AVX-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 214; AVX-NEXT: callq __extendhfsf2@PLT 215; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 216; AVX-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 217; AVX-NEXT: callq __truncsfhf2@PLT 218; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 219; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 220; AVX-NEXT: callq __extendhfsf2@PLT 221; AVX-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload 222; AVX-NEXT: callq __truncsfhf2@PLT 223; AVX-NEXT: vpextrw $0, %xmm0, 2(%rbx) 224; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 225; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) 226; AVX-NEXT: addq $48, %rsp 227; AVX-NEXT: popq %rbx 228; AVX-NEXT: retq 229; 230; AVX512-LABEL: v_test_canonicalize_v2half: 231; AVX512: # %bb.0: # %entry 232; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 233; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax 234; AVX512-NEXT: vmovd %eax, %xmm1 235; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 236; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] 237; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 238; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm2 239; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 240; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] 241; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 242; AVX512-NEXT: vmovd %xmm2, %eax 243; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 244; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 245; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 246; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 247; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] 248; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 249; AVX512-NEXT: vmovd %xmm0, %eax 250; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 251; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 252; AVX512-NEXT: vmovd %xmm0, (%rdi) 253; AVX512-NEXT: retq 254entry: 255 %val = load <2 x half>, <2 x half> addrspace(1)* %out 256 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) 257 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out 258 ret void 259} 260 261;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: 262; AVX1: {{.*}} 263; AVX2: {{.*}} 264; AVX512BW: {{.*}} 265; AVX512F: {{.*}} 266