; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512-FASTLANE
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512-FASTLANE
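
; The avx/avx2 runs share the AVX prefix (with AVX1/AVX2 where their output
; diverges), the three f16c runs share F16C, and the avx512f runs share
; AVX512 (with AVX512F/AVX512-FASTLANE where they diverge).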

;
; Half to Float
;

define float @cvt_i16_to_f32(i16 %a0) nounwind {
; AVX-LABEL: cvt_i16_to_f32:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0
; AVX-NEXT: jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: cvt_i16_to_f32:
; F16C: # %bb.0:
; F16C-NEXT: vmovd %edi, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_i16_to_f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %edi, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX-LABEL: cvt_4i16_to_4f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movq %rax, %rcx
; AVX-NEXT: movq %rax, %rdx
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: # kill: def $eax killed $eax killed $rax
; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: shrq $32, %rcx
; AVX-NEXT: shrq $48, %rdx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_4i16_to_4f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4i16_to_4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_4f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movq %rax, %rcx
; AVX-NEXT: movq %rax, %rdx
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: # kill: def $eax killed $eax killed $rax
; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: shrq $32, %rcx
; AVX-NEXT: shrq $48, %rdx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_8i16_to_4f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_8i16_to_4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}
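
; Note: without F16C, each half element is extended through a __extendhfsf2
; libcall, spilling and reloading the remaining elements around every call;
; with F16C or AVX512 the whole <4 x half> extend folds into one vcvtph2ps.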

define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_8f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[2,3,0,1]
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,1,3,3]
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_8i16_to_8f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_8i16_to_8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT: retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}
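
; Eight elements still need only a single vcvtph2ps on F16C/AVX512, widening
; the 8 x half in %xmm0 into 8 floats in %ymm0.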

define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $104, %rsp
; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[2,3,0,1]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[2,3,0,1]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: addq $104, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $104, %rsp
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[2,3,0,1]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[2,3,0,1]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: addq $104, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_16i16_to_16f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %ymm2
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %ymm1
; F16C-NEXT: vmovaps %ymm2, %ymm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_16i16_to_16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT: retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}
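
; For sixteen elements, F16C converts each 128-bit half of %ymm0 into its own
; ymm of floats, while AVX512F widens the whole vector with one ymm-to-zmm
; vcvtph2ps.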

define <2 x float> @cvt_2i16_to_2f32_constrained(<2 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_2i16_to_2f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_2i16_to_2f32_constrained:
; F16C: # %bb.0:
; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_2i16_to_2f32_constrained:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <2 x float> %2
}
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) strictfp

define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_4i16_to_4f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: movq %rax, %rcx
; AVX-NEXT: movq %rax, %rdx
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: # kill: def $eax killed $eax killed $rax
; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: shrq $32, %rcx
; AVX-NEXT: shrq $48, %rdx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_4i16_to_4f32_constrained:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4i16_to_4f32_constrained:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <4 x float> %2
}
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) strictfp
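
; The strictfp variants check the constrained fpext intrinsics; extending
; half to float is exact, so the same vcvtph2ps/libcall sequences as above
; are expected.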

define <8 x float> @cvt_8i16_to_8f32_constrained(<8 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_8i16_to_8f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[2,3,0,1]
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,1,3,3]
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_8i16_to_8f32_constrained:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_8i16_to_8f32_constrained:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT: retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <8 x float> %2
}
declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) strictfp

define <16 x float> @cvt_16i16_to_16f32_constrained(<16 x i16> %a0) nounwind strictfp {
; AVX1-LABEL: cvt_16i16_to_16f32_constrained:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $104, %rsp
; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[2,3,0,1]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[2,3,0,1]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: addq $104, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16i16_to_16f32_constrained:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $104, %rsp
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[2,3,0,1]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[2,3,0,1]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: addq $104, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_16i16_to_16f32_constrained:
; F16C: # %bb.0:
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %ymm1
; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_16i16_to_16f32_constrained:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT: retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <16 x float> %2
}
declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) strictfp

;
; Half to Float (Load)
;
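
; The load variants check that the f16 source is converted straight from
; memory where possible: F16C/AVX512 fold the load into vcvtph2ps (%rdi),
; while the AVX libcall path still inserts each element with vpinsrw first.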

define float @load_cvt_i16_to_f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_i16_to_f32:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: load_cvt_i16_to_f32:
; F16C: # %bb.0:
; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: load_cvt_i16_to_f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = load i16, ptr %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}

define <4 x float> @load_cvt_4i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_4i16_to_4f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT: vpinsrw $0, 6(%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: load_cvt_4i16_to_4f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps (%rdi), %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: load_cvt_4i16_to_4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT: retq
  %1 = load <4 x i16>, ptr %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

define <4 x float> @load_cvt_8i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_4f32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT: movq (%rdi), %rax
; AVX-NEXT: movq %rax, %rcx
; AVX-NEXT: movq %rax, %rdx
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: # kill: def $eax killed $eax killed $rax
; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: shrq $32, %rcx
; AVX-NEXT: shrq $48, %rdx
; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: load_cvt_8i16_to_4f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps (%rdi), %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: load_cvt_8i16_to_4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT: retq
  %1 = load <8 x i16>, ptr %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @load_cvt_8i16_to_8f32(ptr %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $48, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vbroadcastss 12(%rbx), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vbroadcastss 4(%rbx), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: addq $48, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $48, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: addq $48, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: retq
;
; F16C-LABEL: load_cvt_8i16_to_8f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps (%rdi), %ymm0
; F16C-NEXT: retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps (%rdi), %ymm0
; AVX512-NEXT: retq
  %1 = load <8 x i16>, ptr %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}
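
; For a 32-byte source, F16C is expected to need two 16-byte vcvtph2ps loads,
; while AVX512F converts all sixteen halves with a single load-folded
; vcvtph2ps into %zmm0.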

define <16 x float> @load_cvt_16i16_to_16f32(ptr %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $80, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa (%rbx), %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps 16(%rbx), %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vbroadcastss 12(%rbx), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vbroadcastss 4(%rbx), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vbroadcastss 24(%rbx), %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vbroadcastss 28(%rbx), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vbroadcastss 20(%rbx), %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: addq $80, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $80, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rbx), %xmm1
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 16(%rbx), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpinsrw $0, 24(%rbx), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpinsrw $0, 28(%rbx), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpinsrw $0, 20(%rbx), %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: addq $80, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: retq
;
; F16C-LABEL: load_cvt_16i16_to_16f32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps (%rdi), %ymm0
; F16C-NEXT: vcvtph2ps 16(%rdi), %ymm1
; F16C-NEXT: retq
;
; AVX512-LABEL: load_cvt_16i16_to_16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps (%rdi), %zmm0
; AVX512-NEXT: retq
  %1 = load <16 x i16>, ptr %a0
  %2 = bitcast <16 x i16> %1 to <16 x half>
  %3 = fpext <16 x half> %2 to <16 x float>
  ret <16 x float> %3
}

define <4 x float> @load_cvt_4i16_to_4f32_constrained(ptr %a0) nounwind strictfp {
; AVX-LABEL: load_cvt_4i16_to_4f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT: vpinsrw $0, 6(%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: load_cvt_4i16_to_4f32_constrained:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps (%rdi), %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: load_cvt_4i16_to_4f32_constrained:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT: retq
  %1 = load <4 x i16>, ptr %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %2, metadata !"fpexcept.strict") strictfp
  ret <4 x float> %3
}

define <4 x float> @load_cvt_8i16_to_4f32_constrained(ptr %a0) nounwind strictfp {
; AVX-LABEL: load_cvt_8i16_to_4f32_constrained:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT:
movq (%rdi), %rax 1305; AVX-NEXT: movq %rax, %rcx 1306; AVX-NEXT: movq %rax, %rdx 1307; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1308; AVX-NEXT: # kill: def $eax killed $eax killed $rax 1309; AVX-NEXT: shrl $16, %eax 1310; AVX-NEXT: shrq $32, %rcx 1311; AVX-NEXT: shrq $48, %rdx 1312; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1 1313; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1314; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 1315; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1316; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 1317; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill 1318; AVX-NEXT: callq __extendhfsf2@PLT 1319; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1320; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1321; AVX-NEXT: callq __extendhfsf2@PLT 1322; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1323; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] 1324; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1325; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1326; AVX-NEXT: callq __extendhfsf2@PLT 1327; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1328; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] 1329; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1330; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1331; AVX-NEXT: callq __extendhfsf2@PLT 1332; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1333; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1334; AVX-NEXT: addq $72, %rsp 1335; AVX-NEXT: retq 1336; 1337; F16C-LABEL: load_cvt_8i16_to_4f32_constrained: 1338; F16C: # %bb.0: 1339; F16C-NEXT: vcvtph2ps (%rdi), %xmm0 1340; F16C-NEXT: retq 1341; 1342; AVX512-LABEL: load_cvt_8i16_to_4f32_constrained: 1343; AVX512: # %bb.0: 1344; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0 1345; AVX512-NEXT: retq 1346 %1 = load <8 x i16>, ptr %a0 1347 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1348 %3 = bitcast <4 x i16> %2 to <4 x half> 1349 %4 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %3, metadata !"fpexcept.strict") strictfp 1350 ret <4 x float> %4 1351} 1352 1353; 1354; Half to Double 1355; 1356 1357define double @cvt_i16_to_f64(i16 %a0) nounwind { 1358; AVX-LABEL: cvt_i16_to_f64: 1359; AVX: # %bb.0: 1360; AVX-NEXT: pushq %rax 1361; AVX-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0 1362; AVX-NEXT: callq __extendhfsf2@PLT 1363; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1364; AVX-NEXT: popq %rax 1365; AVX-NEXT: retq 1366; 1367; F16C-LABEL: cvt_i16_to_f64: 1368; F16C: # %bb.0: 1369; F16C-NEXT: vmovd %edi, %xmm0 1370; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1371; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1372; F16C-NEXT: retq 1373; 1374; AVX512-LABEL: cvt_i16_to_f64: 1375; AVX512: # %bb.0: 1376; AVX512-NEXT: vmovd %edi, %xmm0 1377; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1378; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1379; AVX512-NEXT: retq 1380 %1 = bitcast i16 %a0 to half 1381 %2 = fpext half %1 to double 1382 ret double %2 1383} 1384 1385define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind { 1386; AVX-LABEL: cvt_2i16_to_2f64: 1387; AVX: # %bb.0: 1388; AVX-NEXT: subq $40, %rsp 1389; AVX-NEXT: vmovd %xmm0, %eax 1390; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1391; AVX-NEXT: shrl $16, %eax 1392; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 1393; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1394; AVX-NEXT: callq 
__extendhfsf2@PLT 1395; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1396; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1397; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1398; AVX-NEXT: callq __extendhfsf2@PLT 1399; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1400; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1401; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1402; AVX-NEXT: addq $40, %rsp 1403; AVX-NEXT: retq 1404; 1405; F16C-LABEL: cvt_2i16_to_2f64: 1406; F16C: # %bb.0: 1407; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1408; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1409; F16C-NEXT: retq 1410; 1411; AVX512-LABEL: cvt_2i16_to_2f64: 1412; AVX512: # %bb.0: 1413; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1414; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1415; AVX512-NEXT: retq 1416 %1 = bitcast <2 x i16> %a0 to <2 x half> 1417 %2 = fpext <2 x half> %1 to <2 x double> 1418 ret <2 x double> %2 1419} 1420 1421define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { 1422; AVX-LABEL: cvt_4i16_to_4f64: 1423; AVX: # %bb.0: 1424; AVX-NEXT: subq $72, %rsp 1425; AVX-NEXT: vmovq %xmm0, %rax 1426; AVX-NEXT: movq %rax, %rcx 1427; AVX-NEXT: movl %eax, %edx 1428; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1429; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1430; AVX-NEXT: shrq $48, %rax 1431; AVX-NEXT: shrq $32, %rcx 1432; AVX-NEXT: shrl $16, %edx 1433; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1434; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1435; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1436; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1437; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1438; AVX-NEXT: callq __extendhfsf2@PLT 1439; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1440; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1441; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1442; AVX-NEXT: callq __extendhfsf2@PLT 1443; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1444; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1445; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1446; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1447; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1448; AVX-NEXT: callq __extendhfsf2@PLT 1449; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1450; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1451; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1452; AVX-NEXT: callq __extendhfsf2@PLT 1453; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1454; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1455; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1456; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1457; AVX-NEXT: addq $72, %rsp 1458; AVX-NEXT: retq 1459; 1460; F16C-LABEL: cvt_4i16_to_4f64: 1461; F16C: # %bb.0: 1462; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1463; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1464; F16C-NEXT: retq 1465; 1466; AVX512-LABEL: cvt_4i16_to_4f64: 1467; AVX512: # %bb.0: 1468; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1469; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1470; AVX512-NEXT: retq 1471 %1 = bitcast <4 x i16> %a0 to <4 x half> 1472 %2 = fpext <4 x half> %1 to <4 x double> 1473 ret <4 x double> %2 1474} 1475 1476define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind { 1477; AVX-LABEL: cvt_8i16_to_2f64: 1478; AVX: # %bb.0: 1479; AVX-NEXT: subq $40, %rsp 1480; AVX-NEXT: vmovd %xmm0, %eax 1481; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1482; AVX-NEXT: shrl $16, %eax 1483; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 
1484; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1485; AVX-NEXT: callq __extendhfsf2@PLT 1486; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1487; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1488; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1489; AVX-NEXT: callq __extendhfsf2@PLT 1490; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1491; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1492; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1493; AVX-NEXT: addq $40, %rsp 1494; AVX-NEXT: retq 1495; 1496; F16C-LABEL: cvt_8i16_to_2f64: 1497; F16C: # %bb.0: 1498; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1499; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1500; F16C-NEXT: retq 1501; 1502; AVX512-LABEL: cvt_8i16_to_2f64: 1503; AVX512: # %bb.0: 1504; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1505; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1506; AVX512-NEXT: retq 1507 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 1508 %2 = bitcast <2 x i16> %1 to <2 x half> 1509 %3 = fpext <2 x half> %2 to <2 x double> 1510 ret <2 x double> %3 1511} 1512 1513define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { 1514; AVX-LABEL: cvt_8i16_to_4f64: 1515; AVX: # %bb.0: 1516; AVX-NEXT: subq $72, %rsp 1517; AVX-NEXT: vmovq %xmm0, %rax 1518; AVX-NEXT: movq %rax, %rcx 1519; AVX-NEXT: movl %eax, %edx 1520; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1521; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1522; AVX-NEXT: shrq $48, %rax 1523; AVX-NEXT: shrq $32, %rcx 1524; AVX-NEXT: shrl $16, %edx 1525; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1526; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1527; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1528; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1529; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1530; AVX-NEXT: callq __extendhfsf2@PLT 1531; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1532; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1533; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1534; AVX-NEXT: callq __extendhfsf2@PLT 1535; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1536; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1537; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1538; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1539; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1540; AVX-NEXT: callq __extendhfsf2@PLT 1541; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1542; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1543; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1544; AVX-NEXT: callq __extendhfsf2@PLT 1545; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1546; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1547; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1548; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1549; AVX-NEXT: addq $72, %rsp 1550; AVX-NEXT: retq 1551; 1552; F16C-LABEL: cvt_8i16_to_4f64: 1553; F16C: # %bb.0: 1554; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1555; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1556; F16C-NEXT: retq 1557; 1558; AVX512-LABEL: cvt_8i16_to_4f64: 1559; AVX512: # %bb.0: 1560; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1561; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1562; AVX512-NEXT: retq 1563 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1564 %2 = bitcast <4 x i16> %1 to <4 x half> 1565 %3 = fpext <4 x half> %2 to <4 x double> 1566 ret <4 x double> %3 1567} 1568 1569define <8 x double> @cvt_8i16_to_8f64(<8 x 
i16> %a0) nounwind { 1570; AVX-LABEL: cvt_8i16_to_8f64: 1571; AVX: # %bb.0: 1572; AVX-NEXT: subq $88, %rsp 1573; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1574; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 1575; AVX-NEXT: callq __extendhfsf2@PLT 1576; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1577; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1578; AVX-NEXT: vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload 1579; AVX-NEXT: # xmm0 = mem[1,1,3,3] 1580; AVX-NEXT: callq __extendhfsf2@PLT 1581; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1582; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1583; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1584; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1585; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1586; AVX-NEXT: callq __extendhfsf2@PLT 1587; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1588; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1589; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1590; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 1591; AVX-NEXT: callq __extendhfsf2@PLT 1592; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1593; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1594; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1595; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 1596; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1597; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1598; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1599; AVX-NEXT: vzeroupper 1600; AVX-NEXT: callq __extendhfsf2@PLT 1601; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1602; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1603; AVX-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload 1604; AVX-NEXT: # xmm0 = mem[3,3,3,3] 1605; AVX-NEXT: callq __extendhfsf2@PLT 1606; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1607; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1608; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1609; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1610; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1611; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1612; AVX-NEXT: callq __extendhfsf2@PLT 1613; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1614; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1615; AVX-NEXT: vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload 1616; AVX-NEXT: # xmm0 = mem[2,3,0,1] 1617; AVX-NEXT: callq __extendhfsf2@PLT 1618; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1619; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1620; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1621; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 1622; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1623; AVX-NEXT: addq $88, %rsp 1624; AVX-NEXT: retq 1625; 1626; F16C-LABEL: cvt_8i16_to_8f64: 1627; F16C: # %bb.0: 1628; F16C-NEXT: vcvtph2ps %xmm0, %ymm1 1629; F16C-NEXT: vcvtps2pd %xmm1, %ymm0 1630; F16C-NEXT: vextractf128 $1, %ymm1, %xmm1 1631; F16C-NEXT: vcvtps2pd %xmm1, %ymm1 1632; F16C-NEXT: retq 1633; 1634; AVX512-LABEL: cvt_8i16_to_8f64: 1635; AVX512: # %bb.0: 1636; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 1637; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 1638; AVX512-NEXT: retq 1639 %1 = bitcast <8 x i16> %a0 to <8 x half> 1640 %2 
= fpext <8 x half> %1 to <8 x double> 1641 ret <8 x double> %2 1642} 1643 1644define <2 x double> @cvt_2i16_to_2f64_constrained(<2 x i16> %a0) nounwind strictfp { 1645; AVX-LABEL: cvt_2i16_to_2f64_constrained: 1646; AVX: # %bb.0: 1647; AVX-NEXT: subq $40, %rsp 1648; AVX-NEXT: vmovd %xmm0, %eax 1649; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1650; AVX-NEXT: shrl $16, %eax 1651; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 1652; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1653; AVX-NEXT: callq __extendhfsf2@PLT 1654; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1655; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1656; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1657; AVX-NEXT: callq __extendhfsf2@PLT 1658; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1659; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 1660; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1661; AVX-NEXT: addq $40, %rsp 1662; AVX-NEXT: retq 1663; 1664; F16C-LABEL: cvt_2i16_to_2f64_constrained: 1665; F16C: # %bb.0: 1666; F16C-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1667; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1668; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1669; F16C-NEXT: retq 1670; 1671; AVX512-LABEL: cvt_2i16_to_2f64_constrained: 1672; AVX512: # %bb.0: 1673; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1674; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1675; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1676; AVX512-NEXT: retq 1677 %1 = bitcast <2 x i16> %a0 to <2 x half> 1678 %2 = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp 1679 ret <2 x double> %2 1680} 1681declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, metadata) strictfp 1682 1683define <4 x double> @cvt_4i16_to_4f64_constrained(<4 x i16> %a0) nounwind strictfp { 1684; AVX-LABEL: cvt_4i16_to_4f64_constrained: 1685; AVX: # %bb.0: 1686; AVX-NEXT: subq $72, %rsp 1687; AVX-NEXT: vmovq %xmm0, %rax 1688; AVX-NEXT: movq %rax, %rcx 1689; AVX-NEXT: movl %eax, %edx 1690; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1691; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1692; AVX-NEXT: shrq $48, %rax 1693; AVX-NEXT: shrq $32, %rcx 1694; AVX-NEXT: shrl $16, %edx 1695; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1696; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1697; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1698; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1699; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1700; AVX-NEXT: callq __extendhfsf2@PLT 1701; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1702; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1703; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1704; AVX-NEXT: callq __extendhfsf2@PLT 1705; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1706; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1707; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1708; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1709; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1710; AVX-NEXT: callq __extendhfsf2@PLT 1711; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1712; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1713; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1714; AVX-NEXT: callq __extendhfsf2@PLT 1715; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1716; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1717; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1718; 
AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1719; AVX-NEXT: addq $72, %rsp 1720; AVX-NEXT: retq 1721; 1722; F16C-LABEL: cvt_4i16_to_4f64_constrained: 1723; F16C: # %bb.0: 1724; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1725; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1726; F16C-NEXT: retq 1727; 1728; AVX512-LABEL: cvt_4i16_to_4f64_constrained: 1729; AVX512: # %bb.0: 1730; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1731; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1732; AVX512-NEXT: retq 1733 %1 = bitcast <4 x i16> %a0 to <4 x half> 1734 %2 = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp 1735 ret <4 x double> %2 1736} 1737declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) strictfp 1738 1739define <8 x double> @cvt_8i16_to_8f64_constrained(<8 x i16> %a0) nounwind strictfp { 1740; AVX-LABEL: cvt_8i16_to_8f64_constrained: 1741; AVX: # %bb.0: 1742; AVX-NEXT: subq $88, %rsp 1743; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1744; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 1745; AVX-NEXT: callq __extendhfsf2@PLT 1746; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1747; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1748; AVX-NEXT: vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload 1749; AVX-NEXT: # xmm0 = mem[1,1,3,3] 1750; AVX-NEXT: callq __extendhfsf2@PLT 1751; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1752; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1753; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1754; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1755; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1756; AVX-NEXT: callq __extendhfsf2@PLT 1757; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1758; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1759; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1760; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 1761; AVX-NEXT: callq __extendhfsf2@PLT 1762; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1763; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1764; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1765; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 1766; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1767; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1768; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1769; AVX-NEXT: vzeroupper 1770; AVX-NEXT: callq __extendhfsf2@PLT 1771; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1772; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1773; AVX-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload 1774; AVX-NEXT: # xmm0 = mem[3,3,3,3] 1775; AVX-NEXT: callq __extendhfsf2@PLT 1776; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1777; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1778; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1779; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1780; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1781; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1782; AVX-NEXT: callq __extendhfsf2@PLT 1783; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1784; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1785; AVX-NEXT: vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload 1786; AVX-NEXT: # xmm0 = mem[2,3,0,1] 1787; AVX-NEXT: callq 
__extendhfsf2@PLT 1788; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1789; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1790; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1791; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 1792; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1793; AVX-NEXT: addq $88, %rsp 1794; AVX-NEXT: retq 1795; 1796; F16C-LABEL: cvt_8i16_to_8f64_constrained: 1797; F16C: # %bb.0: 1798; F16C-NEXT: vcvtph2ps %xmm0, %ymm0 1799; F16C-NEXT: vextractf128 $1, %ymm0, %xmm1 1800; F16C-NEXT: vcvtps2pd %xmm1, %ymm1 1801; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1802; F16C-NEXT: retq 1803; 1804; AVX512-LABEL: cvt_8i16_to_8f64_constrained: 1805; AVX512: # %bb.0: 1806; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 1807; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 1808; AVX512-NEXT: retq 1809 %1 = bitcast <8 x i16> %a0 to <8 x half> 1810 %2 = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp 1811 ret <8 x double> %2 1812} 1813declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata) strictfp 1814 1815; 1816; Half to Double (Load) 1817; 1818 1819define double @load_cvt_i16_to_f64(ptr %a0) nounwind { 1820; AVX-LABEL: load_cvt_i16_to_f64: 1821; AVX: # %bb.0: 1822; AVX-NEXT: pushq %rax 1823; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1824; AVX-NEXT: callq __extendhfsf2@PLT 1825; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1826; AVX-NEXT: popq %rax 1827; AVX-NEXT: retq 1828; 1829; F16C-LABEL: load_cvt_i16_to_f64: 1830; F16C: # %bb.0: 1831; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1832; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1833; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1834; F16C-NEXT: retq 1835; 1836; AVX512-LABEL: load_cvt_i16_to_f64: 1837; AVX512: # %bb.0: 1838; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1839; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1840; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1841; AVX512-NEXT: retq 1842 %1 = load i16, ptr %a0 1843 %2 = bitcast i16 %1 to half 1844 %3 = fpext half %2 to double 1845 ret double %3 1846} 1847 1848define <2 x double> @load_cvt_2i16_to_2f64(ptr %a0) nounwind { 1849; AVX-LABEL: load_cvt_2i16_to_2f64: 1850; AVX: # %bb.0: 1851; AVX-NEXT: subq $40, %rsp 1852; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1853; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1854; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 1855; AVX-NEXT: callq __extendhfsf2@PLT 1856; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1857; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1858; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1859; AVX-NEXT: callq __extendhfsf2@PLT 1860; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1861; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 1862; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1863; AVX-NEXT: addq $40, %rsp 1864; AVX-NEXT: retq 1865; 1866; F16C-LABEL: load_cvt_2i16_to_2f64: 1867; F16C: # %bb.0: 1868; F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1869; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 1870; F16C-NEXT: vcvtps2pd %xmm0, %xmm0 1871; F16C-NEXT: retq 1872; 1873; AVX512-LABEL: load_cvt_2i16_to_2f64: 1874; AVX512: # %bb.0: 1875; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1876; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1877; AVX512-NEXT: vcvtps2pd %xmm0, %xmm0 1878; AVX512-NEXT: retq 1879 %1 = load <2 x i16>, ptr %a0 1880 %2 = bitcast <2 x i16> %1 to <2 x half> 1881 %3 = fpext <2 x half> %2 to <2 x double> 1882 ret <2 x double> 
%3 1883} 1884 1885define <4 x double> @load_cvt_4i16_to_4f64(ptr %a0) nounwind { 1886; AVX-LABEL: load_cvt_4i16_to_4f64: 1887; AVX: # %bb.0: 1888; AVX-NEXT: subq $72, %rsp 1889; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 1890; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1891; AVX-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 1892; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1893; AVX-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 1894; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1895; AVX-NEXT: vpinsrw $0, 6(%rdi), %xmm0, %xmm0 1896; AVX-NEXT: callq __extendhfsf2@PLT 1897; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1898; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1899; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1900; AVX-NEXT: callq __extendhfsf2@PLT 1901; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1902; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1903; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1904; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1905; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1906; AVX-NEXT: callq __extendhfsf2@PLT 1907; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1908; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1909; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1910; AVX-NEXT: callq __extendhfsf2@PLT 1911; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1912; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1913; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1914; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1915; AVX-NEXT: addq $72, %rsp 1916; AVX-NEXT: retq 1917; 1918; F16C-LABEL: load_cvt_4i16_to_4f64: 1919; F16C: # %bb.0: 1920; F16C-NEXT: vcvtph2ps (%rdi), %xmm0 1921; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1922; F16C-NEXT: retq 1923; 1924; AVX512-LABEL: load_cvt_4i16_to_4f64: 1925; AVX512: # %bb.0: 1926; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0 1927; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1928; AVX512-NEXT: retq 1929 %1 = load <4 x i16>, ptr %a0 1930 %2 = bitcast <4 x i16> %1 to <4 x half> 1931 %3 = fpext <4 x half> %2 to <4 x double> 1932 ret <4 x double> %3 1933} 1934 1935define <4 x double> @load_cvt_8i16_to_4f64(ptr %a0) nounwind { 1936; AVX-LABEL: load_cvt_8i16_to_4f64: 1937; AVX: # %bb.0: 1938; AVX-NEXT: subq $72, %rsp 1939; AVX-NEXT: movq (%rdi), %rax 1940; AVX-NEXT: movq %rax, %rcx 1941; AVX-NEXT: movl %eax, %edx 1942; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1943; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1944; AVX-NEXT: shrq $48, %rax 1945; AVX-NEXT: shrq $32, %rcx 1946; AVX-NEXT: shrl $16, %edx 1947; AVX-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 1948; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1949; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 1950; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 1951; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1952; AVX-NEXT: callq __extendhfsf2@PLT 1953; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1954; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1955; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1956; AVX-NEXT: callq __extendhfsf2@PLT 1957; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1958; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1959; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 1960; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1961; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1962; AVX-NEXT: callq __extendhfsf2@PLT 1963; 
AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1964; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1965; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1966; AVX-NEXT: callq __extendhfsf2@PLT 1967; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1968; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1969; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1970; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload 1971; AVX-NEXT: addq $72, %rsp 1972; AVX-NEXT: retq 1973; 1974; F16C-LABEL: load_cvt_8i16_to_4f64: 1975; F16C: # %bb.0: 1976; F16C-NEXT: vcvtph2ps (%rdi), %xmm0 1977; F16C-NEXT: vcvtps2pd %xmm0, %ymm0 1978; F16C-NEXT: retq 1979; 1980; AVX512-LABEL: load_cvt_8i16_to_4f64: 1981; AVX512: # %bb.0: 1982; AVX512-NEXT: vcvtph2ps (%rdi), %xmm0 1983; AVX512-NEXT: vcvtps2pd %xmm0, %ymm0 1984; AVX512-NEXT: retq 1985 %1 = load <8 x i16>, ptr %a0 1986 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1987 %3 = bitcast <4 x i16> %2 to <4 x half> 1988 %4 = fpext <4 x half> %3 to <4 x double> 1989 ret <4 x double> %4 1990} 1991 1992define <8 x double> @load_cvt_8i16_to_8f64(ptr %a0) nounwind { 1993; AVX1-LABEL: load_cvt_8i16_to_8f64: 1994; AVX1: # %bb.0: 1995; AVX1-NEXT: pushq %rbx 1996; AVX1-NEXT: subq $80, %rsp 1997; AVX1-NEXT: movq %rdi, %rbx 1998; AVX1-NEXT: vmovaps (%rdi), %xmm0 1999; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2000; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm0 2001; AVX1-NEXT: callq __extendhfsf2@PLT 2002; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2003; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2004; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2005; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 2006; AVX1-NEXT: callq __extendhfsf2@PLT 2007; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2008; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2009; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2010; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2011; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2012; AVX1-NEXT: callq __extendhfsf2@PLT 2013; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2014; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2015; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2016; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 2017; AVX1-NEXT: callq __extendhfsf2@PLT 2018; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2019; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 2020; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2021; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 2022; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2023; AVX1-NEXT: vbroadcastss 12(%rbx), %xmm0 2024; AVX1-NEXT: vzeroupper 2025; AVX1-NEXT: callq __extendhfsf2@PLT 2026; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2027; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2028; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2029; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2030; AVX1-NEXT: callq __extendhfsf2@PLT 2031; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2032; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 2033; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2034; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2035; AVX1-NEXT: vbroadcastss 8(%rbx), %xmm0 2036; AVX1-NEXT: callq __extendhfsf2@PLT 2037; 
AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2038; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2039; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2040; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2041; AVX1-NEXT: callq __extendhfsf2@PLT 2042; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2043; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2044; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2045; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload 2046; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2047; AVX1-NEXT: addq $80, %rsp 2048; AVX1-NEXT: popq %rbx 2049; AVX1-NEXT: retq 2050; 2051; AVX2-LABEL: load_cvt_8i16_to_8f64: 2052; AVX2: # %bb.0: 2053; AVX2-NEXT: pushq %rbx 2054; AVX2-NEXT: subq $80, %rsp 2055; AVX2-NEXT: movq %rdi, %rbx 2056; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2057; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2058; AVX2-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0 2059; AVX2-NEXT: callq __extendhfsf2@PLT 2060; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2061; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2062; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2063; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0 2064; AVX2-NEXT: callq __extendhfsf2@PLT 2065; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2066; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2067; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2068; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2069; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2070; AVX2-NEXT: callq __extendhfsf2@PLT 2071; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2072; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2073; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2074; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 2075; AVX2-NEXT: callq __extendhfsf2@PLT 2076; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2077; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 2078; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2079; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 2080; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2081; AVX2-NEXT: vpinsrw $0, 12(%rbx), %xmm0, %xmm0 2082; AVX2-NEXT: vzeroupper 2083; AVX2-NEXT: callq __extendhfsf2@PLT 2084; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2085; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2086; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2087; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2088; AVX2-NEXT: callq __extendhfsf2@PLT 2089; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2090; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 2091; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2092; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2093; AVX2-NEXT: vpinsrw $0, 8(%rbx), %xmm0, %xmm0 2094; AVX2-NEXT: callq __extendhfsf2@PLT 2095; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2096; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2097; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2098; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2099; AVX2-NEXT: callq __extendhfsf2@PLT 2100; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2101; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2102; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2103; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload 2104; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2105; AVX2-NEXT: addq $80, %rsp 2106; AVX2-NEXT: popq %rbx 2107; AVX2-NEXT: retq 2108; 2109; F16C-LABEL: load_cvt_8i16_to_8f64: 2110; F16C: # %bb.0: 2111; F16C-NEXT: vcvtph2ps (%rdi), %ymm1 2112; F16C-NEXT: vcvtps2pd %xmm1, %ymm0 2113; F16C-NEXT: vextractf128 $1, %ymm1, %xmm1 2114; F16C-NEXT: vcvtps2pd %xmm1, %ymm1 2115; F16C-NEXT: retq 2116; 2117; AVX512-LABEL: load_cvt_8i16_to_8f64: 2118; AVX512: # %bb.0: 2119; AVX512-NEXT: vcvtph2ps (%rdi), %ymm0 2120; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 2121; AVX512-NEXT: retq 2122 %1 = load <8 x i16>, ptr %a0 2123 %2 = bitcast <8 x i16> %1 to <8 x half> 2124 %3 = fpext <8 x half> %2 to <8 x double> 2125 ret <8 x double> %3 2126} 2127 2128; 2129; Float to Half 2130; 2131 2132define i16 @cvt_f32_to_i16(float %a0) nounwind { 2133; AVX-LABEL: cvt_f32_to_i16: 2134; AVX: # %bb.0: 2135; AVX-NEXT: pushq %rax 2136; AVX-NEXT: callq __truncsfhf2@PLT 2137; AVX-NEXT: vpextrw $0, %xmm0, %eax 2138; AVX-NEXT: # kill: def $ax killed $ax killed $eax 2139; AVX-NEXT: popq %rcx 2140; AVX-NEXT: retq 2141; 2142; F16C-LABEL: cvt_f32_to_i16: 2143; F16C: # %bb.0: 2144; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2145; F16C-NEXT: vmovd %xmm0, %eax 2146; F16C-NEXT: # kill: def $ax killed $ax killed $eax 2147; F16C-NEXT: retq 2148; 2149; AVX512-LABEL: cvt_f32_to_i16: 2150; AVX512: # %bb.0: 2151; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2152; AVX512-NEXT: vmovd %xmm0, %eax 2153; AVX512-NEXT: # kill: def $ax killed $ax killed $eax 2154; AVX512-NEXT: retq 2155 %1 = fptrunc float %a0 to half 2156 %2 = bitcast half %1 to i16 2157 ret i16 %2 2158} 2159 2160define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind { 2161; AVX-LABEL: cvt_4f32_to_4i16: 2162; AVX: # %bb.0: 2163; AVX-NEXT: subq $72, %rsp 2164; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2165; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 2166; AVX-NEXT: callq __truncsfhf2@PLT 2167; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2168; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2169; AVX-NEXT: callq __truncsfhf2@PLT 2170; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2171; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 2172; AVX-NEXT: # xmm0 = mem[1,0] 2173; AVX-NEXT: callq __truncsfhf2@PLT 2174; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2175; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload 2176; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2177; AVX-NEXT: callq __truncsfhf2@PLT 2178; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2179; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2180; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2181; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2182; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2183; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2184; AVX-NEXT: addq $72, %rsp 2185; AVX-NEXT: retq 2186; 2187; F16C-LABEL: cvt_4f32_to_4i16: 2188; F16C: # %bb.0: 2189; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2190; F16C-NEXT: retq 2191; 2192; AVX512-LABEL: cvt_4f32_to_4i16: 2193; AVX512: # %bb.0: 2194; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2195; AVX512-NEXT: retq 2196 %1 
= fptrunc <4 x float> %a0 to <4 x half> 2197 %2 = bitcast <4 x half> %1 to <4 x i16> 2198 ret <4 x i16> %2 2199} 2200 2201define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { 2202; AVX-LABEL: cvt_4f32_to_8i16_undef: 2203; AVX: # %bb.0: 2204; AVX-NEXT: subq $72, %rsp 2205; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2206; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 2207; AVX-NEXT: callq __truncsfhf2@PLT 2208; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2209; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2210; AVX-NEXT: callq __truncsfhf2@PLT 2211; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2212; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 2213; AVX-NEXT: # xmm0 = mem[1,0] 2214; AVX-NEXT: callq __truncsfhf2@PLT 2215; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2216; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload 2217; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2218; AVX-NEXT: callq __truncsfhf2@PLT 2219; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2220; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2221; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2222; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2223; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2224; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2225; AVX-NEXT: addq $72, %rsp 2226; AVX-NEXT: retq 2227; 2228; F16C-LABEL: cvt_4f32_to_8i16_undef: 2229; F16C: # %bb.0: 2230; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2231; F16C-NEXT: retq 2232; 2233; AVX512-LABEL: cvt_4f32_to_8i16_undef: 2234; AVX512: # %bb.0: 2235; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2236; AVX512-NEXT: retq 2237 %1 = fptrunc <4 x float> %a0 to <4 x half> 2238 %2 = bitcast <4 x half> %1 to <4 x i16> 2239 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2240 ret <8 x i16> %3 2241} 2242 2243define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { 2244; AVX-LABEL: cvt_4f32_to_8i16_zero: 2245; AVX: # %bb.0: 2246; AVX-NEXT: subq $72, %rsp 2247; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2248; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 2249; AVX-NEXT: callq __truncsfhf2@PLT 2250; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2251; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2252; AVX-NEXT: callq __truncsfhf2@PLT 2253; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2254; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 2255; AVX-NEXT: # xmm0 = mem[1,0] 2256; AVX-NEXT: callq __truncsfhf2@PLT 2257; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2258; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload 2259; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2260; AVX-NEXT: callq __truncsfhf2@PLT 2261; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2262; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2263; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2264; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2265; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2266; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2267; AVX-NEXT: addq $72, %rsp 2268; AVX-NEXT: 
retq 2269; 2270; F16C-LABEL: cvt_4f32_to_8i16_zero: 2271; F16C: # %bb.0: 2272; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2273; F16C-NEXT: retq 2274; 2275; AVX512-LABEL: cvt_4f32_to_8i16_zero: 2276; AVX512: # %bb.0: 2277; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2278; AVX512-NEXT: retq 2279 %1 = fptrunc <4 x float> %a0 to <4 x half> 2280 %2 = bitcast <4 x half> %1 to <4 x i16> 2281 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2282 ret <8 x i16> %3 2283} 2284 2285define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind { 2286; AVX-LABEL: cvt_8f32_to_8i16: 2287; AVX: # %bb.0: 2288; AVX-NEXT: subq $88, %rsp 2289; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2290; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 2291; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2292; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2293; AVX-NEXT: vzeroupper 2294; AVX-NEXT: callq __truncsfhf2@PLT 2295; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2296; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2297; AVX-NEXT: # xmm0 = mem[1,0] 2298; AVX-NEXT: callq __truncsfhf2@PLT 2299; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2300; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2301; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2302; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2303; AVX-NEXT: callq __truncsfhf2@PLT 2304; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2305; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2306; AVX-NEXT: # xmm0 = mem[1,1,3,3] 2307; AVX-NEXT: callq __truncsfhf2@PLT 2308; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2309; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2310; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2311; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2312; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2313; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2314; AVX-NEXT: # xmm0 = mem[3,3,3,3] 2315; AVX-NEXT: callq __truncsfhf2@PLT 2316; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2317; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2318; AVX-NEXT: # xmm0 = mem[1,0] 2319; AVX-NEXT: callq __truncsfhf2@PLT 2320; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2321; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2322; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2323; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2324; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2325; AVX-NEXT: vzeroupper 2326; AVX-NEXT: callq __truncsfhf2@PLT 2327; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2328; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2329; AVX-NEXT: # xmm0 = mem[1,1,3,3] 2330; AVX-NEXT: callq __truncsfhf2@PLT 2331; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2332; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2333; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2334; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2335; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte 
Folded Reload 2336; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 2337; AVX-NEXT: addq $88, %rsp 2338; AVX-NEXT: retq 2339; 2340; F16C-LABEL: cvt_8f32_to_8i16: 2341; F16C: # %bb.0: 2342; F16C-NEXT: vcvtps2ph $4, %ymm0, %xmm0 2343; F16C-NEXT: vzeroupper 2344; F16C-NEXT: retq 2345; 2346; AVX512-LABEL: cvt_8f32_to_8i16: 2347; AVX512: # %bb.0: 2348; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 2349; AVX512-NEXT: vzeroupper 2350; AVX512-NEXT: retq 2351 %1 = fptrunc <8 x float> %a0 to <8 x half> 2352 %2 = bitcast <8 x half> %1 to <8 x i16> 2353 ret <8 x i16> %2 2354} 2355 2356define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { 2357; AVX1-LABEL: cvt_16f32_to_16i16: 2358; AVX1: # %bb.0: 2359; AVX1-NEXT: subq $120, %rsp 2360; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2361; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2362; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 2363; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2364; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2365; AVX1-NEXT: vzeroupper 2366; AVX1-NEXT: callq __truncsfhf2@PLT 2367; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2368; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2369; AVX1-NEXT: # xmm0 = mem[1,0] 2370; AVX1-NEXT: callq __truncsfhf2@PLT 2371; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2372; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2373; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2374; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2375; AVX1-NEXT: callq __truncsfhf2@PLT 2376; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2377; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2378; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 2379; AVX1-NEXT: callq __truncsfhf2@PLT 2380; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2381; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2382; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2383; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 2384; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2385; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2386; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 2387; AVX1-NEXT: callq __truncsfhf2@PLT 2388; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2389; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2390; AVX1-NEXT: # xmm0 = mem[1,0] 2391; AVX1-NEXT: callq __truncsfhf2@PLT 2392; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 2393; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2394; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 2395; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2396; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2397; AVX1-NEXT: vzeroupper 2398; AVX1-NEXT: callq __truncsfhf2@PLT 2399; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2400; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2401; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 2402; AVX1-NEXT: callq __truncsfhf2@PLT 2403; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2404; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2405; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte 
Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: addq $120, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16f32_to_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $184, %rsp
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT: addq $184, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_16f32_to_16i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %ymm1, %xmm1
; F16C-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_16f32_to_16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT: retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  ret <16 x i16> %2
}

;
; Float to Half (Store)
;

define void @store_cvt_f32_to_i16(float %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_f32_to_i16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: callq __truncsfhf2@PLT
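; NOTE: per the x86-64 calling convention the libcall returns the half in the
; low word of %xmm0, so the result below is stored with a single vpextrw.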
; AVX-NEXT: vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_f32_to_i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_f32_to_i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc float %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_4i16(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_4i16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpextrw $0, %xmm0, 6(%rbx)
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpextrw $0, %xmm0, 4(%rbx)
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpextrw $0, %xmm0, 2(%rbx)
; AVX-NEXT: addq $64, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_4f32_to_4i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f32_to_4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  store <4 x i16> %2, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT: vmovaps %xmm0, (%rbx)
; AVX-NEXT: addq $64, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_undef:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT: vmovaps %xmm0, (%rbx)
; AVX-NEXT: addq $64, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_zero:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_8f32_to_8i16(<8 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_8f32_to_8i16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
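; NOTE: %rbx is callee-saved, so it carries the destination pointer (%rdi)
; across the eight __truncsfhf2 libcalls below.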
; AVX-NEXT: subq $80, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,1,3,3]
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovdqa %xmm0, (%rbx)
; AVX-NEXT: addq $80, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
;
; F16C-LABEL: store_cvt_8f32_to_8i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT: vzeroupper
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_8f32_to_8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = fptrunc <8 x float> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, ptr %a1
  ret void
}

define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $112, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: vmovaps %ymm0, (%rbx)
; AVX1-NEXT: addq $112, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $176, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT: vmovdqa %ymm0, (%rbx)
; AVX2-NEXT: addq $176, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; F16C-LABEL: store_cvt_16f32_to_16i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT: vzeroupper
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_16f32_to_16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  store <16 x i16> %2, ptr %a1
  ret void
}

;
; Double to Half
;

define i16 @cvt_f64_to_i16(double %a0) nounwind {
; AVX-LABEL: cvt_f64_to_i16:
; AVX: # %bb.0:
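; NOTE: the pushq/popq pair below only keeps %rsp 16-byte aligned around the
; __truncdfhf2 libcall; the popped value is discarded.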
; AVX-NEXT: pushq %rax
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: popq %rcx
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_f64_to_i16:
; F16C: # %bb.0:
; F16C-NEXT: pushq %rax
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vpextrw $0, %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: popq %rcx
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_f64_to_i16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rax
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: popq %rcx
; AVX512-NEXT: retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  ret i16 %2
}

define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX-LABEL: cvt_2f64_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_2f64_to_2i16:
; F16C: # %bb.0:
; F16C-NEXT: subq $40, %rsp
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: addq $40, %rsp
; F16C-NEXT: retq
;
; AVX512F-LABEL: cvt_2f64_to_2i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: subq $104, %rsp
; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: callq __truncdfhf2@PLT
; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2@PLT
; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512F-NEXT: # xmm0 = mem[1,0]
; AVX512F-NEXT: callq __truncdfhf2@PLT
; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,0]
; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: addq $104, %rsp
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512-FASTLANE-LABEL: cvt_2f64_to_2i16:
; AVX512-FASTLANE: # %bb.0:
; AVX512-FASTLANE-NEXT: subq $40, %rsp
; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0]
; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
; AVX512-FASTLANE-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX512-FASTLANE-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,0]
; AVX512-FASTLANE-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-FASTLANE-NEXT: addq $40, %rsp
; AVX512-FASTLANE-NEXT: retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  ret <2 x i16> %2
}

define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_4f64_to_4i16:
; F16C: # %bb.0:
; F16C-NEXT: subq $72, %rsp
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT: addq $72, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4f64_to_4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  ret <4 x i16> %2
}

define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_4f64_to_8i16_undef:
; F16C: # %bb.0:
; F16C-NEXT: subq $72, %rsp
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT: addq $72, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4f64_to_8i16_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_zero:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2@PLT
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_4f64_to_8i16_zero:
; F16C: # %bb.0:
; F16C-NEXT: subq $72, %rsp
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT: addq $72, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_4f64_to_8i16_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX-LABEL: cvt_8f64_to_8i16:
; AVX: # %bb.0:
; AVX-NEXT: subq $104, %rsp
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $104, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: cvt_8f64_to_8i16:
; F16C: # %bb.0:
; F16C-NEXT: subq $104, %rsp
; F16C-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT: vextractf128 $1, %ymm1, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT: vzeroupper
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = mem[1,0]
; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT: # xmm0 = xmm0[0],mem[0]
; F16C-NEXT: addq $104, %rsp
; F16C-NEXT: retq
;
; AVX512-LABEL: cvt_8f64_to_8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $120, %rsp
; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
;
AVX512-NEXT: addq $120, %rsp 3757; AVX512-NEXT: retq 3758 %1 = fptrunc <8 x double> %a0 to <8 x half> 3759 %2 = bitcast <8 x half> %1 to <8 x i16> 3760 ret <8 x i16> %2 3761} 3762 3763; 3764; Double to Half (Store) 3765; 3766 3767define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind { 3768; ALL-LABEL: store_cvt_f64_to_i16: 3769; ALL: # %bb.0: 3770; ALL-NEXT: pushq %rbx 3771; ALL-NEXT: movq %rdi, %rbx 3772; ALL-NEXT: callq __truncdfhf2@PLT 3773; ALL-NEXT: vpextrw $0, %xmm0, (%rbx) 3774; ALL-NEXT: popq %rbx 3775; ALL-NEXT: retq 3776; AVX-LABEL: store_cvt_f64_to_i16: 3777; AVX: # %bb.0: 3778; AVX-NEXT: pushq %rbx 3779; AVX-NEXT: movq %rdi, %rbx 3780; AVX-NEXT: callq __truncdfhf2@PLT 3781; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) 3782; AVX-NEXT: popq %rbx 3783; AVX-NEXT: retq 3784; 3785; F16C-LABEL: store_cvt_f64_to_i16: 3786; F16C: # %bb.0: 3787; F16C-NEXT: pushq %rbx 3788; F16C-NEXT: movq %rdi, %rbx 3789; F16C-NEXT: callq __truncdfhf2@PLT 3790; F16C-NEXT: vpextrw $0, %xmm0, (%rbx) 3791; F16C-NEXT: popq %rbx 3792; F16C-NEXT: retq 3793; 3794; AVX512-LABEL: store_cvt_f64_to_i16: 3795; AVX512: # %bb.0: 3796; AVX512-NEXT: pushq %rbx 3797; AVX512-NEXT: movq %rdi, %rbx 3798; AVX512-NEXT: callq __truncdfhf2@PLT 3799; AVX512-NEXT: vpextrw $0, %xmm0, (%rbx) 3800; AVX512-NEXT: popq %rbx 3801; AVX512-NEXT: retq 3802 %1 = fptrunc double %a0 to half 3803 %2 = bitcast half %1 to i16 3804 store i16 %2, ptr %a1 3805 ret void 3806} 3807 3808define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind { 3809; AVX-LABEL: store_cvt_2f64_to_2i16: 3810; AVX: # %bb.0: 3811; AVX-NEXT: pushq %rbx 3812; AVX-NEXT: subq $32, %rsp 3813; AVX-NEXT: movq %rdi, %rbx 3814; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3815; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 3816; AVX-NEXT: callq __truncdfhf2@PLT 3817; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3818; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3819; AVX-NEXT: callq __truncdfhf2@PLT 3820; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) 3821; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3822; AVX-NEXT: vpextrw $0, %xmm0, 2(%rbx) 3823; AVX-NEXT: addq $32, %rsp 3824; AVX-NEXT: popq %rbx 3825; AVX-NEXT: retq 3826; 3827; F16C-LABEL: store_cvt_2f64_to_2i16: 3828; F16C: # %bb.0: 3829; F16C-NEXT: pushq %rbx 3830; F16C-NEXT: subq $32, %rsp 3831; F16C-NEXT: movq %rdi, %rbx 3832; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3833; F16C-NEXT: callq __truncdfhf2@PLT 3834; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3835; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3836; F16C-NEXT: # xmm0 = mem[1,0] 3837; F16C-NEXT: callq __truncdfhf2@PLT 3838; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3839; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3840; F16C-NEXT: vmovd %xmm0, (%rbx) 3841; F16C-NEXT: addq $32, %rsp 3842; F16C-NEXT: popq %rbx 3843; F16C-NEXT: retq 3844; 3845; AVX512-LABEL: store_cvt_2f64_to_2i16: 3846; AVX512: # %bb.0: 3847; AVX512-NEXT: pushq %rbx 3848; AVX512-NEXT: subq $32, %rsp 3849; AVX512-NEXT: movq %rdi, %rbx 3850; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3851; AVX512-NEXT: callq __truncdfhf2@PLT 3852; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3853; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3854; AVX512-NEXT: # xmm0 = mem[1,0] 3855; AVX512-NEXT: callq __truncdfhf2@PLT 3856; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload 3857; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3858; AVX512-NEXT: vmovd %xmm0, (%rbx) 3859; AVX512-NEXT: addq $32, %rsp 3860; AVX512-NEXT: popq %rbx 3861; AVX512-NEXT: retq 3862 %1 = fptrunc <2 x double> %a0 to <2 x half> 3863 %2 = bitcast <2 x half> %1 to <2 x i16> 3864 store <2 x i16> %2, ptr %a1 3865 ret void 3866} 3867 3868define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind { 3869; AVX1-LABEL: store_cvt_4f64_to_4i16: 3870; AVX1: # %bb.0: 3871; AVX1-NEXT: pushq %rbx 3872; AVX1-NEXT: subq $80, %rsp 3873; AVX1-NEXT: movq %rdi, %rbx 3874; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3875; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 3876; AVX1-NEXT: vzeroupper 3877; AVX1-NEXT: callq __truncdfhf2@PLT 3878; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3879; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3880; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3881; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3882; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 3883; AVX1-NEXT: vzeroupper 3884; AVX1-NEXT: callq __truncdfhf2@PLT 3885; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3886; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3887; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3888; AVX1-NEXT: vzeroupper 3889; AVX1-NEXT: callq __truncdfhf2@PLT 3890; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3891; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3892; AVX1-NEXT: callq __truncdfhf2@PLT 3893; AVX1-NEXT: vpextrw $0, %xmm0, 4(%rbx) 3894; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3895; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) 3896; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3897; AVX1-NEXT: vpextrw $0, %xmm0, 6(%rbx) 3898; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3899; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) 3900; AVX1-NEXT: addq $80, %rsp 3901; AVX1-NEXT: popq %rbx 3902; AVX1-NEXT: retq 3903; 3904; AVX2-LABEL: store_cvt_4f64_to_4i16: 3905; AVX2: # %bb.0: 3906; AVX2-NEXT: pushq %rbx 3907; AVX2-NEXT: subq $80, %rsp 3908; AVX2-NEXT: movq %rdi, %rbx 3909; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3910; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 3911; AVX2-NEXT: vzeroupper 3912; AVX2-NEXT: callq __truncdfhf2@PLT 3913; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3914; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3915; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3916; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3917; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 3918; AVX2-NEXT: vzeroupper 3919; AVX2-NEXT: callq __truncdfhf2@PLT 3920; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3921; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3922; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3923; AVX2-NEXT: vzeroupper 3924; AVX2-NEXT: callq __truncdfhf2@PLT 3925; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3926; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3927; AVX2-NEXT: callq __truncdfhf2@PLT 3928; AVX2-NEXT: vpextrw $0, %xmm0, 4(%rbx) 3929; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 3930; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) 3931; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3932; AVX2-NEXT: vpextrw $0, %xmm0, 6(%rbx) 3933; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3934; AVX2-NEXT: 
vpextrw $0, %xmm0, 2(%rbx) 3935; AVX2-NEXT: addq $80, %rsp 3936; AVX2-NEXT: popq %rbx 3937; AVX2-NEXT: retq 3938; 3939; F16C-LABEL: store_cvt_4f64_to_4i16: 3940; F16C: # %bb.0: 3941; F16C-NEXT: pushq %rbx 3942; F16C-NEXT: subq $64, %rsp 3943; F16C-NEXT: movq %rdi, %rbx 3944; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3945; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 3946; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3947; F16C-NEXT: vzeroupper 3948; F16C-NEXT: callq __truncdfhf2@PLT 3949; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3950; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3951; F16C-NEXT: # xmm0 = mem[1,0] 3952; F16C-NEXT: callq __truncdfhf2@PLT 3953; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3954; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3955; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3956; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3957; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3958; F16C-NEXT: vzeroupper 3959; F16C-NEXT: callq __truncdfhf2@PLT 3960; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3961; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3962; F16C-NEXT: # xmm0 = mem[1,0] 3963; F16C-NEXT: callq __truncdfhf2@PLT 3964; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 3965; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3966; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3967; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3968; F16C-NEXT: vmovq %xmm0, (%rbx) 3969; F16C-NEXT: addq $64, %rsp 3970; F16C-NEXT: popq %rbx 3971; F16C-NEXT: retq 3972; 3973; AVX512-LABEL: store_cvt_4f64_to_4i16: 3974; AVX512: # %bb.0: 3975; AVX512-NEXT: pushq %rbx 3976; AVX512-NEXT: subq $64, %rsp 3977; AVX512-NEXT: movq %rdi, %rbx 3978; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3979; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3980; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3981; AVX512-NEXT: vzeroupper 3982; AVX512-NEXT: callq __truncdfhf2@PLT 3983; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3984; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3985; AVX512-NEXT: # xmm0 = mem[1,0] 3986; AVX512-NEXT: callq __truncdfhf2@PLT 3987; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3988; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3989; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3990; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3991; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3992; AVX512-NEXT: vzeroupper 3993; AVX512-NEXT: callq __truncdfhf2@PLT 3994; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 3995; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3996; AVX512-NEXT: # xmm0 = mem[1,0] 3997; AVX512-NEXT: callq __truncdfhf2@PLT 3998; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 3999; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4000; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4001; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4002; AVX512-NEXT: vmovq %xmm0, (%rbx) 4003; 
AVX512-NEXT: addq $64, %rsp 4004; AVX512-NEXT: popq %rbx 4005; AVX512-NEXT: retq 4006 %1 = fptrunc <4 x double> %a0 to <4 x half> 4007 %2 = bitcast <4 x half> %1 to <4 x i16> 4008 store <4 x i16> %2, ptr %a1 4009 ret void 4010} 4011 4012define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { 4013; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: 4014; AVX1: # %bb.0: 4015; AVX1-NEXT: pushq %rbx 4016; AVX1-NEXT: subq $80, %rsp 4017; AVX1-NEXT: movq %rdi, %rbx 4018; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4019; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4020; AVX1-NEXT: vzeroupper 4021; AVX1-NEXT: callq __truncdfhf2@PLT 4022; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4023; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4024; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4025; AVX1-NEXT: vzeroupper 4026; AVX1-NEXT: callq __truncdfhf2@PLT 4027; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4028; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4029; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4030; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4031; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4032; AVX1-NEXT: vzeroupper 4033; AVX1-NEXT: callq __truncdfhf2@PLT 4034; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4035; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 4036; AVX1-NEXT: callq __truncdfhf2@PLT 4037; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4038; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4039; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4040; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4041; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 4042; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 4043; AVX1-NEXT: vmovaps %xmm0, (%rbx) 4044; AVX1-NEXT: addq $80, %rsp 4045; AVX1-NEXT: popq %rbx 4046; AVX1-NEXT: retq 4047; 4048; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: 4049; AVX2: # %bb.0: 4050; AVX2-NEXT: pushq %rbx 4051; AVX2-NEXT: subq $80, %rsp 4052; AVX2-NEXT: movq %rdi, %rbx 4053; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4054; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4055; AVX2-NEXT: vzeroupper 4056; AVX2-NEXT: callq __truncdfhf2@PLT 4057; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4058; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4059; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4060; AVX2-NEXT: vzeroupper 4061; AVX2-NEXT: callq __truncdfhf2@PLT 4062; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4063; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4064; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 4065; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4066; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4067; AVX2-NEXT: vzeroupper 4068; AVX2-NEXT: callq __truncdfhf2@PLT 4069; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4070; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 4071; AVX2-NEXT: callq __truncdfhf2@PLT 4072; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4073; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4074; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4075; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4076; AVX2-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 4077; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 4078; AVX2-NEXT: vmovaps %xmm0, (%rbx) 4079; AVX2-NEXT: addq $80, %rsp 4080; AVX2-NEXT: popq %rbx 4081; AVX2-NEXT: retq 4082; 4083; F16C-LABEL: store_cvt_4f64_to_8i16_undef: 4084; F16C: # %bb.0: 4085; F16C-NEXT: pushq %rbx 4086; F16C-NEXT: subq $64, %rsp 4087; F16C-NEXT: movq %rdi, %rbx 4088; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4089; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 4090; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4091; F16C-NEXT: vzeroupper 4092; F16C-NEXT: callq __truncdfhf2@PLT 4093; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4094; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4095; F16C-NEXT: # xmm0 = mem[1,0] 4096; F16C-NEXT: callq __truncdfhf2@PLT 4097; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4098; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4099; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4100; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4101; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4102; F16C-NEXT: vzeroupper 4103; F16C-NEXT: callq __truncdfhf2@PLT 4104; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4105; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4106; F16C-NEXT: # xmm0 = mem[1,0] 4107; F16C-NEXT: callq __truncdfhf2@PLT 4108; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4109; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4110; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4111; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero 4112; F16C-NEXT: vmovaps %xmm0, (%rbx) 4113; F16C-NEXT: addq $64, %rsp 4114; F16C-NEXT: popq %rbx 4115; F16C-NEXT: retq 4116; 4117; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: 4118; AVX512: # %bb.0: 4119; AVX512-NEXT: pushq %rbx 4120; AVX512-NEXT: subq $64, %rsp 4121; AVX512-NEXT: movq %rdi, %rbx 4122; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4123; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4124; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4125; AVX512-NEXT: vzeroupper 4126; AVX512-NEXT: callq __truncdfhf2@PLT 4127; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4128; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4129; AVX512-NEXT: # xmm0 = mem[1,0] 4130; AVX512-NEXT: callq __truncdfhf2@PLT 4131; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4132; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4133; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4134; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4135; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4136; AVX512-NEXT: vzeroupper 4137; AVX512-NEXT: callq __truncdfhf2@PLT 4138; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4139; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4140; AVX512-NEXT: # xmm0 = mem[1,0] 4141; AVX512-NEXT: callq __truncdfhf2@PLT 4142; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4143; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4144; AVX512-NEXT: vpunpckldq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4145; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4146; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4147; AVX512-NEXT: callq __truncdfhf2@PLT 4148; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 4149; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4150; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] 4151; AVX512-NEXT: vmovaps %xmm0, (%rbx) 4152; AVX512-NEXT: addq $64, %rsp 4153; AVX512-NEXT: popq %rbx 4154; AVX512-NEXT: retq 4155 %1 = fptrunc <4 x double> %a0 to <4 x half> 4156 %2 = bitcast <4 x half> %1 to <4 x i16> 4157 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4158 store <8 x i16> %3, ptr %a1 4159 ret void 4160} 4161 4162define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { 4163; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: 4164; AVX1: # %bb.0: 4165; AVX1-NEXT: pushq %rbx 4166; AVX1-NEXT: subq $80, %rsp 4167; AVX1-NEXT: movq %rdi, %rbx 4168; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4169; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4170; AVX1-NEXT: vzeroupper 4171; AVX1-NEXT: callq __truncdfhf2@PLT 4172; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4173; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4174; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4175; AVX1-NEXT: vzeroupper 4176; AVX1-NEXT: callq __truncdfhf2@PLT 4177; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4178; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4179; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4180; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4181; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4182; AVX1-NEXT: vzeroupper 4183; AVX1-NEXT: callq __truncdfhf2@PLT 4184; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4185; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 4186; AVX1-NEXT: callq __truncdfhf2@PLT 4187; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4188; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4189; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4190; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4191; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 4192; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 4193; AVX1-NEXT: vmovaps %xmm0, (%rbx) 4194; AVX1-NEXT: addq $80, %rsp 4195; AVX1-NEXT: popq %rbx 4196; AVX1-NEXT: retq 4197; 4198; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: 4199; AVX2: # %bb.0: 4200; AVX2-NEXT: pushq %rbx 4201; AVX2-NEXT: subq $80, %rsp 4202; AVX2-NEXT: movq %rdi, %rbx 4203; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4204; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4205; AVX2-NEXT: vzeroupper 4206; AVX2-NEXT: callq __truncdfhf2@PLT 4207; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4208; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4209; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4210; AVX2-NEXT: vzeroupper 4211; AVX2-NEXT: callq __truncdfhf2@PLT 4212; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4213; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4214; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 4215; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4216; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 4217; AVX2-NEXT: vzeroupper 4218; 
AVX2-NEXT: callq __truncdfhf2@PLT 4219; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4220; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 4221; AVX2-NEXT: callq __truncdfhf2@PLT 4222; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4223; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4224; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4225; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 4226; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 4227; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 4228; AVX2-NEXT: vmovaps %xmm0, (%rbx) 4229; AVX2-NEXT: addq $80, %rsp 4230; AVX2-NEXT: popq %rbx 4231; AVX2-NEXT: retq 4232; 4233; F16C-LABEL: store_cvt_4f64_to_8i16_zero: 4234; F16C: # %bb.0: 4235; F16C-NEXT: pushq %rbx 4236; F16C-NEXT: subq $64, %rsp 4237; F16C-NEXT: movq %rdi, %rbx 4238; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4239; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 4240; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4241; F16C-NEXT: vzeroupper 4242; F16C-NEXT: callq __truncdfhf2@PLT 4243; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4244; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4245; F16C-NEXT: # xmm0 = mem[1,0] 4246; F16C-NEXT: callq __truncdfhf2@PLT 4247; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4248; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4249; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4250; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4251; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4252; F16C-NEXT: vzeroupper 4253; F16C-NEXT: callq __truncdfhf2@PLT 4254; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4255; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4256; F16C-NEXT: # xmm0 = mem[1,0] 4257; F16C-NEXT: callq __truncdfhf2@PLT 4258; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4259; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4260; F16C-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4261; F16C-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero 4262; F16C-NEXT: vmovaps %xmm0, (%rbx) 4263; F16C-NEXT: addq $64, %rsp 4264; F16C-NEXT: popq %rbx 4265; F16C-NEXT: retq 4266; 4267; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: 4268; AVX512: # %bb.0: 4269; AVX512-NEXT: pushq %rbx 4270; AVX512-NEXT: subq $64, %rsp 4271; AVX512-NEXT: movq %rdi, %rbx 4272; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4273; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4274; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4275; AVX512-NEXT: vzeroupper 4276; AVX512-NEXT: callq __truncdfhf2@PLT 4277; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4278; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4279; AVX512-NEXT: # xmm0 = mem[1,0] 4280; AVX512-NEXT: callq __truncdfhf2@PLT 4281; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4282; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4283; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4284; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4285; 
AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4286; AVX512-NEXT: vzeroupper 4287; AVX512-NEXT: callq __truncdfhf2@PLT 4288; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4289; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4290; AVX512-NEXT: # xmm0 = mem[1,0] 4291; AVX512-NEXT: callq __truncdfhf2@PLT 4292; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4293; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4294; AVX512-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4295; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero 4296; AVX512-NEXT: vmovaps %xmm0, (%rbx) 4297; AVX512-NEXT: addq $64, %rsp 4298; AVX512-NEXT: popq %rbx 4299; AVX512-NEXT: retq 4300 %1 = fptrunc <4 x double> %a0 to <4 x half> 4301 %2 = bitcast <4 x half> %1 to <4 x i16> 4302 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 4303 store <8 x i16> %3, ptr %a1 4304 ret void 4305} 4306 4307define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind { 4308; AVX-LABEL: store_cvt_8f64_to_8i16: 4309; AVX: # %bb.0: 4310; AVX-NEXT: pushq %rbx 4311; AVX-NEXT: subq $96, %rsp 4312; AVX-NEXT: movq %rdi, %rbx 4313; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4314; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4315; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 4316; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4317; AVX-NEXT: vzeroupper 4318; AVX-NEXT: callq __truncdfhf2@PLT 4319; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4320; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4321; AVX-NEXT: # xmm0 = mem[1,0] 4322; AVX-NEXT: callq __truncdfhf2@PLT 4323; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4324; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4325; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4326; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4327; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4328; AVX-NEXT: vzeroupper 4329; AVX-NEXT: callq __truncdfhf2@PLT 4330; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4331; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4332; AVX-NEXT: # xmm0 = mem[1,0] 4333; AVX-NEXT: callq __truncdfhf2@PLT 4334; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4335; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4336; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4337; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4338; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4339; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4340; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 4341; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4342; AVX-NEXT: vzeroupper 4343; AVX-NEXT: callq __truncdfhf2@PLT 4344; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4345; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4346; AVX-NEXT: # xmm0 = mem[1,0] 4347; AVX-NEXT: callq __truncdfhf2@PLT 4348; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4349; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4350; AVX-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4351; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4352; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4353; AVX-NEXT: vzeroupper 4354; AVX-NEXT: callq __truncdfhf2@PLT 4355; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4356; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4357; AVX-NEXT: # xmm0 = mem[1,0] 4358; AVX-NEXT: callq __truncdfhf2@PLT 4359; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4360; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4361; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4362; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4363; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4364; AVX-NEXT: # xmm0 = xmm0[0],mem[0] 4365; AVX-NEXT: vmovdqa %xmm0, (%rbx) 4366; AVX-NEXT: addq $96, %rsp 4367; AVX-NEXT: popq %rbx 4368; AVX-NEXT: retq 4369; 4370; F16C-LABEL: store_cvt_8f64_to_8i16: 4371; F16C: # %bb.0: 4372; F16C-NEXT: pushq %rbx 4373; F16C-NEXT: subq $96, %rsp 4374; F16C-NEXT: movq %rdi, %rbx 4375; F16C-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4376; F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4377; F16C-NEXT: vextractf128 $1, %ymm1, %xmm0 4378; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4379; F16C-NEXT: vzeroupper 4380; F16C-NEXT: callq __truncdfhf2@PLT 4381; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4382; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4383; F16C-NEXT: # xmm0 = mem[1,0] 4384; F16C-NEXT: callq __truncdfhf2@PLT 4385; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4386; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4387; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4388; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4389; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4390; F16C-NEXT: vzeroupper 4391; F16C-NEXT: callq __truncdfhf2@PLT 4392; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4393; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4394; F16C-NEXT: # xmm0 = mem[1,0] 4395; F16C-NEXT: callq __truncdfhf2@PLT 4396; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4397; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4398; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4399; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4400; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4401; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4402; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 4403; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4404; F16C-NEXT: vzeroupper 4405; F16C-NEXT: callq __truncdfhf2@PLT 4406; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4407; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4408; F16C-NEXT: # xmm0 = mem[1,0] 4409; F16C-NEXT: callq __truncdfhf2@PLT 4410; F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4411; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4412; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4413; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 
4414; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4415; F16C-NEXT: vzeroupper 4416; F16C-NEXT: callq __truncdfhf2@PLT 4417; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4418; F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4419; F16C-NEXT: # xmm0 = mem[1,0] 4420; F16C-NEXT: callq __truncdfhf2@PLT 4421; F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4422; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4423; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4424; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4425; F16C-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4426; F16C-NEXT: # xmm0 = xmm0[0],mem[0] 4427; F16C-NEXT: vmovdqa %xmm0, (%rbx) 4428; F16C-NEXT: addq $96, %rsp 4429; F16C-NEXT: popq %rbx 4430; F16C-NEXT: retq 4431; 4432; AVX512-LABEL: store_cvt_8f64_to_8i16: 4433; AVX512: # %bb.0: 4434; AVX512-NEXT: pushq %rbx 4435; AVX512-NEXT: subq $112, %rsp 4436; AVX512-NEXT: movq %rdi, %rbx 4437; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 4438; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 4439; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4440; AVX512-NEXT: vzeroupper 4441; AVX512-NEXT: callq __truncdfhf2@PLT 4442; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4443; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 4444; AVX512-NEXT: # xmm0 = mem[1,0] 4445; AVX512-NEXT: callq __truncdfhf2@PLT 4446; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4447; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4448; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4449; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 4450; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 4451; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4452; AVX512-NEXT: vzeroupper 4453; AVX512-NEXT: callq __truncdfhf2@PLT 4454; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4455; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4456; AVX512-NEXT: # xmm0 = mem[1,0] 4457; AVX512-NEXT: callq __truncdfhf2@PLT 4458; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4459; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4460; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4461; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4462; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4463; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 4464; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 4465; AVX512-NEXT: vzeroupper 4466; AVX512-NEXT: callq __truncdfhf2@PLT 4467; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4468; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4469; AVX512-NEXT: # xmm0 = mem[1,0] 4470; AVX512-NEXT: callq __truncdfhf2@PLT 4471; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4472; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4473; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4474; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 4475; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 4476; AVX512-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4477; AVX512-NEXT: vzeroupper 4478; AVX512-NEXT: callq __truncdfhf2@PLT 4479; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4480; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4481; AVX512-NEXT: # xmm0 = mem[1,0] 4482; AVX512-NEXT: callq __truncdfhf2@PLT 4483; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4484; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4485; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4486; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4487; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4488; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] 4489; AVX512-NEXT: vmovdqa %xmm0, (%rbx) 4490; AVX512-NEXT: addq $112, %rsp 4491; AVX512-NEXT: popq %rbx 4492; AVX512-NEXT: retq 4493 %1 = fptrunc <8 x double> %a0 to <8 x half> 4494 %2 = bitcast <8 x half> %1 to <8 x i16> 4495 store <8 x i16> %2, ptr %a1 4496 ret void 4497} 4498 4499define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind { 4500; AVX1-LABEL: store_cvt_32f32_to_32f16: 4501; AVX1: # %bb.0: 4502; AVX1-NEXT: pushq %rbx 4503; AVX1-NEXT: subq $176, %rsp 4504; AVX1-NEXT: movq %rdi, %rbx 4505; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4506; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4507; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4508; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4509; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 4510; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4511; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4512; AVX1-NEXT: vzeroupper 4513; AVX1-NEXT: callq __truncsfhf2@PLT 4514; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4515; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4516; AVX1-NEXT: # xmm0 = mem[1,0] 4517; AVX1-NEXT: callq __truncsfhf2@PLT 4518; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4519; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4520; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4521; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4522; AVX1-NEXT: callq __truncsfhf2@PLT 4523; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4524; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4525; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4526; AVX1-NEXT: callq __truncsfhf2@PLT 4527; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4528; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4529; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4530; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4531; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4532; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4533; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4534; AVX1-NEXT: callq __truncsfhf2@PLT 4535; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4536; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4537; AVX1-NEXT: # xmm0 = mem[1,0] 4538; AVX1-NEXT: callq __truncsfhf2@PLT 4539; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4540; AVX1-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4541; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4542; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4543; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4544; AVX1-NEXT: vzeroupper 4545; AVX1-NEXT: callq __truncsfhf2@PLT 4546; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4547; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4548; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4549; AVX1-NEXT: callq __truncsfhf2@PLT 4550; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4551; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4552; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4553; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4554; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4555; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4556; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4557; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4558; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4559; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4560; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4561; AVX1-NEXT: vzeroupper 4562; AVX1-NEXT: callq __truncsfhf2@PLT 4563; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4564; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4565; AVX1-NEXT: # xmm0 = mem[1,0] 4566; AVX1-NEXT: callq __truncsfhf2@PLT 4567; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4568; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4569; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4570; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4571; AVX1-NEXT: callq __truncsfhf2@PLT 4572; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4573; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4574; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4575; AVX1-NEXT: callq __truncsfhf2@PLT 4576; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4577; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4578; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4579; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4580; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4581; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4582; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4583; AVX1-NEXT: callq __truncsfhf2@PLT 4584; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4585; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4586; AVX1-NEXT: # xmm0 = mem[1,0] 4587; AVX1-NEXT: callq __truncsfhf2@PLT 4588; AVX1-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4589; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4590; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 4591; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4592; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4593; AVX1-NEXT: vzeroupper 4594; AVX1-NEXT: callq __truncsfhf2@PLT 4595; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4596; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4597; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4598; 
AVX1-NEXT: callq __truncsfhf2@PLT 4599; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4600; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4601; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 4602; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4603; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4604; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4605; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 4606; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4607; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4608; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4609; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4610; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4611; AVX1-NEXT: vzeroupper 4612; AVX1-NEXT: callq __truncsfhf2@PLT 4613; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4614; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4615; AVX1-NEXT: # xmm0 = mem[1,0] 4616; AVX1-NEXT: callq __truncsfhf2@PLT 4617; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4618; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4619; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4620; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4621; AVX1-NEXT: callq __truncsfhf2@PLT 4622; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4623; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4624; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4625; AVX1-NEXT: callq __truncsfhf2@PLT 4626; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4627; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4628; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4629; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4630; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4631; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4632; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4633; AVX1-NEXT: callq __truncsfhf2@PLT 4634; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4635; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4636; AVX1-NEXT: # xmm0 = mem[1,0] 4637; AVX1-NEXT: callq __truncsfhf2@PLT 4638; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4639; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4640; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4641; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4642; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4643; AVX1-NEXT: vzeroupper 4644; AVX1-NEXT: callq __truncsfhf2@PLT 4645; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4646; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4647; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4648; AVX1-NEXT: callq __truncsfhf2@PLT 4649; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4650; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4651; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4652; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4653; 
AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4654; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4655; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4656; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4657; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4658; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4659; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 4660; AVX1-NEXT: vzeroupper 4661; AVX1-NEXT: callq __truncsfhf2@PLT 4662; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4663; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4664; AVX1-NEXT: # xmm0 = mem[1,0] 4665; AVX1-NEXT: callq __truncsfhf2@PLT 4666; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4667; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4668; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4669; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4670; AVX1-NEXT: callq __truncsfhf2@PLT 4671; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4672; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4673; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4674; AVX1-NEXT: callq __truncsfhf2@PLT 4675; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4676; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4677; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4678; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4679; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4680; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4681; AVX1-NEXT: # xmm0 = mem[3,3,3,3] 4682; AVX1-NEXT: callq __truncsfhf2@PLT 4683; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4684; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4685; AVX1-NEXT: # xmm0 = mem[1,0] 4686; AVX1-NEXT: callq __truncsfhf2@PLT 4687; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4688; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 4689; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4690; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4691; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4692; AVX1-NEXT: vzeroupper 4693; AVX1-NEXT: callq __truncsfhf2@PLT 4694; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 4695; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4696; AVX1-NEXT: # xmm0 = mem[1,1,3,3] 4697; AVX1-NEXT: callq __truncsfhf2@PLT 4698; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 4699; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4700; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4701; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 4702; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4703; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] 4704; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 4705; AVX1-NEXT: vmovaps %ymm0, 32(%rbx) 4706; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4707; AVX1-NEXT: vmovaps %ymm0, (%rbx) 4708; AVX1-NEXT: addq $176, %rsp 4709; AVX1-NEXT: popq %rbx 4710; 
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_32f32_to_32f16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $240, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT: vmovdqa %ymm0, 32(%rbx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, (%rbx)
; AVX2-NEXT: addq $240, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; F16C-LABEL: store_cvt_32f32_to_32f16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %ymm3, 48(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm2, 32(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT: vzeroupper
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_32f32_to_32f16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %zmm1, 32(%rdi)
; AVX512-NEXT: vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = fptrunc <32 x float> %a0 to <32 x half>
  store <32 x half> %1, ptr %a1
  ret void
}

define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; AVX-LABEL: fptosi_2f16_to_4i32:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: fptosi_2f16_to_4i32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; F16C-NEXT: retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: retq
  %cvt = fptosi <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

; PR83402
define <4 x i32> @fptosi_4f16_to_4i32(<4 x half> %a) nounwind {
; AVX-LABEL: fptosi_4f16_to_4i32:
; AVX: # %bb.0:
; AVX-NEXT: subq $72, %rsp
; AVX-NEXT: vmovdqa %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpsrlq $48, %xmm1, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: retq
;
; F16C-LABEL: fptosi_4f16_to_4i32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
; F16C-NEXT: retq
;
; AVX512-LABEL: fptosi_4f16_to_4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT: retq
  %cvt = fptosi <4 x half> %a to <4 x i32>
  ret <4 x i32> %cvt
}

define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
; AVX1-LABEL: fptoui_2f16_to_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $40, %rsp
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: addq $40, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_2f16_to_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $40, %rsp
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: addq $40, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: fptoui_2f16_to_4i32:
; F16C: # %bb.0:
; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm1
; F16C-NEXT: vpsrad $31, %xmm1, %xmm2
; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
; F16C-NEXT: vpand %xmm2, %xmm0, %xmm0
; F16C-NEXT: vpor %xmm0, %xmm1, %xmm0
; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; F16C-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f16_to_4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512-FASTLANE-LABEL: fptoui_2f16_to_4i32:
; AVX512-FASTLANE: # %bb.0:
; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-FASTLANE-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512-FASTLANE-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-FASTLANE-NEXT: retq
  %cvt = fptoui <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

define <4 x i32> @fptoui_4f16_to_4i32(<4 x half> %a) nounwind {
; AVX1-LABEL: fptoui_4f16_to_4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $72, %rsp
; AVX1-NEXT: vmovdqa %xmm0, %xmm1
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: addq $72, %rsp
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_4f16_to_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $72, %rsp
; AVX2-NEXT: vmovdqa %xmm0, %xmm1
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpsrlq $48, %xmm1, %xmm0
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __extendhfsf2@PLT
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT: addq $72, %rsp
; AVX2-NEXT: retq
;
; F16C-LABEL: fptoui_4f16_to_4i32:
; F16C: # %bb.0:
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm1
; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
; F16C-NEXT: vorps %xmm0, %xmm1, %xmm0
; F16C-NEXT: vblendvps %xmm1, %xmm0, %xmm1, %xmm0
; F16C-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f16_to_4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512-FASTLANE-LABEL: fptoui_4f16_to_4i32:
; AVX512-FASTLANE: # %bb.0:
; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-FASTLANE-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512-FASTLANE-NEXT: retq
  %cvt = fptoui <4 x half> %a to <4 x i32>
  ret <4 x i32> %cvt
}