; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=F16C
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512-FASTLANE
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512-FASTLANE

;
; Half to Float
;

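; Scalar half-to-float: plain AVX has no half conversion instructions, so the
; extend tail-calls the __extendhfsf2 libcall; F16C and AVX512 lower it to a
; single vcvtph2ps.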
define float @cvt_i16_to_f32(i16 %a0) nounwind {
; AVX-LABEL: cvt_i16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpinsrw $0, %edi, %xmm0, %xmm0
; AVX-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: cvt_i16_to_f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vmovd %edi, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_i16_to_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %edi, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

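; <4 x half> widening: without F16C the extend is scalarized into four
; __extendhfsf2 calls with stack spills/reloads between them; F16C and AVX512
; fold the whole vector into one vcvtph2ps.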
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX-LABEL: cvt_4i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

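; Converting only the low four lanes of an <8 x i16>: the shuffle down to
; <4 x i16> costs nothing, so codegen matches cvt_4i16_to_4f32 exactly.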
define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

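; Full eight-lane widening: F16C and AVX512 use a single xmm-to-ymm vcvtph2ps;
; plain AVX expands to eight __extendhfsf2 calls stitched together with
; vinsertps and a final vinsertf128.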
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; AVX-LABEL: cvt_8i16_to_8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $56, %rsp
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $56, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}

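; Sixteen lanes: F16C needs two xmm-to-ymm vcvtph2ps, one per 128-bit half of
; the source, while AVX512 converts ymm to zmm in a single instruction.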
define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $104, %rsp
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $104, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $104, %rsp
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    addq $104, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16i16_to_16f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm2
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm1
; F16C-NEXT:    vmovaps %ymm2, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16i16_to_16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

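; Strict-FP (constrained) variants begin here. For the <2 x half> case,
; F16C/AVX512 first widen with vpmovzxdq so the extra lanes a four-lane
; vcvtph2ps would convert hold zeros rather than garbage that could raise
; spurious exceptions.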
define <2 x float> @cvt_2i16_to_2f32_constrained(<2 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_2i16_to_2f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_2i16_to_2f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_2i16_to_2f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <2 x float> %2
}
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) strictfp

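; Strict <4 x half> extend: the lowering is identical to the non-constrained
; cvt_4i16_to_4f32 on all targets.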
define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_4i16_to_4f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4i16_to_4f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4i16_to_4f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <4 x float> %2
}
declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) strictfp

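; Strict <8 x half> extend: same lowering as the non-constrained
; cvt_8i16_to_8f32.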
define <8 x float> @cvt_8i16_to_8f32_constrained(<8 x i16> %a0) nounwind strictfp {
; AVX-LABEL: cvt_8i16_to_8f32_constrained:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $56, %rsp
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $56, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8i16_to_8f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <8 x float> %2
}
declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) strictfp

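; Strict <16 x half> extend: same shape as cvt_16i16_to_16f32, though here the
; F16C sequence extracts and converts the high 128-bit half first, avoiding
; the extra ymm register move of the non-constrained version.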
define <16 x float> @cvt_16i16_to_16f32_constrained(<16 x i16> %a0) nounwind strictfp {
; AVX1-LABEL: cvt_16i16_to_16f32_constrained:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $104, %rsp
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    addq $104, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32_constrained:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $104, %rsp
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    addq $104, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16i16_to_16f32_constrained:
; F16C:       # %bb.0:
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm1
; F16C-NEXT:    vcvtph2ps %xmm1, %ymm1
; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16i16_to_16f32_constrained:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %ymm0, %zmm0
; AVX512-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %1, metadata !"fpexcept.strict") strictfp
  ret <16 x float> %2
}
declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) strictfp

;
; Half to Float (Load)
;

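; Scalar load + extend: the i16 is loaded straight into a register with
; vpinsrw; F16C/AVX512 then use vcvtph2ps, plain AVX tail-calls __extendhfsf2.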
define float @load_cvt_i16_to_f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_i16_to_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: load_cvt_i16_to_f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_i16_to_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = load i16, ptr %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}

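; <4 x half> load + extend: F16C and AVX512 fold the load into vcvtph2ps with
; a memory operand; plain AVX loads each element separately and goes through
; the libcall.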
define <4 x float> @load_cvt_4i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_4i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_4i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_4i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    retq
  %1 = load <4 x i16>, ptr %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

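; Low four lanes of a loaded <8 x i16>: F16C/AVX512 still fold the load into
; vcvtph2ps; plain AVX loads the whole low qword and splits it with shifts.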
define <4 x float> @load_cvt_8i16_to_4f32(ptr %a0) nounwind {
; AVX-LABEL: load_cvt_8i16_to_4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    movq (%rdi), %rax
; AVX-NEXT:    movq %rax, %rcx
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    shrq $32, %rcx
; AVX-NEXT:    shrq $48, %rdx
; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_4f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}

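; Eight-lane load + extend: F16C/AVX512 fold the load into a single
; memory-operand vcvtph2ps into ymm; AVX1 and AVX2 differ only in how they
; rematerialize scalar elements (vbroadcastss vs. vpinsrw) around the eight
; __extendhfsf2 calls.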
define <8 x float> @load_cvt_8i16_to_8f32(ptr %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $48, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovaps (%rdi), %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vbroadcastss 8(%rdi), %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vbroadcastss 12(%rbx), %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vbroadcastss 4(%rbx), %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    addq $48, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $48, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    addq $48, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    retq
;
; F16C-LABEL: load_cvt_8i16_to_8f32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps (%rdi), %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps (%rdi), %ymm0
; AVX512-NEXT:    retq
  %1 = load <8 x i16>, ptr %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}

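; Sixteen-lane load + extend: without F16C both 128-bit halves are loaded from
; the pointer (kept in %rbx) and scalarized through per-element __extendhfsf2
; calls.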
define <16 x float> @load_cvt_16i16_to_16f32(ptr %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $80, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vbroadcastss 8(%rdi), %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rbx), %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps 16(%rbx), %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vbroadcastss 12(%rbx), %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vbroadcastss 4(%rbx), %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vbroadcastss 24(%rbx), %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vbroadcastss 28(%rbx), %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    callq __extendhfsf2@PLT
1116; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1117; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1118; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1119; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1120; AVX1-NEXT:    callq __extendhfsf2@PLT
1121; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1122; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1123; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
1124; AVX1-NEXT:    callq __extendhfsf2@PLT
1125; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1126; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1127; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1128; AVX1-NEXT:    vbroadcastss 20(%rbx), %xmm0
1129; AVX1-NEXT:    callq __extendhfsf2@PLT
1130; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1131; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1132; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1133; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1134; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
1135; AVX1-NEXT:    callq __extendhfsf2@PLT
1136; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1137; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1138; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
1139; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1140; AVX1-NEXT:    addq $80, %rsp
1141; AVX1-NEXT:    popq %rbx
1142; AVX1-NEXT:    retq
1143;
1144; AVX2-LABEL: load_cvt_16i16_to_16f32:
1145; AVX2:       # %bb.0:
1146; AVX2-NEXT:    pushq %rbx
1147; AVX2-NEXT:    subq $80, %rsp
1148; AVX2-NEXT:    movq %rdi, %rbx
1149; AVX2-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
1150; AVX2-NEXT:    callq __extendhfsf2@PLT
1151; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1152; AVX2-NEXT:    vmovdqa (%rbx), %xmm1
1153; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1154; AVX2-NEXT:    vmovaps 16(%rbx), %xmm0
1155; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1156; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1157; AVX2-NEXT:    callq __extendhfsf2@PLT
1158; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1159; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1160; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1161; AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
1162; AVX2-NEXT:    callq __extendhfsf2@PLT
1163; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1164; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1165; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1166; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1167; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1168; AVX2-NEXT:    callq __extendhfsf2@PLT
1169; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1170; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1171; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1172; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1173; AVX2-NEXT:    callq __extendhfsf2@PLT
1174; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1175; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1176; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
1177; AVX2-NEXT:    callq __extendhfsf2@PLT
1178; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1179; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1180; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1181; AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
1182; AVX2-NEXT:    callq __extendhfsf2@PLT
1183; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1184; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1185; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1186; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1187; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
1188; AVX2-NEXT:    callq __extendhfsf2@PLT
1189; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1190; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1191; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1192; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1193; AVX2-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
1194; AVX2-NEXT:    vzeroupper
1195; AVX2-NEXT:    callq __extendhfsf2@PLT
1196; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1197; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1198; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1199; AVX2-NEXT:    callq __extendhfsf2@PLT
1200; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1201; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1202; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1203; AVX2-NEXT:    vpinsrw $0, 28(%rbx), %xmm0, %xmm0
1204; AVX2-NEXT:    callq __extendhfsf2@PLT
1205; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1206; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1207; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1208; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1209; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1210; AVX2-NEXT:    callq __extendhfsf2@PLT
1211; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1212; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1213; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1214; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1215; AVX2-NEXT:    callq __extendhfsf2@PLT
1216; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1217; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1218; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
1219; AVX2-NEXT:    callq __extendhfsf2@PLT
1220; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1221; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1222; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1223; AVX2-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
1224; AVX2-NEXT:    callq __extendhfsf2@PLT
1225; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1226; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1227; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1228; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1229; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
1230; AVX2-NEXT:    callq __extendhfsf2@PLT
1231; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1232; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1233; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
1234; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1235; AVX2-NEXT:    addq $80, %rsp
1236; AVX2-NEXT:    popq %rbx
1237; AVX2-NEXT:    retq
1238;
1239; F16C-LABEL: load_cvt_16i16_to_16f32:
1240; F16C:       # %bb.0:
1241; F16C-NEXT:    vcvtph2ps (%rdi), %ymm0
1242; F16C-NEXT:    vcvtph2ps 16(%rdi), %ymm1
1243; F16C-NEXT:    retq
1244;
1245; AVX512-LABEL: load_cvt_16i16_to_16f32:
1246; AVX512:       # %bb.0:
1247; AVX512-NEXT:    vcvtph2ps (%rdi), %zmm0
1248; AVX512-NEXT:    retq
1249  %1 = load <16 x i16>, ptr %a0
1250  %2 = bitcast <16 x i16> %1 to <16 x half>
1251  %3 = fpext <16 x half> %2 to <16 x float>
1252  ret <16 x float> %3
1253}
1254
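; NOTE: the strict (constrained) loads below still fold to a single vcvtph2ps
; from memory on F16C/AVX-512, since only the four requested half values are
; loaded and converted, so no spurious FP exceptions can be raised.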
1255define <4 x float> @load_cvt_4i16_to_4f32_constrained(ptr %a0) nounwind strictfp {
1256; AVX-LABEL: load_cvt_4i16_to_4f32_constrained:
1257; AVX:       # %bb.0:
1258; AVX-NEXT:    subq $72, %rsp
1259; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
1260; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1261; AVX-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
1262; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1263; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
1264; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1265; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
1266; AVX-NEXT:    callq __extendhfsf2@PLT
1267; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1268; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1269; AVX-NEXT:    callq __extendhfsf2@PLT
1270; AVX-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1271; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
1272; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1273; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1274; AVX-NEXT:    callq __extendhfsf2@PLT
1275; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1276; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1277; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1278; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1279; AVX-NEXT:    callq __extendhfsf2@PLT
1280; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1281; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1282; AVX-NEXT:    addq $72, %rsp
1283; AVX-NEXT:    retq
1284;
1285; F16C-LABEL: load_cvt_4i16_to_4f32_constrained:
1286; F16C:       # %bb.0:
1287; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
1288; F16C-NEXT:    retq
1289;
1290; AVX512-LABEL: load_cvt_4i16_to_4f32_constrained:
1291; AVX512:       # %bb.0:
1292; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
1293; AVX512-NEXT:    retq
1294  %1 = load <4 x i16>, ptr %a0
1295  %2 = bitcast <4 x i16> %1 to <4 x half>
1296  %3 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %2, metadata !"fpexcept.strict") strictfp
1297  ret <4 x float> %3
1298}
1299
1300define <4 x float> @load_cvt_8i16_to_4f32_constrained(ptr %a0) nounwind strictfp {
1301; AVX-LABEL: load_cvt_8i16_to_4f32_constrained:
1302; AVX:       # %bb.0:
1303; AVX-NEXT:    subq $72, %rsp
1304; AVX-NEXT:    movq (%rdi), %rax
1305; AVX-NEXT:    movq %rax, %rcx
1306; AVX-NEXT:    movq %rax, %rdx
1307; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1308; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
1309; AVX-NEXT:    shrl $16, %eax
1310; AVX-NEXT:    shrq $32, %rcx
1311; AVX-NEXT:    shrq $48, %rdx
1312; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
1313; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1314; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm1
1315; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1316; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
1317; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
1318; AVX-NEXT:    callq __extendhfsf2@PLT
1319; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1320; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1321; AVX-NEXT:    callq __extendhfsf2@PLT
1322; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1323; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
1324; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1325; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1326; AVX-NEXT:    callq __extendhfsf2@PLT
1327; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1328; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
1329; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1330; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1331; AVX-NEXT:    callq __extendhfsf2@PLT
1332; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1333; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1334; AVX-NEXT:    addq $72, %rsp
1335; AVX-NEXT:    retq
1336;
1337; F16C-LABEL: load_cvt_8i16_to_4f32_constrained:
1338; F16C:       # %bb.0:
1339; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
1340; F16C-NEXT:    retq
1341;
1342; AVX512-LABEL: load_cvt_8i16_to_4f32_constrained:
1343; AVX512:       # %bb.0:
1344; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
1345; AVX512-NEXT:    retq
1346  %1 = load <8 x i16>, ptr %a0
1347  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1348  %3 = bitcast <4 x i16> %2 to <4 x half>
1349  %4 = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %3, metadata !"fpexcept.strict") strictfp
1350  ret <4 x float> %4
1351}
1352
1353;
1354; Half to Double
1355;
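; NOTE: without F16C/AVX-512 each element goes through an __extendhfsf2
; libcall followed by vcvtss2sd; with F16C/AVX-512 the vectors are widened
; with vcvtph2ps and then vcvtps2pd instead.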
1356
1357define double @cvt_i16_to_f64(i16 %a0) nounwind {
1358; AVX-LABEL: cvt_i16_to_f64:
1359; AVX:       # %bb.0:
1360; AVX-NEXT:    pushq %rax
1361; AVX-NEXT:    vpinsrw $0, %edi, %xmm0, %xmm0
1362; AVX-NEXT:    callq __extendhfsf2@PLT
1363; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1364; AVX-NEXT:    popq %rax
1365; AVX-NEXT:    retq
1366;
1367; F16C-LABEL: cvt_i16_to_f64:
1368; F16C:       # %bb.0:
1369; F16C-NEXT:    vmovd %edi, %xmm0
1370; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1371; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1372; F16C-NEXT:    retq
1373;
1374; AVX512-LABEL: cvt_i16_to_f64:
1375; AVX512:       # %bb.0:
1376; AVX512-NEXT:    vmovd %edi, %xmm0
1377; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1378; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1379; AVX512-NEXT:    retq
1380  %1 = bitcast i16 %a0 to half
1381  %2 = fpext half %1 to double
1382  ret double %2
1383}
1384
1385define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
1386; AVX-LABEL: cvt_2i16_to_2f64:
1387; AVX:       # %bb.0:
1388; AVX-NEXT:    subq $40, %rsp
1389; AVX-NEXT:    vmovd %xmm0, %eax
1390; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1391; AVX-NEXT:    shrl $16, %eax
1392; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
1393; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1394; AVX-NEXT:    callq __extendhfsf2@PLT
1395; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1396; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1397; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1398; AVX-NEXT:    callq __extendhfsf2@PLT
1399; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1400; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1401; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1402; AVX-NEXT:    addq $40, %rsp
1403; AVX-NEXT:    retq
1404;
1405; F16C-LABEL: cvt_2i16_to_2f64:
1406; F16C:       # %bb.0:
1407; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1408; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
1409; F16C-NEXT:    retq
1410;
1411; AVX512-LABEL: cvt_2i16_to_2f64:
1412; AVX512:       # %bb.0:
1413; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1414; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
1415; AVX512-NEXT:    retq
1416  %1 = bitcast <2 x i16> %a0 to <2 x half>
1417  %2 = fpext <2 x half> %1 to <2 x double>
1418  ret <2 x double> %2
1419}
1420
1421define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
1422; AVX-LABEL: cvt_4i16_to_4f64:
1423; AVX:       # %bb.0:
1424; AVX-NEXT:    subq $72, %rsp
1425; AVX-NEXT:    vmovq %xmm0, %rax
1426; AVX-NEXT:    movq %rax, %rcx
1427; AVX-NEXT:    movl %eax, %edx
1428; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1429; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1430; AVX-NEXT:    shrq $48, %rax
1431; AVX-NEXT:    shrq $32, %rcx
1432; AVX-NEXT:    shrl $16, %edx
1433; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
1434; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1435; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
1436; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1437; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1438; AVX-NEXT:    callq __extendhfsf2@PLT
1439; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1440; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1441; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1442; AVX-NEXT:    callq __extendhfsf2@PLT
1443; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1444; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1445; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1446; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1447; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1448; AVX-NEXT:    callq __extendhfsf2@PLT
1449; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1450; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1451; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1452; AVX-NEXT:    callq __extendhfsf2@PLT
1453; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1454; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1455; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1456; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1457; AVX-NEXT:    addq $72, %rsp
1458; AVX-NEXT:    retq
1459;
1460; F16C-LABEL: cvt_4i16_to_4f64:
1461; F16C:       # %bb.0:
1462; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1463; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
1464; F16C-NEXT:    retq
1465;
1466; AVX512-LABEL: cvt_4i16_to_4f64:
1467; AVX512:       # %bb.0:
1468; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1469; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
1470; AVX512-NEXT:    retq
1471  %1 = bitcast <4 x i16> %a0 to <4 x half>
1472  %2 = fpext <4 x half> %1 to <4 x double>
1473  ret <4 x double> %2
1474}
1475
1476define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
1477; AVX-LABEL: cvt_8i16_to_2f64:
1478; AVX:       # %bb.0:
1479; AVX-NEXT:    subq $40, %rsp
1480; AVX-NEXT:    vmovd %xmm0, %eax
1481; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1482; AVX-NEXT:    shrl $16, %eax
1483; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
1484; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1485; AVX-NEXT:    callq __extendhfsf2@PLT
1486; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1487; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1488; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1489; AVX-NEXT:    callq __extendhfsf2@PLT
1490; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1491; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1492; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1493; AVX-NEXT:    addq $40, %rsp
1494; AVX-NEXT:    retq
1495;
1496; F16C-LABEL: cvt_8i16_to_2f64:
1497; F16C:       # %bb.0:
1498; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1499; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
1500; F16C-NEXT:    retq
1501;
1502; AVX512-LABEL: cvt_8i16_to_2f64:
1503; AVX512:       # %bb.0:
1504; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1505; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
1506; AVX512-NEXT:    retq
1507  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1508  %2 = bitcast <2 x i16> %1 to <2 x half>
1509  %3 = fpext <2 x half> %2 to <2 x double>
1510  ret <2 x double> %3
1511}
1512
1513define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
1514; AVX-LABEL: cvt_8i16_to_4f64:
1515; AVX:       # %bb.0:
1516; AVX-NEXT:    subq $72, %rsp
1517; AVX-NEXT:    vmovq %xmm0, %rax
1518; AVX-NEXT:    movq %rax, %rcx
1519; AVX-NEXT:    movl %eax, %edx
1520; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1521; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1522; AVX-NEXT:    shrq $48, %rax
1523; AVX-NEXT:    shrq $32, %rcx
1524; AVX-NEXT:    shrl $16, %edx
1525; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
1526; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1527; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
1528; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1529; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1530; AVX-NEXT:    callq __extendhfsf2@PLT
1531; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1532; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1533; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1534; AVX-NEXT:    callq __extendhfsf2@PLT
1535; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1536; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1537; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1538; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1539; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1540; AVX-NEXT:    callq __extendhfsf2@PLT
1541; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1542; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1543; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1544; AVX-NEXT:    callq __extendhfsf2@PLT
1545; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1546; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1547; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1548; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1549; AVX-NEXT:    addq $72, %rsp
1550; AVX-NEXT:    retq
1551;
1552; F16C-LABEL: cvt_8i16_to_4f64:
1553; F16C:       # %bb.0:
1554; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1555; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
1556; F16C-NEXT:    retq
1557;
1558; AVX512-LABEL: cvt_8i16_to_4f64:
1559; AVX512:       # %bb.0:
1560; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1561; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
1562; AVX512-NEXT:    retq
1563  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1564  %2 = bitcast <4 x i16> %1 to <4 x half>
1565  %3 = fpext <4 x half> %2 to <4 x double>
1566  ret <4 x double> %3
1567}
1568
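; NOTE: an 8 x double result no longer fits in one ymm register, so the F16C
; checks below return it in ymm0/ymm1 while AVX-512 uses a single zmm.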
1569define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
1570; AVX-LABEL: cvt_8i16_to_8f64:
1571; AVX:       # %bb.0:
1572; AVX-NEXT:    subq $88, %rsp
1573; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1574; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
1575; AVX-NEXT:    callq __extendhfsf2@PLT
1576; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1577; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1578; AVX-NEXT:    vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload
1579; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
1580; AVX-NEXT:    callq __extendhfsf2@PLT
1581; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1582; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1583; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1584; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1585; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1586; AVX-NEXT:    callq __extendhfsf2@PLT
1587; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1588; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1589; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1590; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
1591; AVX-NEXT:    callq __extendhfsf2@PLT
1592; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1593; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1594; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1595; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1596; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1597; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1598; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1599; AVX-NEXT:    vzeroupper
1600; AVX-NEXT:    callq __extendhfsf2@PLT
1601; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1602; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1603; AVX-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
1604; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
1605; AVX-NEXT:    callq __extendhfsf2@PLT
1606; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1607; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1608; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1609; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1610; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1611; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1612; AVX-NEXT:    callq __extendhfsf2@PLT
1613; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1614; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1615; AVX-NEXT:    vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload
1616; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
1617; AVX-NEXT:    callq __extendhfsf2@PLT
1618; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1619; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1620; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1621; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
1622; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1623; AVX-NEXT:    addq $88, %rsp
1624; AVX-NEXT:    retq
1625;
1626; F16C-LABEL: cvt_8i16_to_8f64:
1627; F16C:       # %bb.0:
1628; F16C-NEXT:    vcvtph2ps %xmm0, %ymm1
1629; F16C-NEXT:    vcvtps2pd %xmm1, %ymm0
1630; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm1
1631; F16C-NEXT:    vcvtps2pd %xmm1, %ymm1
1632; F16C-NEXT:    retq
1633;
1634; AVX512-LABEL: cvt_8i16_to_8f64:
1635; AVX512:       # %bb.0:
1636; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
1637; AVX512-NEXT:    vcvtps2pd %ymm0, %zmm0
1638; AVX512-NEXT:    retq
1639  %1 = bitcast <8 x i16> %a0 to <8 x half>
1640  %2 = fpext <8 x half> %1 to <8 x double>
1641  ret <8 x double> %2
1642}
1643
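; NOTE: for the strict variant the F16C/AVX-512 code first spreads the two
; half values out with vpmovzxdq, so the remaining lanes fed to vcvtph2ps are
; zero and cannot raise spurious FP exceptions.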
1644define <2 x double> @cvt_2i16_to_2f64_constrained(<2 x i16> %a0) nounwind strictfp {
1645; AVX-LABEL: cvt_2i16_to_2f64_constrained:
1646; AVX:       # %bb.0:
1647; AVX-NEXT:    subq $40, %rsp
1648; AVX-NEXT:    vmovd %xmm0, %eax
1649; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1650; AVX-NEXT:    shrl $16, %eax
1651; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
1652; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1653; AVX-NEXT:    callq __extendhfsf2@PLT
1654; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1655; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1656; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1657; AVX-NEXT:    callq __extendhfsf2@PLT
1658; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1659; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
1660; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1661; AVX-NEXT:    addq $40, %rsp
1662; AVX-NEXT:    retq
1663;
1664; F16C-LABEL: cvt_2i16_to_2f64_constrained:
1665; F16C:       # %bb.0:
1666; F16C-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1667; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1668; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
1669; F16C-NEXT:    retq
1670;
1671; AVX512-LABEL: cvt_2i16_to_2f64_constrained:
1672; AVX512:       # %bb.0:
1673; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1674; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1675; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
1676; AVX512-NEXT:    retq
1677  %1 = bitcast <2 x i16> %a0 to <2 x half>
1678  %2 = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half> %1, metadata !"fpexcept.strict") strictfp
1679  ret <2 x double> %2
1680}
1681declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, metadata) strictfp
1682
1683define <4 x double> @cvt_4i16_to_4f64_constrained(<4 x i16> %a0) nounwind strictfp {
1684; AVX-LABEL: cvt_4i16_to_4f64_constrained:
1685; AVX:       # %bb.0:
1686; AVX-NEXT:    subq $72, %rsp
1687; AVX-NEXT:    vmovq %xmm0, %rax
1688; AVX-NEXT:    movq %rax, %rcx
1689; AVX-NEXT:    movl %eax, %edx
1690; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1691; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1692; AVX-NEXT:    shrq $48, %rax
1693; AVX-NEXT:    shrq $32, %rcx
1694; AVX-NEXT:    shrl $16, %edx
1695; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
1696; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1697; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
1698; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1699; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1700; AVX-NEXT:    callq __extendhfsf2@PLT
1701; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1702; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1703; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1704; AVX-NEXT:    callq __extendhfsf2@PLT
1705; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1706; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1707; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1708; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1709; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1710; AVX-NEXT:    callq __extendhfsf2@PLT
1711; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1712; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1713; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1714; AVX-NEXT:    callq __extendhfsf2@PLT
1715; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1716; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1717; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1718; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1719; AVX-NEXT:    addq $72, %rsp
1720; AVX-NEXT:    retq
1721;
1722; F16C-LABEL: cvt_4i16_to_4f64_constrained:
1723; F16C:       # %bb.0:
1724; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1725; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
1726; F16C-NEXT:    retq
1727;
1728; AVX512-LABEL: cvt_4i16_to_4f64_constrained:
1729; AVX512:       # %bb.0:
1730; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1731; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
1732; AVX512-NEXT:    retq
1733  %1 = bitcast <4 x i16> %a0 to <4 x half>
1734  %2 = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half> %1, metadata !"fpexcept.strict") strictfp
1735  ret <4 x double> %2
1736}
1737declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) strictfp
1738
1739define <8 x double> @cvt_8i16_to_8f64_constrained(<8 x i16> %a0) nounwind strictfp {
1740; AVX-LABEL: cvt_8i16_to_8f64_constrained:
1741; AVX:       # %bb.0:
1742; AVX-NEXT:    subq $88, %rsp
1743; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1744; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
1745; AVX-NEXT:    callq __extendhfsf2@PLT
1746; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1747; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1748; AVX-NEXT:    vpermilps $245, (%rsp), %xmm0 # 16-byte Folded Reload
1749; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
1750; AVX-NEXT:    callq __extendhfsf2@PLT
1751; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1752; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1753; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1754; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1755; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1756; AVX-NEXT:    callq __extendhfsf2@PLT
1757; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1758; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1759; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1760; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
1761; AVX-NEXT:    callq __extendhfsf2@PLT
1762; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1763; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1764; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1765; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1766; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1767; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1768; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1769; AVX-NEXT:    vzeroupper
1770; AVX-NEXT:    callq __extendhfsf2@PLT
1771; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1772; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1773; AVX-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
1774; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
1775; AVX-NEXT:    callq __extendhfsf2@PLT
1776; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1777; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1778; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1779; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1780; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1781; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1782; AVX-NEXT:    callq __extendhfsf2@PLT
1783; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1784; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1785; AVX-NEXT:    vpermilps $78, (%rsp), %xmm0 # 16-byte Folded Reload
1786; AVX-NEXT:    # xmm0 = mem[2,3,0,1]
1787; AVX-NEXT:    callq __extendhfsf2@PLT
1788; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1789; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1790; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1791; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
1792; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1793; AVX-NEXT:    addq $88, %rsp
1794; AVX-NEXT:    retq
1795;
1796; F16C-LABEL: cvt_8i16_to_8f64_constrained:
1797; F16C:       # %bb.0:
1798; F16C-NEXT:    vcvtph2ps %xmm0, %ymm0
1799; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm1
1800; F16C-NEXT:    vcvtps2pd %xmm1, %ymm1
1801; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
1802; F16C-NEXT:    retq
1803;
1804; AVX512-LABEL: cvt_8i16_to_8f64_constrained:
1805; AVX512:       # %bb.0:
1806; AVX512-NEXT:    vcvtph2ps %xmm0, %ymm0
1807; AVX512-NEXT:    vcvtps2pd %ymm0, %zmm0
1808; AVX512-NEXT:    retq
1809  %1 = bitcast <8 x i16> %a0 to <8 x half>
1810  %2 = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half> %1, metadata !"fpexcept.strict") strictfp
1811  ret <8 x double> %2
1812}
1813declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata) strictfp
1814
1815;
1816; Half to Double (Load)
1817;
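; NOTE: when the source comes from memory, F16C/AVX-512 can feed vcvtph2ps
; directly from the load; plain AVX instead re-reads each element with
; vpinsrw before calling __extendhfsf2.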
1818
1819define double @load_cvt_i16_to_f64(ptr %a0) nounwind {
1820; AVX-LABEL: load_cvt_i16_to_f64:
1821; AVX:       # %bb.0:
1822; AVX-NEXT:    pushq %rax
1823; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
1824; AVX-NEXT:    callq __extendhfsf2@PLT
1825; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1826; AVX-NEXT:    popq %rax
1827; AVX-NEXT:    retq
1828;
1829; F16C-LABEL: load_cvt_i16_to_f64:
1830; F16C:       # %bb.0:
1831; F16C-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
1832; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1833; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1834; F16C-NEXT:    retq
1835;
1836; AVX512-LABEL: load_cvt_i16_to_f64:
1837; AVX512:       # %bb.0:
1838; AVX512-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
1839; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1840; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1841; AVX512-NEXT:    retq
1842  %1 = load i16, ptr %a0
1843  %2 = bitcast i16 %1 to half
1844  %3 = fpext half %2 to double
1845  ret double %3
1846}
1847
1848define <2 x double> @load_cvt_2i16_to_2f64(ptr %a0) nounwind {
1849; AVX-LABEL: load_cvt_2i16_to_2f64:
1850; AVX:       # %bb.0:
1851; AVX-NEXT:    subq $40, %rsp
1852; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
1853; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1854; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
1855; AVX-NEXT:    callq __extendhfsf2@PLT
1856; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1857; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1858; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1859; AVX-NEXT:    callq __extendhfsf2@PLT
1860; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1861; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1862; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1863; AVX-NEXT:    addq $40, %rsp
1864; AVX-NEXT:    retq
1865;
1866; F16C-LABEL: load_cvt_2i16_to_2f64:
1867; F16C:       # %bb.0:
1868; F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1869; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
1870; F16C-NEXT:    vcvtps2pd %xmm0, %xmm0
1871; F16C-NEXT:    retq
1872;
1873; AVX512-LABEL: load_cvt_2i16_to_2f64:
1874; AVX512:       # %bb.0:
1875; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1876; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1877; AVX512-NEXT:    vcvtps2pd %xmm0, %xmm0
1878; AVX512-NEXT:    retq
1879  %1 = load <2 x i16>, ptr %a0
1880  %2 = bitcast <2 x i16> %1 to <2 x half>
1881  %3 = fpext <2 x half> %2 to <2 x double>
1882  ret <2 x double> %3
1883}
1884
1885define <4 x double> @load_cvt_4i16_to_4f64(ptr %a0) nounwind {
1886; AVX-LABEL: load_cvt_4i16_to_4f64:
1887; AVX:       # %bb.0:
1888; AVX-NEXT:    subq $72, %rsp
1889; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
1890; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1891; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
1892; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1893; AVX-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
1894; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1895; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
1896; AVX-NEXT:    callq __extendhfsf2@PLT
1897; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1898; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1899; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1900; AVX-NEXT:    callq __extendhfsf2@PLT
1901; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1902; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1903; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1904; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1905; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1906; AVX-NEXT:    callq __extendhfsf2@PLT
1907; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1908; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1909; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1910; AVX-NEXT:    callq __extendhfsf2@PLT
1911; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1912; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1913; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1914; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1915; AVX-NEXT:    addq $72, %rsp
1916; AVX-NEXT:    retq
1917;
1918; F16C-LABEL: load_cvt_4i16_to_4f64:
1919; F16C:       # %bb.0:
1920; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
1921; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
1922; F16C-NEXT:    retq
1923;
1924; AVX512-LABEL: load_cvt_4i16_to_4f64:
1925; AVX512:       # %bb.0:
1926; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
1927; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
1928; AVX512-NEXT:    retq
1929  %1 = load <4 x i16>, ptr %a0
1930  %2 = bitcast <4 x i16> %1 to <4 x half>
1931  %3 = fpext <4 x half> %2 to <4 x double>
1932  ret <4 x double> %3
1933}
1934
1935define <4 x double> @load_cvt_8i16_to_4f64(ptr %a0) nounwind {
1936; AVX-LABEL: load_cvt_8i16_to_4f64:
1937; AVX:       # %bb.0:
1938; AVX-NEXT:    subq $72, %rsp
1939; AVX-NEXT:    movq (%rdi), %rax
1940; AVX-NEXT:    movq %rax, %rcx
1941; AVX-NEXT:    movl %eax, %edx
1942; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1943; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1944; AVX-NEXT:    shrq $48, %rax
1945; AVX-NEXT:    shrq $32, %rcx
1946; AVX-NEXT:    shrl $16, %edx
1947; AVX-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
1948; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1949; AVX-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0
1950; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
1951; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1952; AVX-NEXT:    callq __extendhfsf2@PLT
1953; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1954; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1955; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1956; AVX-NEXT:    callq __extendhfsf2@PLT
1957; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1958; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1959; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
1960; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1961; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1962; AVX-NEXT:    callq __extendhfsf2@PLT
1963; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1964; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1965; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1966; AVX-NEXT:    callq __extendhfsf2@PLT
1967; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1968; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1969; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1970; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
1971; AVX-NEXT:    addq $72, %rsp
1972; AVX-NEXT:    retq
1973;
1974; F16C-LABEL: load_cvt_8i16_to_4f64:
1975; F16C:       # %bb.0:
1976; F16C-NEXT:    vcvtph2ps (%rdi), %xmm0
1977; F16C-NEXT:    vcvtps2pd %xmm0, %ymm0
1978; F16C-NEXT:    retq
1979;
1980; AVX512-LABEL: load_cvt_8i16_to_4f64:
1981; AVX512:       # %bb.0:
1982; AVX512-NEXT:    vcvtph2ps (%rdi), %xmm0
1983; AVX512-NEXT:    vcvtps2pd %xmm0, %ymm0
1984; AVX512-NEXT:    retq
1985  %1 = load <8 x i16>, ptr %a0
1986  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1987  %3 = bitcast <4 x i16> %2 to <4 x half>
1988  %4 = fpext <4 x half> %3 to <4 x double>
1989  ret <4 x double> %4
1990}
1991
1992define <8 x double> @load_cvt_8i16_to_8f64(ptr %a0) nounwind {
1993; AVX1-LABEL: load_cvt_8i16_to_8f64:
1994; AVX1:       # %bb.0:
1995; AVX1-NEXT:    pushq %rbx
1996; AVX1-NEXT:    subq $80, %rsp
1997; AVX1-NEXT:    movq %rdi, %rbx
1998; AVX1-NEXT:    vmovaps (%rdi), %xmm0
1999; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2000; AVX1-NEXT:    vbroadcastss 4(%rdi), %xmm0
2001; AVX1-NEXT:    callq __extendhfsf2@PLT
2002; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2003; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2004; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2005; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
2006; AVX1-NEXT:    callq __extendhfsf2@PLT
2007; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2008; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2009; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2010; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2011; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2012; AVX1-NEXT:    callq __extendhfsf2@PLT
2013; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2014; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2015; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2016; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
2017; AVX1-NEXT:    callq __extendhfsf2@PLT
2018; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2019; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
2020; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2021; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
2022; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2023; AVX1-NEXT:    vbroadcastss 12(%rbx), %xmm0
2024; AVX1-NEXT:    vzeroupper
2025; AVX1-NEXT:    callq __extendhfsf2@PLT
2026; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2027; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2028; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2029; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2030; AVX1-NEXT:    callq __extendhfsf2@PLT
2031; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2032; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
2033; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2034; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2035; AVX1-NEXT:    vbroadcastss 8(%rbx), %xmm0
2036; AVX1-NEXT:    callq __extendhfsf2@PLT
2037; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2038; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2039; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2040; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2041; AVX1-NEXT:    callq __extendhfsf2@PLT
2042; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2043; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2044; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2045; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
2046; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2047; AVX1-NEXT:    addq $80, %rsp
2048; AVX1-NEXT:    popq %rbx
2049; AVX1-NEXT:    retq
2050;
2051; AVX2-LABEL: load_cvt_8i16_to_8f64:
2052; AVX2:       # %bb.0:
2053; AVX2-NEXT:    pushq %rbx
2054; AVX2-NEXT:    subq $80, %rsp
2055; AVX2-NEXT:    movq %rdi, %rbx
2056; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
2057; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2058; AVX2-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
2059; AVX2-NEXT:    callq __extendhfsf2@PLT
2060; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2061; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2062; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2063; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
2064; AVX2-NEXT:    callq __extendhfsf2@PLT
2065; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2066; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2067; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2068; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2069; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2070; AVX2-NEXT:    callq __extendhfsf2@PLT
2071; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2072; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2073; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2074; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
2075; AVX2-NEXT:    callq __extendhfsf2@PLT
2076; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2077; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
2078; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2079; AVX2-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
2080; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2081; AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
2082; AVX2-NEXT:    vzeroupper
2083; AVX2-NEXT:    callq __extendhfsf2@PLT
2084; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2085; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2086; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2087; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2088; AVX2-NEXT:    callq __extendhfsf2@PLT
2089; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2090; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
2091; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2092; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2093; AVX2-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
2094; AVX2-NEXT:    callq __extendhfsf2@PLT
2095; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2096; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2097; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2098; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2099; AVX2-NEXT:    callq __extendhfsf2@PLT
2100; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2101; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2102; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2103; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload
2104; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2105; AVX2-NEXT:    addq $80, %rsp
2106; AVX2-NEXT:    popq %rbx
2107; AVX2-NEXT:    retq
2108;
2109; F16C-LABEL: load_cvt_8i16_to_8f64:
2110; F16C:       # %bb.0:
2111; F16C-NEXT:    vcvtph2ps (%rdi), %ymm1
2112; F16C-NEXT:    vcvtps2pd %xmm1, %ymm0
2113; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm1
2114; F16C-NEXT:    vcvtps2pd %xmm1, %ymm1
2115; F16C-NEXT:    retq
2116;
2117; AVX512-LABEL: load_cvt_8i16_to_8f64:
2118; AVX512:       # %bb.0:
2119; AVX512-NEXT:    vcvtph2ps (%rdi), %ymm0
2120; AVX512-NEXT:    vcvtps2pd %ymm0, %zmm0
2121; AVX512-NEXT:    retq
2122  %1 = load <8 x i16>, ptr %a0
2123  %2 = bitcast <8 x i16> %1 to <8 x half>
2124  %3 = fpext <8 x half> %2 to <8 x double>
2125  ret <8 x double> %3
2126}
2127
2128;
2129; Float to Half
2130;
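; NOTE: float-to-half truncation lowers to one __truncsfhf2 libcall per
; element without F16C; with F16C/AVX-512 a single vcvtps2ph handles the
; whole vector ($4 selects the MXCSR-controlled rounding mode).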
2131
2132define i16 @cvt_f32_to_i16(float %a0) nounwind {
2133; AVX-LABEL: cvt_f32_to_i16:
2134; AVX:       # %bb.0:
2135; AVX-NEXT:    pushq %rax
2136; AVX-NEXT:    callq __truncsfhf2@PLT
2137; AVX-NEXT:    vpextrw $0, %xmm0, %eax
2138; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
2139; AVX-NEXT:    popq %rcx
2140; AVX-NEXT:    retq
2141;
2142; F16C-LABEL: cvt_f32_to_i16:
2143; F16C:       # %bb.0:
2144; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2145; F16C-NEXT:    vmovd %xmm0, %eax
2146; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
2147; F16C-NEXT:    retq
2148;
2149; AVX512-LABEL: cvt_f32_to_i16:
2150; AVX512:       # %bb.0:
2151; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2152; AVX512-NEXT:    vmovd %xmm0, %eax
2153; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
2154; AVX512-NEXT:    retq
2155  %1 = fptrunc float %a0 to half
2156  %2 = bitcast half %1 to i16
2157  ret i16 %2
2158}
2159
2160define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
2161; AVX-LABEL: cvt_4f32_to_4i16:
2162; AVX:       # %bb.0:
2163; AVX-NEXT:    subq $72, %rsp
2164; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2165; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
2166; AVX-NEXT:    callq __truncsfhf2@PLT
2167; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2168; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
2169; AVX-NEXT:    callq __truncsfhf2@PLT
2170; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2171; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
2172; AVX-NEXT:    # xmm0 = mem[1,0]
2173; AVX-NEXT:    callq __truncsfhf2@PLT
2174; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2175; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
2176; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
2177; AVX-NEXT:    callq __truncsfhf2@PLT
2178; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2179; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2180; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2181; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2182; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
2183; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2184; AVX-NEXT:    addq $72, %rsp
2185; AVX-NEXT:    retq
2186;
2187; F16C-LABEL: cvt_4f32_to_4i16:
2188; F16C:       # %bb.0:
2189; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2190; F16C-NEXT:    retq
2191;
2192; AVX512-LABEL: cvt_4f32_to_4i16:
2193; AVX512:       # %bb.0:
2194; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2195; AVX512-NEXT:    retq
2196  %1 = fptrunc <4 x float> %a0 to <4 x half>
2197  %2 = bitcast <4 x half> %1 to <4 x i16>
2198  ret <4 x i16> %2
2199}
2200
define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; AVX-LABEL: cvt_4f32_to_8i16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4f32_to_8i16_undef:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4f32_to_8i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

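; Zero-widening should also be free here: the closing vinsertps already writes
; zeros into the upper two dwords, and the VEX-encoded vcvtps2ph clears the
; upper bits of its xmm destination.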
define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
; AVX-LABEL: cvt_4f32_to_8i16_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_4f32_to_8i16_zero:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4f32_to_8i16_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %3
}

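; For a 256-bit source the pre-F16C path first splits the vector with
; vextractf128 and needs vzeroupper before each libcall, while F16C/AVX512
; convert the whole ymm with a single vcvtps2ph.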
define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
; AVX-LABEL: cvt_8f32_to_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $88, %rsp
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    addq $88, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_8f32_to_8i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_8f32_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x float> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  ret <8 x i16> %2
}

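; <16 x float>: F16C converts each 256-bit half and recombines them with
; vinsertf128; AVX512 converts the full zmm in one instruction. The $4
; immediate on vcvtps2ph selects rounding from MXCSR.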
define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX1-LABEL: cvt_16f32_to_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $120, %rsp
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    addq $120, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16f32_to_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $184, %rsp
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT:    addq $184, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_16f32_to_16i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
; F16C-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
; F16C-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_16f32_to_16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  ret <16 x i16> %2
}

;
; Float to Half (Store)
;

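; The store variants check that the truncated result is written straight to
; memory: scalar values go out through vpextrw, and the F16C/AVX512 forms can
; fold the store into vcvtps2ph's memory-destination encoding.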
define void @store_cvt_f32_to_i16(float %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_f32_to_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_f32_to_i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_f32_to_i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc float %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, ptr %a1
  ret void
}

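; With F16C the conversion and the store collapse into the single
; memory-destination form: vcvtps2ph $4, %xmm0, (%rdi).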
define void @store_cvt_4f32_to_4i16(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, 6(%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, 4(%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
; AVX-NEXT:    addq $64, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f32_to_4i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f32_to_4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  store <4 x i16> %2, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT:    vmovaps %xmm0, (%rbx)
; AVX-NEXT:    addq $64, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_undef:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX-NEXT:    vmovaps %xmm0, (%rbx)
; AVX-NEXT:    addq $64, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_4f32_to_8i16_zero:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rdi)
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, ptr %a1
  ret void
}

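; Same splitting pattern as cvt_8f32_to_8i16, but the repacked xmm result is
; written out with one vmovdqa at the end.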
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_8f32_to_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $80, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rbx)
; AVX-NEXT:    addq $80, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_8f32_to_8i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_8f32_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x float> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, ptr %a1
  ret void
}

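; AVX512 reduces this to a single vcvtps2ph from zmm straight to memory,
; versus sixteen __truncsfhf2 libcalls in the pre-F16C expansion.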
define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $112, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovaps %ymm0, (%rbx)
; AVX1-NEXT:    addq $112, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $176, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT:    vmovdqa %ymm0, (%rbx)
; AVX2-NEXT:    addq $176, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; F16C-LABEL: store_cvt_16f32_to_16i16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_16f32_to_16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  store <16 x i16> %2, ptr %a1
  ret void
}

;
; Double to Half
;

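; Neither F16C nor AVX512F provides a double -> half instruction (vcvtps2ph
; only converts from single precision), so every configuration falls back to
; the __truncdfhf2 libcall.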
define i16 @cvt_f64_to_i16(double %a0) nounwind {
; AVX-LABEL: cvt_f64_to_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rax
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    popq %rcx
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_f64_to_i16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rax
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vpextrw $0, %xmm0, %eax
; F16C-NEXT:    # kill: def $ax killed $ax killed $eax
; F16C-NEXT:    popq %rcx
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_f64_to_i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rax
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    popq %rcx
; AVX512-NEXT:    retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  ret i16 %2
}

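; <2 x double>: one __truncdfhf2 call per element; note the AVX512F variant
; repacks the pair through vpermt2ps with a small constant index vector,
; while AVX512-FASTLANE stays in 128-bit registers.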
define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; AVX-LABEL: cvt_2f64_to_2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: cvt_2f64_to_2i16:
; F16C:       # %bb.0:
; F16C-NEXT:    subq $40, %rsp
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    addq $40, %rsp
; F16C-NEXT:    retq
;
; AVX512F-LABEL: cvt_2f64_to_2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    subq $104, %rsp
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT:    callq __truncdfhf2@PLT
; AVX512F-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    callq __truncdfhf2@PLT
; AVX512F-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512F-NEXT:    # xmm0 = mem[1,0]
; AVX512F-NEXT:    callq __truncdfhf2@PLT
; AVX512F-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [16,0]
; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512F-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    addq $104, %rsp
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512-FASTLANE-LABEL: cvt_2f64_to_2i16:
; AVX512-FASTLANE:       # %bb.0:
; AVX512-FASTLANE-NEXT:    subq $40, %rsp
; AVX512-FASTLANE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FASTLANE-NEXT:    callq __truncdfhf2@PLT
; AVX512-FASTLANE-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-FASTLANE-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-FASTLANE-NEXT:    # xmm0 = mem[1,0]
; AVX512-FASTLANE-NEXT:    callq __truncdfhf2@PLT
; AVX512-FASTLANE-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-FASTLANE-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FASTLANE-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-FASTLANE-NEXT:    callq __truncdfhf2@PLT
; AVX512-FASTLANE-NEXT:    vpbroadcastw %xmm0, %xmm1
; AVX512-FASTLANE-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [4,0]
; AVX512-FASTLANE-NEXT:    vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-FASTLANE-NEXT:    addq $40, %rsp
; AVX512-FASTLANE-NEXT:    retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  ret <2 x i16> %2
}

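; <4 x double>: four libcalls with vextractf128/vzeroupper around the 256-bit
; halves; the prefixes differ mainly in how the four half results are packed
; back together.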
define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $88, %rsp
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    addq $88, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_4f64_to_4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $88, %rsp
; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX2-NEXT:    addq $88, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: cvt_4f64_to_4i16:
; F16C:       # %bb.0:
; F16C-NEXT:    subq $72, %rsp
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
; F16C-NEXT:    addq $72, %rsp
; F16C-NEXT:    retq
;
; AVX512-LABEL: cvt_4f64_to_4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $72, %rsp
; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX512-NEXT:    addq $72, %rsp
; AVX512-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  ret <4 x i16> %2
}

define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $88, %rsp
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3348; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3349; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3350; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3351; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
3352; AVX1-NEXT:    addq $88, %rsp
3353; AVX1-NEXT:    retq
3354;
3355; AVX2-LABEL: cvt_4f64_to_8i16_undef:
3356; AVX2:       # %bb.0:
3357; AVX2-NEXT:    subq $88, %rsp
3358; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3359; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3360; AVX2-NEXT:    vzeroupper
3361; AVX2-NEXT:    callq __truncdfhf2@PLT
3362; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3363; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3364; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3365; AVX2-NEXT:    vzeroupper
3366; AVX2-NEXT:    callq __truncdfhf2@PLT
3367; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3368; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3369; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3370; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3371; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3372; AVX2-NEXT:    vzeroupper
3373; AVX2-NEXT:    callq __truncdfhf2@PLT
3374; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3375; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3376; AVX2-NEXT:    callq __truncdfhf2@PLT
3377; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3378; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3379; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3380; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3381; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3382; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
3383; AVX2-NEXT:    addq $88, %rsp
3384; AVX2-NEXT:    retq
3385;
3386; F16C-LABEL: cvt_4f64_to_8i16_undef:
3387; F16C:       # %bb.0:
3388; F16C-NEXT:    subq $72, %rsp
3389; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3390; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
3391; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3392; F16C-NEXT:    vzeroupper
3393; F16C-NEXT:    callq __truncdfhf2@PLT
3394; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3395; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3396; F16C-NEXT:    # xmm0 = mem[1,0]
3397; F16C-NEXT:    callq __truncdfhf2@PLT
3398; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3399; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3400; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3401; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3402; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3403; F16C-NEXT:    vzeroupper
3404; F16C-NEXT:    callq __truncdfhf2@PLT
3405; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3406; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3407; F16C-NEXT:    # xmm0 = mem[1,0]
3408; F16C-NEXT:    callq __truncdfhf2@PLT
3409; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3410; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3411; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3412; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
3413; F16C-NEXT:    addq $72, %rsp
3414; F16C-NEXT:    retq
3415;
3416; AVX512-LABEL: cvt_4f64_to_8i16_undef:
3417; AVX512:       # %bb.0:
3418; AVX512-NEXT:    subq $72, %rsp
3419; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3420; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3421; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3422; AVX512-NEXT:    vzeroupper
3423; AVX512-NEXT:    callq __truncdfhf2@PLT
3424; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3425; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3426; AVX512-NEXT:    # xmm0 = mem[1,0]
3427; AVX512-NEXT:    callq __truncdfhf2@PLT
3428; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3429; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3430; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3431; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3432; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3433; AVX512-NEXT:    vzeroupper
3434; AVX512-NEXT:    callq __truncdfhf2@PLT
3435; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3436; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3437; AVX512-NEXT:    # xmm0 = mem[1,0]
3438; AVX512-NEXT:    callq __truncdfhf2@PLT
3439; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3440; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3441; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3442; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3443; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3444; AVX512-NEXT:    callq __truncdfhf2@PLT
3445; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
3446; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3447; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
3448; AVX512-NEXT:    addq $72, %rsp
3449; AVX512-NEXT:    retq
3450  %1 = fptrunc <4 x double> %a0 to <4 x half>
3451  %2 = bitcast <4 x half> %1 to <4 x i16>
3452  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3453  ret <8 x i16> %3
3454}
3455
3456define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
3457; AVX1-LABEL: cvt_4f64_to_8i16_zero:
3458; AVX1:       # %bb.0:
3459; AVX1-NEXT:    subq $88, %rsp
3460; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3461; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3462; AVX1-NEXT:    vzeroupper
3463; AVX1-NEXT:    callq __truncdfhf2@PLT
3464; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3465; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3466; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3467; AVX1-NEXT:    vzeroupper
3468; AVX1-NEXT:    callq __truncdfhf2@PLT
3469; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3470; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3471; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3472; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3473; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3474; AVX1-NEXT:    vzeroupper
3475; AVX1-NEXT:    callq __truncdfhf2@PLT
3476; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3477; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3478; AVX1-NEXT:    callq __truncdfhf2@PLT
3479; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3480; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3481; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3482; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3483; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3484; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
3485; AVX1-NEXT:    addq $88, %rsp
3486; AVX1-NEXT:    retq
3487;
3488; AVX2-LABEL: cvt_4f64_to_8i16_zero:
3489; AVX2:       # %bb.0:
3490; AVX2-NEXT:    subq $88, %rsp
3491; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3492; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3493; AVX2-NEXT:    vzeroupper
3494; AVX2-NEXT:    callq __truncdfhf2@PLT
3495; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3496; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3497; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3498; AVX2-NEXT:    vzeroupper
3499; AVX2-NEXT:    callq __truncdfhf2@PLT
3500; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3501; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3502; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3503; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3504; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3505; AVX2-NEXT:    vzeroupper
3506; AVX2-NEXT:    callq __truncdfhf2@PLT
3507; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3508; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3509; AVX2-NEXT:    callq __truncdfhf2@PLT
3510; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3511; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3512; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3513; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3514; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3515; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
3516; AVX2-NEXT:    addq $88, %rsp
3517; AVX2-NEXT:    retq
3518;
3519; F16C-LABEL: cvt_4f64_to_8i16_zero:
3520; F16C:       # %bb.0:
3521; F16C-NEXT:    subq $72, %rsp
3522; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3523; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
3524; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3525; F16C-NEXT:    vzeroupper
3526; F16C-NEXT:    callq __truncdfhf2@PLT
3527; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3528; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3529; F16C-NEXT:    # xmm0 = mem[1,0]
3530; F16C-NEXT:    callq __truncdfhf2@PLT
3531; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3532; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3533; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3534; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3535; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3536; F16C-NEXT:    vzeroupper
3537; F16C-NEXT:    callq __truncdfhf2@PLT
3538; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3539; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3540; F16C-NEXT:    # xmm0 = mem[1,0]
3541; F16C-NEXT:    callq __truncdfhf2@PLT
3542; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3543; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3544; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3545; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
3546; F16C-NEXT:    addq $72, %rsp
3547; F16C-NEXT:    retq
3548;
3549; AVX512-LABEL: cvt_4f64_to_8i16_zero:
3550; AVX512:       # %bb.0:
3551; AVX512-NEXT:    subq $72, %rsp
3552; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3553; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3554; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3555; AVX512-NEXT:    vzeroupper
3556; AVX512-NEXT:    callq __truncdfhf2@PLT
3557; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3558; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3559; AVX512-NEXT:    # xmm0 = mem[1,0]
3560; AVX512-NEXT:    callq __truncdfhf2@PLT
3561; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3562; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3563; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3564; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3565; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3566; AVX512-NEXT:    vzeroupper
3567; AVX512-NEXT:    callq __truncdfhf2@PLT
3568; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3569; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3570; AVX512-NEXT:    # xmm0 = mem[1,0]
3571; AVX512-NEXT:    callq __truncdfhf2@PLT
3572; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3573; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3574; AVX512-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3575; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
3576; AVX512-NEXT:    addq $72, %rsp
3577; AVX512-NEXT:    retq
3578  %1 = fptrunc <4 x double> %a0 to <4 x half>
3579  %2 = bitcast <4 x half> %1 to <4 x i16>
3580  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3581  ret <8 x i16> %3
3582}
3583
3584define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
3585; AVX-LABEL: cvt_8f64_to_8i16:
3586; AVX:       # %bb.0:
3587; AVX-NEXT:    subq $104, %rsp
3588; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3589; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3590; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
3591; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3592; AVX-NEXT:    vzeroupper
3593; AVX-NEXT:    callq __truncdfhf2@PLT
3594; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3595; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3596; AVX-NEXT:    # xmm0 = mem[1,0]
3597; AVX-NEXT:    callq __truncdfhf2@PLT
3598; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3599; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3600; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3601; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3602; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3603; AVX-NEXT:    vzeroupper
3604; AVX-NEXT:    callq __truncdfhf2@PLT
3605; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3606; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3607; AVX-NEXT:    # xmm0 = mem[1,0]
3608; AVX-NEXT:    callq __truncdfhf2@PLT
3609; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3610; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3611; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3612; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3613; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3614; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3615; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
3616; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3617; AVX-NEXT:    vzeroupper
3618; AVX-NEXT:    callq __truncdfhf2@PLT
3619; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3620; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3621; AVX-NEXT:    # xmm0 = mem[1,0]
3622; AVX-NEXT:    callq __truncdfhf2@PLT
3623; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3624; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3625; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3626; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3627; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3628; AVX-NEXT:    vzeroupper
3629; AVX-NEXT:    callq __truncdfhf2@PLT
3630; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3631; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3632; AVX-NEXT:    # xmm0 = mem[1,0]
3633; AVX-NEXT:    callq __truncdfhf2@PLT
3634; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3635; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3636; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3637; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3638; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3639; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
3640; AVX-NEXT:    addq $104, %rsp
3641; AVX-NEXT:    retq
3642;
3643; F16C-LABEL: cvt_8f64_to_8i16:
3644; F16C:       # %bb.0:
3645; F16C-NEXT:    subq $104, %rsp
3646; F16C-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3647; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3648; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm0
3649; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3650; F16C-NEXT:    vzeroupper
3651; F16C-NEXT:    callq __truncdfhf2@PLT
3652; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3653; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3654; F16C-NEXT:    # xmm0 = mem[1,0]
3655; F16C-NEXT:    callq __truncdfhf2@PLT
3656; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3657; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3658; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3659; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3660; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3661; F16C-NEXT:    vzeroupper
3662; F16C-NEXT:    callq __truncdfhf2@PLT
3663; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3664; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3665; F16C-NEXT:    # xmm0 = mem[1,0]
3666; F16C-NEXT:    callq __truncdfhf2@PLT
3667; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3668; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3669; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3670; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3671; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3672; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3673; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
3674; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3675; F16C-NEXT:    vzeroupper
3676; F16C-NEXT:    callq __truncdfhf2@PLT
3677; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3678; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3679; F16C-NEXT:    # xmm0 = mem[1,0]
3680; F16C-NEXT:    callq __truncdfhf2@PLT
3681; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3682; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3683; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3684; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3685; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3686; F16C-NEXT:    vzeroupper
3687; F16C-NEXT:    callq __truncdfhf2@PLT
3688; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3689; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3690; F16C-NEXT:    # xmm0 = mem[1,0]
3691; F16C-NEXT:    callq __truncdfhf2@PLT
3692; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3693; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3694; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3695; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3696; F16C-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3697; F16C-NEXT:    # xmm0 = xmm0[0],mem[0]
3698; F16C-NEXT:    addq $104, %rsp
3699; F16C-NEXT:    retq
3700;
3701; AVX512-LABEL: cvt_8f64_to_8i16:
3702; AVX512:       # %bb.0:
3703; AVX512-NEXT:    subq $120, %rsp
3704; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3705; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
3706; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3707; AVX512-NEXT:    vzeroupper
3708; AVX512-NEXT:    callq __truncdfhf2@PLT
3709; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3710; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3711; AVX512-NEXT:    # xmm0 = mem[1,0]
3712; AVX512-NEXT:    callq __truncdfhf2@PLT
3713; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3714; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3715; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3716; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
3717; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
3718; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3719; AVX512-NEXT:    vzeroupper
3720; AVX512-NEXT:    callq __truncdfhf2@PLT
3721; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3722; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3723; AVX512-NEXT:    # xmm0 = mem[1,0]
3724; AVX512-NEXT:    callq __truncdfhf2@PLT
3725; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3726; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3727; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3728; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3729; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3730; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
3731; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3732; AVX512-NEXT:    vzeroupper
3733; AVX512-NEXT:    callq __truncdfhf2@PLT
3734; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3735; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3736; AVX512-NEXT:    # xmm0 = mem[1,0]
3737; AVX512-NEXT:    callq __truncdfhf2@PLT
3738; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3739; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3740; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
3741; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
3742; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3743; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3744; AVX512-NEXT:    vzeroupper
3745; AVX512-NEXT:    callq __truncdfhf2@PLT
3746; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3747; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3748; AVX512-NEXT:    # xmm0 = mem[1,0]
3749; AVX512-NEXT:    callq __truncdfhf2@PLT
3750; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3751; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3752; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3753; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3754; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3755; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
3756; AVX512-NEXT:    addq $120, %rsp
3757; AVX512-NEXT:    retq
3758  %1 = fptrunc <8 x double> %a0 to <8 x half>
3759  %2 = bitcast <8 x half> %1 to <8 x i16>
3760  ret <8 x i16> %2
3761}
3762
3763;
3764; Double to Half (Store)
3765;
3766
3767define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind {
3776; AVX-LABEL: store_cvt_f64_to_i16:
3777; AVX:       # %bb.0:
3778; AVX-NEXT:    pushq %rbx
3779; AVX-NEXT:    movq %rdi, %rbx
3780; AVX-NEXT:    callq __truncdfhf2@PLT
3781; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
3782; AVX-NEXT:    popq %rbx
3783; AVX-NEXT:    retq
3784;
3785; F16C-LABEL: store_cvt_f64_to_i16:
3786; F16C:       # %bb.0:
3787; F16C-NEXT:    pushq %rbx
3788; F16C-NEXT:    movq %rdi, %rbx
3789; F16C-NEXT:    callq __truncdfhf2@PLT
3790; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
3791; F16C-NEXT:    popq %rbx
3792; F16C-NEXT:    retq
3793;
3794; AVX512-LABEL: store_cvt_f64_to_i16:
3795; AVX512:       # %bb.0:
3796; AVX512-NEXT:    pushq %rbx
3797; AVX512-NEXT:    movq %rdi, %rbx
3798; AVX512-NEXT:    callq __truncdfhf2@PLT
3799; AVX512-NEXT:    vpextrw $0, %xmm0, (%rbx)
3800; AVX512-NEXT:    popq %rbx
3801; AVX512-NEXT:    retq
3802  %1 = fptrunc double %a0 to half
3803  %2 = bitcast half %1 to i16
3804  store i16 %2, ptr %a1
3805  ret void
3806}
3807
3808define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind {
3809; AVX-LABEL: store_cvt_2f64_to_2i16:
3810; AVX:       # %bb.0:
3811; AVX-NEXT:    pushq %rbx
3812; AVX-NEXT:    subq $32, %rsp
3813; AVX-NEXT:    movq %rdi, %rbx
3814; AVX-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3815; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3816; AVX-NEXT:    callq __truncdfhf2@PLT
3817; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3818; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3819; AVX-NEXT:    callq __truncdfhf2@PLT
3820; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
3821; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3822; AVX-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
3823; AVX-NEXT:    addq $32, %rsp
3824; AVX-NEXT:    popq %rbx
3825; AVX-NEXT:    retq
3826;
3827; F16C-LABEL: store_cvt_2f64_to_2i16:
3828; F16C:       # %bb.0:
3829; F16C-NEXT:    pushq %rbx
3830; F16C-NEXT:    subq $32, %rsp
3831; F16C-NEXT:    movq %rdi, %rbx
3832; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3833; F16C-NEXT:    callq __truncdfhf2@PLT
3834; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3835; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3836; F16C-NEXT:    # xmm0 = mem[1,0]
3837; F16C-NEXT:    callq __truncdfhf2@PLT
3838; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3839; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3840; F16C-NEXT:    vmovd %xmm0, (%rbx)
3841; F16C-NEXT:    addq $32, %rsp
3842; F16C-NEXT:    popq %rbx
3843; F16C-NEXT:    retq
3844;
3845; AVX512-LABEL: store_cvt_2f64_to_2i16:
3846; AVX512:       # %bb.0:
3847; AVX512-NEXT:    pushq %rbx
3848; AVX512-NEXT:    subq $32, %rsp
3849; AVX512-NEXT:    movq %rdi, %rbx
3850; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3851; AVX512-NEXT:    callq __truncdfhf2@PLT
3852; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3853; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3854; AVX512-NEXT:    # xmm0 = mem[1,0]
3855; AVX512-NEXT:    callq __truncdfhf2@PLT
3856; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3857; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3858; AVX512-NEXT:    vmovd %xmm0, (%rbx)
3859; AVX512-NEXT:    addq $32, %rsp
3860; AVX512-NEXT:    popq %rbx
3861; AVX512-NEXT:    retq
3862  %1 = fptrunc <2 x double> %a0 to <2 x half>
3863  %2 = bitcast <2 x half> %1 to <2 x i16>
3864  store <2 x i16> %2, ptr %a1
3865  ret void
3866}
3867
3868define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
3869; AVX1-LABEL: store_cvt_4f64_to_4i16:
3870; AVX1:       # %bb.0:
3871; AVX1-NEXT:    pushq %rbx
3872; AVX1-NEXT:    subq $80, %rsp
3873; AVX1-NEXT:    movq %rdi, %rbx
3874; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3875; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3876; AVX1-NEXT:    vzeroupper
3877; AVX1-NEXT:    callq __truncdfhf2@PLT
3878; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3879; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3880; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3881; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3882; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3883; AVX1-NEXT:    vzeroupper
3884; AVX1-NEXT:    callq __truncdfhf2@PLT
3885; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3886; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3887; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3888; AVX1-NEXT:    vzeroupper
3889; AVX1-NEXT:    callq __truncdfhf2@PLT
3890; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3891; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3892; AVX1-NEXT:    callq __truncdfhf2@PLT
3893; AVX1-NEXT:    vpextrw $0, %xmm0, 4(%rbx)
3894; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3895; AVX1-NEXT:    vpextrw $0, %xmm0, (%rbx)
3896; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3897; AVX1-NEXT:    vpextrw $0, %xmm0, 6(%rbx)
3898; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3899; AVX1-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
3900; AVX1-NEXT:    addq $80, %rsp
3901; AVX1-NEXT:    popq %rbx
3902; AVX1-NEXT:    retq
3903;
3904; AVX2-LABEL: store_cvt_4f64_to_4i16:
3905; AVX2:       # %bb.0:
3906; AVX2-NEXT:    pushq %rbx
3907; AVX2-NEXT:    subq $80, %rsp
3908; AVX2-NEXT:    movq %rdi, %rbx
3909; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3910; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3911; AVX2-NEXT:    vzeroupper
3912; AVX2-NEXT:    callq __truncdfhf2@PLT
3913; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3914; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3915; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3916; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3917; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
3918; AVX2-NEXT:    vzeroupper
3919; AVX2-NEXT:    callq __truncdfhf2@PLT
3920; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3921; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3922; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3923; AVX2-NEXT:    vzeroupper
3924; AVX2-NEXT:    callq __truncdfhf2@PLT
3925; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3926; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3927; AVX2-NEXT:    callq __truncdfhf2@PLT
3928; AVX2-NEXT:    vpextrw $0, %xmm0, 4(%rbx)
3929; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3930; AVX2-NEXT:    vpextrw $0, %xmm0, (%rbx)
3931; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3932; AVX2-NEXT:    vpextrw $0, %xmm0, 6(%rbx)
3933; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3934; AVX2-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
3935; AVX2-NEXT:    addq $80, %rsp
3936; AVX2-NEXT:    popq %rbx
3937; AVX2-NEXT:    retq
3938;
3939; F16C-LABEL: store_cvt_4f64_to_4i16:
3940; F16C:       # %bb.0:
3941; F16C-NEXT:    pushq %rbx
3942; F16C-NEXT:    subq $64, %rsp
3943; F16C-NEXT:    movq %rdi, %rbx
3944; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3945; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
3946; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3947; F16C-NEXT:    vzeroupper
3948; F16C-NEXT:    callq __truncdfhf2@PLT
3949; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3950; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3951; F16C-NEXT:    # xmm0 = mem[1,0]
3952; F16C-NEXT:    callq __truncdfhf2@PLT
3953; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3954; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3955; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3956; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3957; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3958; F16C-NEXT:    vzeroupper
3959; F16C-NEXT:    callq __truncdfhf2@PLT
3960; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3961; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3962; F16C-NEXT:    # xmm0 = mem[1,0]
3963; F16C-NEXT:    callq __truncdfhf2@PLT
3964; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3965; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3966; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3967; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3968; F16C-NEXT:    vmovq %xmm0, (%rbx)
3969; F16C-NEXT:    addq $64, %rsp
3970; F16C-NEXT:    popq %rbx
3971; F16C-NEXT:    retq
3972;
3973; AVX512-LABEL: store_cvt_4f64_to_4i16:
3974; AVX512:       # %bb.0:
3975; AVX512-NEXT:    pushq %rbx
3976; AVX512-NEXT:    subq $64, %rsp
3977; AVX512-NEXT:    movq %rdi, %rbx
3978; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3979; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3980; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3981; AVX512-NEXT:    vzeroupper
3982; AVX512-NEXT:    callq __truncdfhf2@PLT
3983; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3984; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3985; AVX512-NEXT:    # xmm0 = mem[1,0]
3986; AVX512-NEXT:    callq __truncdfhf2@PLT
3987; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3988; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3989; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3990; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3991; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3992; AVX512-NEXT:    vzeroupper
3993; AVX512-NEXT:    callq __truncdfhf2@PLT
3994; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
3995; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3996; AVX512-NEXT:    # xmm0 = mem[1,0]
3997; AVX512-NEXT:    callq __truncdfhf2@PLT
3998; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
3999; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4000; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4001; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
4002; AVX512-NEXT:    vmovq %xmm0, (%rbx)
4003; AVX512-NEXT:    addq $64, %rsp
4004; AVX512-NEXT:    popq %rbx
4005; AVX512-NEXT:    retq
4006  %1 = fptrunc <4 x double> %a0 to <4 x half>
4007  %2 = bitcast <4 x half> %1 to <4 x i16>
4008  store <4 x i16> %2, ptr %a1
4009  ret void
4010}
4011
4012define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
4013; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
4014; AVX1:       # %bb.0:
4015; AVX1-NEXT:    pushq %rbx
4016; AVX1-NEXT:    subq $80, %rsp
4017; AVX1-NEXT:    movq %rdi, %rbx
4018; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4019; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4020; AVX1-NEXT:    vzeroupper
4021; AVX1-NEXT:    callq __truncdfhf2@PLT
4022; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4023; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4024; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4025; AVX1-NEXT:    vzeroupper
4026; AVX1-NEXT:    callq __truncdfhf2@PLT
4027; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4028; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4029; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4030; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4031; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4032; AVX1-NEXT:    vzeroupper
4033; AVX1-NEXT:    callq __truncdfhf2@PLT
4034; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4035; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
4036; AVX1-NEXT:    callq __truncdfhf2@PLT
4037; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4038; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
4039; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4040; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4041; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4042; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
4043; AVX1-NEXT:    vmovaps %xmm0, (%rbx)
4044; AVX1-NEXT:    addq $80, %rsp
4045; AVX1-NEXT:    popq %rbx
4046; AVX1-NEXT:    retq
4047;
4048; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
4049; AVX2:       # %bb.0:
4050; AVX2-NEXT:    pushq %rbx
4051; AVX2-NEXT:    subq $80, %rsp
4052; AVX2-NEXT:    movq %rdi, %rbx
4053; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4054; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4055; AVX2-NEXT:    vzeroupper
4056; AVX2-NEXT:    callq __truncdfhf2@PLT
4057; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4058; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4059; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4060; AVX2-NEXT:    vzeroupper
4061; AVX2-NEXT:    callq __truncdfhf2@PLT
4062; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4063; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4064; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
4065; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4066; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4067; AVX2-NEXT:    vzeroupper
4068; AVX2-NEXT:    callq __truncdfhf2@PLT
4069; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4070; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
4071; AVX2-NEXT:    callq __truncdfhf2@PLT
4072; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4073; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
4074; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4075; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4076; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4077; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
4078; AVX2-NEXT:    vmovaps %xmm0, (%rbx)
4079; AVX2-NEXT:    addq $80, %rsp
4080; AVX2-NEXT:    popq %rbx
4081; AVX2-NEXT:    retq
4082;
4083; F16C-LABEL: store_cvt_4f64_to_8i16_undef:
4084; F16C:       # %bb.0:
4085; F16C-NEXT:    pushq %rbx
4086; F16C-NEXT:    subq $64, %rsp
4087; F16C-NEXT:    movq %rdi, %rbx
4088; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4089; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
4090; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4091; F16C-NEXT:    vzeroupper
4092; F16C-NEXT:    callq __truncdfhf2@PLT
4093; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4094; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
4095; F16C-NEXT:    # xmm0 = mem[1,0]
4096; F16C-NEXT:    callq __truncdfhf2@PLT
4097; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4098; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4099; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4100; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4101; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4102; F16C-NEXT:    vzeroupper
4103; F16C-NEXT:    callq __truncdfhf2@PLT
4104; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4105; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4106; F16C-NEXT:    # xmm0 = mem[1,0]
4107; F16C-NEXT:    callq __truncdfhf2@PLT
4108; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
4109; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4110; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4111; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
4112; F16C-NEXT:    vmovaps %xmm0, (%rbx)
4113; F16C-NEXT:    addq $64, %rsp
4114; F16C-NEXT:    popq %rbx
4115; F16C-NEXT:    retq
4116;
4117; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
4118; AVX512:       # %bb.0:
4119; AVX512-NEXT:    pushq %rbx
4120; AVX512-NEXT:    subq $64, %rsp
4121; AVX512-NEXT:    movq %rdi, %rbx
4122; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4123; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
4124; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4125; AVX512-NEXT:    vzeroupper
4126; AVX512-NEXT:    callq __truncdfhf2@PLT
4127; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4128; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
4129; AVX512-NEXT:    # xmm0 = mem[1,0]
4130; AVX512-NEXT:    callq __truncdfhf2@PLT
4131; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4132; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4133; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4134; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4135; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4136; AVX512-NEXT:    vzeroupper
4137; AVX512-NEXT:    callq __truncdfhf2@PLT
4138; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4139; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4140; AVX512-NEXT:    # xmm0 = mem[1,0]
4141; AVX512-NEXT:    callq __truncdfhf2@PLT
4142; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
4143; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4144; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4145; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
4146; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4147; AVX512-NEXT:    callq __truncdfhf2@PLT
4148; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
4149; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4150; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
4151; AVX512-NEXT:    vmovaps %xmm0, (%rbx)
4152; AVX512-NEXT:    addq $64, %rsp
4153; AVX512-NEXT:    popq %rbx
4154; AVX512-NEXT:    retq
4155  %1 = fptrunc <4 x double> %a0 to <4 x half>
4156  %2 = bitcast <4 x half> %1 to <4 x i16>
4157  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4158  store <8 x i16> %3, ptr %a1
4159  ret void
4160}
4161
4162define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
4163; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
4164; AVX1:       # %bb.0:
4165; AVX1-NEXT:    pushq %rbx
4166; AVX1-NEXT:    subq $80, %rsp
4167; AVX1-NEXT:    movq %rdi, %rbx
4168; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4169; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4170; AVX1-NEXT:    vzeroupper
4171; AVX1-NEXT:    callq __truncdfhf2@PLT
4172; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4173; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4174; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4175; AVX1-NEXT:    vzeroupper
4176; AVX1-NEXT:    callq __truncdfhf2@PLT
4177; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4178; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4179; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
4180; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4181; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4182; AVX1-NEXT:    vzeroupper
4183; AVX1-NEXT:    callq __truncdfhf2@PLT
4184; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4185; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
4186; AVX1-NEXT:    callq __truncdfhf2@PLT
4187; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4188; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
4189; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4190; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4191; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4192; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
4193; AVX1-NEXT:    vmovaps %xmm0, (%rbx)
4194; AVX1-NEXT:    addq $80, %rsp
4195; AVX1-NEXT:    popq %rbx
4196; AVX1-NEXT:    retq
4197;
4198; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
4199; AVX2:       # %bb.0:
4200; AVX2-NEXT:    pushq %rbx
4201; AVX2-NEXT:    subq $80, %rsp
4202; AVX2-NEXT:    movq %rdi, %rbx
4203; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
4204; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4205; AVX2-NEXT:    vzeroupper
4206; AVX2-NEXT:    callq __truncdfhf2@PLT
4207; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4208; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
4209; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4210; AVX2-NEXT:    vzeroupper
4211; AVX2-NEXT:    callq __truncdfhf2@PLT
4212; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4213; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
4214; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
4215; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
4216; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
4217; AVX2-NEXT:    vzeroupper
4218; AVX2-NEXT:    callq __truncdfhf2@PLT
4219; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4220; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
4221; AVX2-NEXT:    callq __truncdfhf2@PLT
4222; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4223; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
4224; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4225; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4226; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4227; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
4228; AVX2-NEXT:    vmovaps %xmm0, (%rbx)
4229; AVX2-NEXT:    addq $80, %rsp
4230; AVX2-NEXT:    popq %rbx
4231; AVX2-NEXT:    retq
4232;
4233; F16C-LABEL: store_cvt_4f64_to_8i16_zero:
4234; F16C:       # %bb.0:
4235; F16C-NEXT:    pushq %rbx
4236; F16C-NEXT:    subq $64, %rsp
4237; F16C-NEXT:    movq %rdi, %rbx
4238; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4239; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
4240; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4241; F16C-NEXT:    vzeroupper
4242; F16C-NEXT:    callq __truncdfhf2@PLT
4243; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4244; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
4245; F16C-NEXT:    # xmm0 = mem[1,0]
4246; F16C-NEXT:    callq __truncdfhf2@PLT
4247; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4248; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4249; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4250; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4251; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4252; F16C-NEXT:    vzeroupper
4253; F16C-NEXT:    callq __truncdfhf2@PLT
4254; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4255; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4256; F16C-NEXT:    # xmm0 = mem[1,0]
4257; F16C-NEXT:    callq __truncdfhf2@PLT
4258; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
4259; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4260; F16C-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4261; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
4262; F16C-NEXT:    vmovaps %xmm0, (%rbx)
4263; F16C-NEXT:    addq $64, %rsp
4264; F16C-NEXT:    popq %rbx
4265; F16C-NEXT:    retq
4266;
4267; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
4268; AVX512:       # %bb.0:
4269; AVX512-NEXT:    pushq %rbx
4270; AVX512-NEXT:    subq $64, %rsp
4271; AVX512-NEXT:    movq %rdi, %rbx
4272; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4273; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
4274; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4275; AVX512-NEXT:    vzeroupper
4276; AVX512-NEXT:    callq __truncdfhf2@PLT
4277; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4278; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
4279; AVX512-NEXT:    # xmm0 = mem[1,0]
4280; AVX512-NEXT:    callq __truncdfhf2@PLT
4281; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4282; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4283; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4284; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4285; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4286; AVX512-NEXT:    vzeroupper
4287; AVX512-NEXT:    callq __truncdfhf2@PLT
4288; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
4289; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4290; AVX512-NEXT:    # xmm0 = mem[1,0]
4291; AVX512-NEXT:    callq __truncdfhf2@PLT
4292; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
4293; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4294; AVX512-NEXT:    vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4295; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
4296; AVX512-NEXT:    vmovaps %xmm0, (%rbx)
4297; AVX512-NEXT:    addq $64, %rsp
4298; AVX512-NEXT:    popq %rbx
4299; AVX512-NEXT:    retq
4300  %1 = fptrunc <4 x double> %a0 to <4 x half>
4301  %2 = bitcast <4 x half> %1 to <4 x i16>
4302  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4303  store <8 x i16> %3, ptr %a1
4304  ret void
4305}
4306
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
; AVX-LABEL: store_cvt_8f64_to_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $96, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    callq __truncdfhf2@PLT
; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rbx)
; AVX-NEXT:    addq $96, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; F16C-LABEL: store_cvt_8f64_to_8i16:
; F16C:       # %bb.0:
; F16C-NEXT:    pushq %rbx
; F16C-NEXT:    subq $96, %rsp
; F16C-NEXT:    movq %rdi, %rbx
; F16C-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; F16C-NEXT:    vextractf128 $1, %ymm1, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; F16C-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = mem[1,0]
; F16C-NEXT:    callq __truncdfhf2@PLT
; F16C-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; F16C-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; F16C-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; F16C-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; F16C-NEXT:    # xmm0 = xmm0[0],mem[0]
; F16C-NEXT:    vmovdqa %xmm0, (%rbx)
; F16C-NEXT:    addq $96, %rsp
; F16C-NEXT:    popq %rbx
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $112, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rbx)
; AVX512-NEXT:    addq $112, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, ptr %a1
  ret void
}

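; Unlike the f64 cases above, f32 -> f16 does have hardware support: F16C and
; AVX512 lower all 32 conversions to a few vcvtps2ph stores, while plain
; AVX1/AVX2 still scalarize through __truncsfhf2 libcalls.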
define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX1-LABEL: store_cvt_32f32_to_32f16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $176, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT:    vmovaps %ymm0, 32(%rbx)
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, (%rbx)
; AVX1-NEXT:    addq $176, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_32f32_to_32f16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $240, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    callq __truncsfhf2@PLT
; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT:    vmovdqa %ymm0, 32(%rbx)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, (%rbx)
; AVX2-NEXT:    addq $240, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; F16C-LABEL: store_cvt_32f32_to_32f16:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtps2ph $4, %ymm3, 48(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm2, 32(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm1, 16(%rdi)
; F16C-NEXT:    vcvtps2ph $4, %ymm0, (%rdi)
; F16C-NEXT:    vzeroupper
; F16C-NEXT:    retq
;
; AVX512-LABEL: store_cvt_32f32_to_32f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtps2ph $4, %zmm1, 32(%rdi)
; AVX512-NEXT:    vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = fptrunc <32 x float> %a0 to <32 x half>
  store <32 x half> %1, ptr %a1
  ret void
}

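; fptosi of f16 goes through f32: the halves are widened (vcvtph2ps, or the
; __extendhfsf2 libcall without F16C) and then truncated with vcvttps2dq; the
; upper two result elements are zero.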
define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; AVX-LABEL: fptosi_2f16_to_4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: fptosi_2f16_to_4i32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvttps2dq %xmm0, %xmm0
; F16C-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; F16C-NEXT:    retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT:    retq
  %cvt = fptosi <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

; PR83402
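; As above, each half is widened to f32 before vcvttps2dq; F16C and AVX512
; need only two instructions for all four lanes.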
define <4 x i32> @fptosi_4f16_to_4i32(<4 x half> %a) nounwind {
; AVX-LABEL: fptosi_4f16_to_4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $72, %rsp
; AVX-NEXT:    vmovdqa %xmm0, %xmm1
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpsrlq $48, %xmm1, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    addq $72, %rsp
; AVX-NEXT:    retq
;
; F16C-LABEL: fptosi_4f16_to_4i32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvttps2dq %xmm0, %xmm0
; F16C-NEXT:    retq
;
; AVX512-LABEL: fptosi_4f16_to_4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT:    retq
  %cvt = fptosi <4 x half> %a to <4 x i32>
  ret <4 x i32> %cvt
}

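; x86 has no unsigned f32 -> i32 convert before AVX512 (vcvttps2udq), so the
; AVX1/AVX2/F16C lowerings emulate fptoui with a signed convert plus a
; 2^31-biased convert, selecting between the two on the sign of the signed
; result.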
define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
; AVX1-LABEL: fptoui_2f16_to_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $40, %rsp
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT:    addq $40, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: fptoui_2f16_to_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT:    vsubps %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: fptoui_2f16_to_4i32:
; F16C:       # %bb.0:
; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvttps2dq %xmm0, %xmm1
; F16C-NEXT:    vpsrad $31, %xmm1, %xmm2
; F16C-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT:    vcvttps2dq %xmm0, %xmm0
; F16C-NEXT:    vpand %xmm2, %xmm0, %xmm0
; F16C-NEXT:    vpor %xmm0, %xmm1, %xmm0
; F16C-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; F16C-NEXT:    retq
;
; AVX512F-LABEL: fptoui_2f16_to_4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512-FASTLANE-LABEL: fptoui_2f16_to_4i32:
; AVX512-FASTLANE:       # %bb.0:
; AVX512-FASTLANE-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-FASTLANE-NEXT:    vcvttps2udq %xmm0, %xmm0
; AVX512-FASTLANE-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-FASTLANE-NEXT:    retq
  %cvt = fptoui <2 x half> %a to <2 x i32>
  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %ext
}

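; Same fptoui emulation as above, applied to all four lanes; AVX512 lowers
; this directly to vcvttps2udq.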
define <4 x i32> @fptoui_4f16_to_4i32(<4 x half> %a) nounwind {
; AVX1-LABEL: fptoui_4f16_to_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $72, %rsp
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpsrlq $48, %xmm1, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT:    addq $72, %rsp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: fptoui_4f16_to_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $72, %rsp
; AVX2-NEXT:    vmovdqa %xmm0, %xmm1
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpsrlq $48, %xmm1, %xmm0
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT:    vsubps %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __extendhfsf2@PLT
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT:    vsubps %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
; AVX2-NEXT:    addq $72, %rsp
; AVX2-NEXT:    retq
;
; F16C-LABEL: fptoui_4f16_to_4i32:
; F16C:       # %bb.0:
; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
; F16C-NEXT:    vcvttps2dq %xmm0, %xmm1
; F16C-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT:    vcvttps2dq %xmm0, %xmm0
; F16C-NEXT:    vorps %xmm0, %xmm1, %xmm0
; F16C-NEXT:    vblendvps %xmm1, %xmm0, %xmm1, %xmm0
; F16C-NEXT:    retq
;
; AVX512F-LABEL: fptoui_4f16_to_4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512-FASTLANE-LABEL: fptoui_4f16_to_4i32:
; AVX512-FASTLANE:       # %bb.0:
; AVX512-FASTLANE-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-FASTLANE-NEXT:    vcvttps2udq %xmm0, %xmm0
; AVX512-FASTLANE-NEXT:    retq
  %cvt = fptoui <4 x half> %a to <4 x i32>
  ret <4 x i32> %cvt
}
