xref: /llvm-project/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll (revision 4ab3041acbdc274050d6c53f72619c7455cbc97a)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

; ADDPD should fold the reload of the spilled operand directly into the add.
define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_addpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <2 x double> %a0, %a1
  ret <2 x double> %2
}

; ADDPS should fold the reload of the spilled operand directly into the add.
define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_addps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd <4 x float> %a0, %a1
  ret <4 x float> %2
}

; Scalar ADDSD should fold an 8-byte reload of the spilled operand.
define double @stack_fold_addsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_addsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

; ADDSD (intrinsic-style extract/insert pattern) should fold a 16-byte reload.
define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_addsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fadd double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

; Scalar ADDSS should fold a 4-byte reload of the spilled operand.
define float @stack_fold_addss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_addss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

; ADDSS (intrinsic-style extract/insert pattern) should fold a 16-byte reload.
define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_addss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fadd float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

; ADDSUBPD (SSE3 intrinsic) should fold a 16-byte reload of the spilled operand.
define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_addsubpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone

; ADDSUBPS (SSE3 intrinsic) should fold a 16-byte reload of the spilled operand.
define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_addsubps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    addsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone

; ANDNPD (xor-with-ones + and on bitcast doubles) should fold a 16-byte reload.
define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_andnpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    andnpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    xorpd %xmm1, %xmm1
; CHECK-NEXT:    addpd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <2 x double>
  ; fadd forces execution domain
  %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
  ret <2 x double> %7
}

; ANDNPS (xor-with-ones + and on bitcast floats) should fold a 16-byte reload.
define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_andnps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    andnps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    xorps %xmm1, %xmm1
; CHECK-NEXT:    addps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
  %5 = and <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <4 x float>
  ; fadd forces execution domain
  %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %7
}

; ANDPD (and on bitcast doubles) should fold a 16-byte reload.
define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_andpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    andpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    xorpd %xmm1, %xmm1
; CHECK-NEXT:    addpd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}

; ANDPS (and on bitcast floats) should fold a 16-byte reload.
define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_andps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    xorps %xmm1, %xmm1
; CHECK-NEXT:    addps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}

; BLENDPD (constant-mask select) should fold a 16-byte reload.
define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_blendpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    blendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[1]
; CHECK-NEXT:    xorpd %xmm1, %xmm1
; CHECK-NEXT:    addpd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
  ; fadd forces execution domain
  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
  ret <2 x double> %3
}

; BLENDPS (constant-mask select) should fold a 16-byte reload.
define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_blendps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    blendps $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[1,2],xmm0[3]
; CHECK-NEXT:    xorps %xmm1, %xmm1
; CHECK-NEXT:    addps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

; BLENDVPD (variable mask in xmm0) should fold a 16-byte reload; xmm2 stays
; unclobbered in the asm so the third argument can be spilled.
define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
; CHECK-LABEL: stack_fold_blendvpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    movapd %xmm1, %xmm2
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    blendvpd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; CHECK-NEXT:    movapd %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

; BLENDVPS (variable mask in xmm0) should fold a 16-byte reload; xmm2 stays
; unclobbered in the asm so the third argument can be spilled.
define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
; CHECK-LABEL: stack_fold_blendvps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    movaps %xmm1, %xmm2
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    blendvps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; CHECK-NEXT:    movaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

; CMPPD (predicate 0 = EQ) should fold a 16-byte reload as cmpeqpd.
define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_cmppd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

; CMPPS (predicate 0 = EQ) should fold a 16-byte reload as cmpeqps.
define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

; Scalar fcmp oeq lowered to CMPEQSD should fold an 8-byte reload.
define i32 @stack_fold_cmpsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_cmpsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    movq %xmm0, %rax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq double %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

; CMPSD intrinsic (predicate 0 = EQ) should fold a 16-byte reload.
define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_cmpsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

; Scalar fcmp oeq lowered to CMPEQSS should fold a 4-byte reload.
define i32 @stack_fold_cmpss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_cmpss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    movd %xmm0, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp oeq float %a0, %a1
  %3 = zext i1 %2 to i32
  ret i32 %3
}

; CMPSS intrinsic (predicate 0 = EQ) should fold a 16-byte reload.
define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

; TODO stack_fold_comisd

; COMISD intrinsic should fold a 16-byte reload of the spilled operand.
define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_comisd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    comisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    setnp %al
; CHECK-NEXT:    sete %cl
; CHECK-NEXT:    andb %al, %cl
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone

; TODO stack_fold_comiss

; COMISS intrinsic should fold a 16-byte reload of the spilled operand.
define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_comiss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    comiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    setnp %al
; CHECK-NEXT:    sete %cl
; CHECK-NEXT:    andb %al, %cl
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

; CVTDQ2PD (sitofp of the low two i32 lanes) should fold a 16-byte reload.
define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtdq2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %3 = sitofp <2 x i32> %2 to <2 x double>
  ret <2 x double> %3
}

; CVTDQ2PD via a non-undef shuffle of the same source should also fold.
define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtdq2pd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %2 to <2 x double>
  ret <2 x double> %cvt
}

; CVTDQ2PS (sitofp of all four i32 lanes) should fold a 16-byte reload.
define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtdq2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sitofp <4 x i32> %a0 to <4 x float>
  ret <4 x float> %2
}

; CVTPD2DQ intrinsic should fold a 16-byte reload of the spilled operand.
define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtpd2dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtpd2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone

; CVTPD2PS (fptrunc <2 x double> to <2 x float>) should fold a 16-byte reload.
define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtpd2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtpd2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc <2 x double> %a0 to <2 x float>
  ret <2 x float> %2
}

; CVTPS2DQ intrinsic should fold a 16-byte reload of the spilled operand.
define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtps2dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone

; CVTPS2PD (fpext of the low two float lanes) should fold a 16-byte reload.
define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtps2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %3 = fpext <2 x float> %2 to <2 x double>
  ret <2 x double> %3
}

; CVTPS2PD via a non-undef shuffle of the same source should also fold.
define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtps2pd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
  %cvtps2pd = fpext <2 x float> %2 to <2 x double>
  ret <2 x double> %cvtps2pd
}

; TODO stack_fold_cvtsd2si

; CVTSD2SI (xmm -> 32-bit GPR) intrinsic should fold a 16-byte reload.
define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtsd2si_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone

; TODO stack_fold_cvtsd2si64

; CVTSD2SI (xmm -> 64-bit GPR) intrinsic should fold a 16-byte reload.
define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtsd2si64_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone

; CVTSD2SS (fptrunc) should fold an 8-byte reload; minsize encourages folding.
define float @stack_fold_cvtsd2ss(double %a0) minsize {
; CHECK-LABEL: stack_fold_cvtsd2ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtsd2ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptrunc double %a0 to float
  ret float %2
}

; CVTSD2SS intrinsic (optsize) should fold a 16-byte reload; destination is
; zeroed first to break the false dependency on the partial register write.
define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize {
; CHECK-LABEL: stack_fold_cvtsd2ss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorps %xmm1, %xmm1
; CHECK-NEXT:    cvtsd2ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT:    movaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone

; CVTSI2SD (i32 -> double) should fold a 4-byte GPR reload; the asm clobbers
; every GPR to force the integer argument to be spilled.
define double @stack_fold_cvtsi2sd(i32 %a0) {
; CHECK-LABEL: stack_fold_cvtsi2sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorps %xmm0, %xmm0
; CHECK-NEXT:    cvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to double
  ret double %2
}

; CVTSI2SD into an existing vector (insertelement lane 0) should fold a 4-byte
; GPR reload; the asm clobbers every GPR to force the spill.
define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0, <2 x double> %b0) {
; CHECK-LABEL: stack_fold_cvtsi2sd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to double
  %3 = insertelement <2 x double> %b0, double %2, i64 0
  ret <2 x double> %3
}

; i64 -> double conversion. The GPR-clobbering asm forces %rdi to be spilled
; (8-byte); the reload must fold into cvtsi2sdq. xorps first breaks the false
; dependency on %xmm0 before the partial-register-writing convert.
define double @stack_fold_cvtsi642sd(i64 %a0) {
; CHECK-LABEL: stack_fold_cvtsi642sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorps %xmm0, %xmm0
; CHECK-NEXT:    cvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  ret double %2
}
744
; Same as stack_fold_cvtsi642sd but the result is inserted into the incoming
; vector %b0 (already in %xmm0), so no xorps zeroing is emitted before the
; folded cvtsi2sdq reload.
define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0, <2 x double> %b0) {
; CHECK-LABEL: stack_fold_cvtsi642sd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to double
  %3 = insertelement <2 x double> %b0, double %2, i64 0
  ret <2 x double> %3
}
789
; i32 -> float conversion. The GPR-clobbering asm forces %edi to be spilled
; (4-byte); the reload must fold into cvtsi2ssl, preceded by the
; dependency-breaking xorps on %xmm0.
define float @stack_fold_cvtsi2ss(i32 %a0) {
; CHECK-LABEL: stack_fold_cvtsi2ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorps %xmm0, %xmm0
; CHECK-NEXT:    cvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  ret float %2
}
834
; Same as stack_fold_cvtsi2ss but the result is inserted into the incoming
; vector %b0 (already in %xmm0), so no xorps zeroing precedes the folded
; cvtsi2ssl reload.
define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0, <4 x float> %b0) {
; CHECK-LABEL: stack_fold_cvtsi2ss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i32 %a0 to float
  %3 = insertelement <4 x float> %b0, float %2, i64 0
  ret <4 x float> %3
}
879
; i64 -> float conversion. %rdi is spilled (8-byte) around the GPR-clobbering
; asm; the reload must fold into cvtsi2ssq after the dependency-breaking xorps.
define float @stack_fold_cvtsi642ss(i64 %a0) {
; CHECK-LABEL: stack_fold_cvtsi642ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorps %xmm0, %xmm0
; CHECK-NEXT:    cvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  ret float %2
}
924
; Same as stack_fold_cvtsi642ss but the result is inserted into the incoming
; vector %b0 (already in %xmm0), so no xorps zeroing precedes the folded
; cvtsi2ssq reload.
define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0, <4 x float> %b0) {
; CHECK-LABEL: stack_fold_cvtsi642ss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = sitofp i64 %a0 to float
  %3 = insertelement <4 x float> %b0, float %2, i64 0
  ret <4 x float> %3
}
969
; float -> double widening. The XMM-clobbering asm forces %xmm0 to be spilled
; (4-byte movss); the reload must fold into cvtss2sd. NOTE(review): the
; function is marked minsize — presumably the size attribute is required for
; this particular fold; confirm against the X86 memory-folding tables.
define double @stack_fold_cvtss2sd(float %a0) minsize {
; CHECK-LABEL: stack_fold_cvtss2sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtss2sd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fpext float %a0 to double
  ret double %2
}

; Vector variant (marked optsize): the low float of the spilled vector is
; widened via a folded cvtss2sd, then the result is placed into a zeroed
; vector (movq leaves the upper lane zero).
define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) optsize {
; CHECK-LABEL: stack_fold_cvtss2sd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorps %xmm0, %xmm0
; CHECK-NEXT:    cvtss2sd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i64 0
  %3 = fpext float %2 to double
  %4 = insertelement <2 x double> zeroinitializer, double %3, i64 0
  ret <2 x double> %4
}
1001
; TODO stack_fold_cvtss2si
1003
; cvtss2si intrinsic (round-to-current-mode) with a 16-byte vector spill; the
; reload must fold into cvtss2si producing a 32-bit GPR result.
define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtss2si_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

; TODO stack_fold_cvtss2si64

; 64-bit variant of the above: the folded cvtss2si writes %rax.
define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtss2si64_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
1035
; Truncating packed double -> i32 conversion (intrinsic form); the 16-byte
; reload must fold into cvttpd2dq.
define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttpd2dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttpd2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone

; Truncating packed float -> i32 conversion, expressed as plain IR fptosi;
; the 16-byte reload must fold into cvttps2dq.
define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttps2dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi <4 x float> %a0 to <4 x i32>
  ret <4 x i32> %2
}
1064
; Truncating double -> i32: scalar spilled via movsd (8-byte), reload folded
; into cvttsd2si writing %eax.
define i32 @stack_fold_cvttsd2si(double %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i32
  ret i32 %2
}

; Intrinsic form: full 16-byte vector spill, still folds into cvttsd2si.
define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone

; 64-bit result variant (writes %rax).
define i64 @stack_fold_cvttsd2si64(double %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi double %a0 to i64
  ret i64 %2
}

; 64-bit intrinsic form.
define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttsd2si64_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
1122
; Truncating float -> i32: scalar spilled via movss (4-byte), reload folded
; into cvttss2si writing %eax.
define i32 @stack_fold_cvttss2si(float %a0) {
; CHECK-LABEL: stack_fold_cvttss2si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i32
  ret i32 %2
}

; Intrinsic form: full 16-byte vector spill, still folds into cvttss2si.
define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttss2si_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %2
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

; 64-bit result variant (writes %rax).
define i64 @stack_fold_cvttss2si64(float %a0) {
; CHECK-LABEL: stack_fold_cvttss2si64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fptosi float %a0 to i64
  ret i64 %2
}

; 64-bit intrinsic form.
define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttss2si64_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    cvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
  ret i64 %2
}
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
1180
; Packed double division: %a1 spilled across the XMM-clobbering asm, the
; 16-byte reload folded into divpd.
define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    divpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <2 x double> %a0, %a1
  ret <2 x double> %2
}

; Packed float division, folded into divps.
define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_divps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    divps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv <4 x float> %a0, %a1
  ret <4 x float> %2
}

; Scalar double division: 8-byte movsd spill, folded into divsd.
define double @stack_fold_divsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_divsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    divsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv double %a0, %a1
  ret double %2
}

; "_int" form written as extract/fdiv/insert on lane 0 so the whole 16-byte
; vector is spilled; the reload still folds into scalar divsd.
define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    divsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fdiv double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}
; NOTE(review): declared but not referenced by the tests above (the IR uses
; plain fdiv); kept for compatibility with the original test file.
declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone

; Scalar float division: 4-byte movss spill, folded into divss.
define float @stack_fold_divss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_divss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    divss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

; Lane-0 vector form of divss, 16-byte spill folded into scalar divss.
define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_divss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    divss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fdiv float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
; NOTE(review): declared but not referenced by the tests above (the IR uses
; plain fdiv); kept for compatibility with the original test file.
declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
1272
; SSE4.1 dot-product (double): 16-byte spill folded into dppd with imm 7.
define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_dppd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    dppd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

; SSE4.1 dot-product (float): 16-byte spill folded into dpps with imm 7.
define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_dpps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    dpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
1302
; Checks the SPILL direction: extractps stores element 1 straight to the stack
; slot ("Folded Spill") before the GPR-clobbering asm, and a plain movl
; reloads it afterwards.
define i32 @stack_fold_extractps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_extractps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    addps %xmm1, %xmm0
; CHECK-NEXT:    extractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  ; fadd forces execution domain
  %1 = fadd <4 x float> %a0, %a1
  %2 = extractelement <4 x float> %1, i32 1
  %3 = bitcast float %2 to i32
  %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %3
}
1350
1351define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
1352; CHECK-LABEL: stack_fold_haddpd:
1353; CHECK:       # %bb.0:
1354; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1355; CHECK-NEXT:    #APP
1356; CHECK-NEXT:    nop
1357; CHECK-NEXT:    #NO_APP
1358; CHECK-NEXT:    haddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1359; CHECK-NEXT:    retq
1360  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1361  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
1362  ret <2 x double> %2
1363}
1364declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
1365
1366define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
1367; CHECK-LABEL: stack_fold_haddps:
1368; CHECK:       # %bb.0:
1369; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1370; CHECK-NEXT:    #APP
1371; CHECK-NEXT:    nop
1372; CHECK-NEXT:    #NO_APP
1373; CHECK-NEXT:    haddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1374; CHECK-NEXT:    retq
1375  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1376  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
1377  ret <4 x float> %2
1378}
1379declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
1380
define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_hsubpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    hsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into hsubpd.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
1394declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
1395
define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_hsubps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    hsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into hsubps.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
1409declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
1410
define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_insertps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    insertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = zero,mem[0],xmm0[2,3]
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into insertps.
  ; Note the immediate changes from 209 to $17 when the source comes from memory
  ; (the source-element select bits are adjusted for a scalar load), per CHECK above.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
1425declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
1426
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_maxpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into maxpd.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
1440declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
1441
define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_maxpd_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_maxpd but under attributes #1, where maxpd may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
1455
define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_maxps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into maxps.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
1469declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
1470
define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_maxps_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_maxps but under attributes #1, where maxps may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
1484
define double @stack_fold_maxsd(double %a0, double %a1) #0 {
; CHECK-LABEL: stack_fold_maxsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  ; The fcmp ogt + select pair is the scalar max pattern; it should lower to
  ; maxsd with the spilled %a1 folded as the memory operand (per CHECK above).
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}
1499
define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
; CHECK-LABEL: stack_fold_maxsd_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_maxsd but under attributes #1, where maxsd may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}
1514
define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_maxsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Intrinsic (x86_mmx-style vector-in/vector-out) form: the full 16-byte spill
  ; of %a1 must still fold into the scalar maxsd.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
1528declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
1529
define float @stack_fold_maxss(float %a0, float %a1) #0 {
; CHECK-LABEL: stack_fold_maxss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  ; The fcmp ogt + select pair is the scalar max pattern; it should lower to
  ; maxss with the spilled %a1 folded as the memory operand (per CHECK above).
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}
1544
define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
; CHECK-LABEL: stack_fold_maxss_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_maxss but under attributes #1, where maxss may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp ogt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}
1559
define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_maxss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    maxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Intrinsic form: the full 16-byte spill of %a1 must still fold into maxss.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
1573declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
1574
define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_minpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into minpd.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
1588declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
1589
define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_minpd_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_minpd but under attributes #1, where minpd may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
1603
define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_minps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into minps.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
1617declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
1618
define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_minps_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_minps but under attributes #1, where minps may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
1632
define double @stack_fold_minsd(double %a0, double %a1) #0 {
; CHECK-LABEL: stack_fold_minsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  ; The fcmp olt + select pair is the scalar min pattern; it should lower to
  ; minsd with the spilled %a1 folded as the memory operand (per CHECK above).
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}
1647
define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
; CHECK-LABEL: stack_fold_minsd_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_minsd but under attributes #1, where minsd may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt double %a0, %a1
  %3 = select i1 %2, double %a0, double %a1
  ret double %3
}
1662
define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_minsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Intrinsic form: the full 16-byte spill of %a1 must still fold into minsd.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %2
}
1676declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
1677
define float @stack_fold_minss(float %a0, float %a1) #0 {
; CHECK-LABEL: stack_fold_minss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  ; The fcmp olt + select pair is the scalar min pattern; it should lower to
  ; minss with the spilled %a1 folded as the memory operand (per CHECK above).
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}
1692
define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
; CHECK-LABEL: stack_fold_minss_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  ; Same as stack_fold_minss but under attributes #1, where minss may commute.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fcmp olt float %a0, %a1
  %3 = select i1 %2, float %a0, float %a1
  ret float %3
}
1707
define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_minss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    minss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Intrinsic form: the full 16-byte spill of %a1 must still fold into minss.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %2
}
1721declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
1722
define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_movddup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movddup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    retq
  ; Unary test: clobbering xmm1-xmm15 forces %a0 to spill; the <0,0> splat
  ; shuffle should lower to movddup with a folded memory operand.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %2
}
1737; TODO stack_fold_movhpd (load / store)
1738; TODO stack_fold_movhps (load / store)
1739
1740; TODO stack_fold_movlpd (load / store)
1741; TODO stack_fold_movlps (load / store)
1742
define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_movshdup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
; CHECK-NEXT:    retq
  ; Unary test: clobbering xmm1-xmm15 forces %a0 to spill; the <1,1,3,3>
  ; shuffle should lower to movshdup with a folded memory operand.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %2
}
1757
define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_movsldup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movsldup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0,0,2,2]
; CHECK-NEXT:    retq
  ; Unary test: clobbering xmm1-xmm15 forces %a0 to spill; the <0,0,2,2>
  ; shuffle should lower to movsldup with a folded memory operand.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %2
}
1772
define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_mulpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    mulpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into mulpd.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <2 x double> %a0, %a1
  ret <2 x double> %2
}
1786
define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_mulps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    mulps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Clobbering xmm2-xmm15 forces %a1 to spill; the reload must fold into mulps.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul <4 x float> %a0, %a1
  ret <4 x float> %2
}
1800
define double @stack_fold_mulsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_mulsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    mulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  ; Scalar test: the 8-byte spill of %a1 must fold into mulsd.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}
1814
define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_mulsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    mulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Extract/fmul/insert on lane 0 is the mulsd pattern; the 16-byte spill of
  ; %a1 must fold into the scalar multiply while %a0's upper lane is preserved.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fmul double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}
1831
define float @stack_fold_mulss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_mulss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  ; Scalar test: the 4-byte spill of %a1 must fold into mulss.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}
1845
define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_mulss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Extract/fmul/insert on lane 0 is the mulss pattern; the 16-byte spill of
  ; %a1 must fold into the scalar multiply while %a0's upper lanes are preserved.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fmul float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
1862
define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_orpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    orpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    xorpd %xmm1, %xmm1
; CHECK-NEXT:    addpd %xmm1, %xmm0
; CHECK-NEXT:    retq
  ; Integer 'or' of bitcast doubles; the trailing fadd pins the double domain
  ; so the fold selects orpd (not por/orps), per the CHECK lines above.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <2 x double> %a0 to <2 x i64>
  %3 = bitcast <2 x double> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <2 x double>
  ; fadd forces execution domain
  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
  ret <2 x double> %6
}
1883
define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_orps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    orps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    xorps %xmm1, %xmm1
; CHECK-NEXT:    addps %xmm1, %xmm0
; CHECK-NEXT:    retq
  ; Integer 'or' of bitcast floats; the trailing fadd pins the float domain
  ; so the fold selects orps (not por/orpd), per the CHECK lines above.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x float> %a0 to <2 x i64>
  %3 = bitcast <4 x float> %a1 to <2 x i64>
  %4 = or <2 x i64> %2, %3
  %5 = bitcast <2 x i64> %4 to <4 x float>
  ; fadd forces execution domain
  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %6
}
1904
1905; TODO stack_fold_rcpps
1906
define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_rcpps_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    rcpps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Unary intrinsic: clobbering xmm1-xmm15 forces %a0 to spill; the reload
  ; must fold into rcpps.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %2
}
1920declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
1921
1922; TODO stack_fold_rcpss
1923
define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0, <4 x float> %a1) optsize {
; CHECK-LABEL: stack_fold_rcpss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    rcpss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; optsize enables folding the partial-update rcpss: both inputs spill, %a0
  ; is reloaded into a register and %a1's spill folds as the rcpss source.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a1)
  %3 = extractelement <4 x float> %2, i32 0
  %4 = insertelement <4 x float> %a0, float %3, i32 0
  ret <4 x float> %4
}
1941declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>)
1942
define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_roundpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    roundpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Unary intrinsic: clobbering xmm1-xmm15 forces %a0 to spill; the reload
  ; must fold into roundpd with the immediate preserved.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
  ret <2 x double> %2
}
1956declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
1957
define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_roundps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    roundps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; Unary intrinsic: clobbering xmm1-xmm15 forces %a0 to spill; the reload
  ; must fold into roundps with the immediate preserved.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
  ret <4 x float> %2
}
1971declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
1972
define double @stack_fold_roundsd(double %a0) optsize {
; CHECK-LABEL: stack_fold_roundsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorps %xmm0, %xmm0
; CHECK-NEXT:    roundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  ; llvm.floor lowers to roundsd $9 (round toward -inf); optsize enables
  ; folding the 8-byte spill of %a0 as the roundsd memory operand.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call double @llvm.floor.f64(double %a0)
  ret double %2
}
1987declare double @llvm.floor.f64(double) nounwind readnone
1988
define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
; CHECK-LABEL: stack_fold_roundsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    roundsd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  ; optsize enables folding the partial-update roundsd: both inputs spill,
  ; %a0 is reloaded into a register and %a1's spill folds as the source.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
  ret <2 x double> %2
}
2004declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
2005
define float @stack_fold_roundss(float %a0) minsize {
; CHECK-LABEL: stack_fold_roundss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    roundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  ; llvm.floor lowers to roundss $9 (round toward -inf); minsize enables
  ; folding the 4-byte spill of %a0 as the roundss memory operand.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call float @llvm.floor.f32(float %a0)
  ret float %2
}
2019declare float @llvm.floor.f32(float) nounwind readnone
2020
2021define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize {
2022; CHECK-LABEL: stack_fold_roundss_int:
2023; CHECK:       # %bb.0:
2024; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2025; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2026; CHECK-NEXT:    #APP
2027; CHECK-NEXT:    nop
2028; CHECK-NEXT:    #NO_APP
2029; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2030; CHECK-NEXT:    roundss $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2031; CHECK-NEXT:    retq
; Intrinsic form: %a0 is reloaded into xmm0 and %a1's spill slot is folded
; directly into roundss's memory operand.
2032  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2033  %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
2034  ret <4 x float> %2
2035}
2036declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
2037
2038; TODO stack_fold_rsqrtps
2039
2040define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
2041; CHECK-LABEL: stack_fold_rsqrtps_int:
2042; CHECK:       # %bb.0:
2043; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2044; CHECK-NEXT:    #APP
2045; CHECK-NEXT:    nop
2046; CHECK-NEXT:    #NO_APP
2047; CHECK-NEXT:    rsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2048; CHECK-NEXT:    retq
; rsqrtps reads its single input straight from %a0's spill slot (folded reload).
2049  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2050  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
2051  ret <4 x float> %2
2052}
2053declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
2054
2055; TODO stack_fold_rsqrtss
2056
2057define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0, <4 x float> %a1) optsize {
2058; CHECK-LABEL: stack_fold_rsqrtss_int:
2059; CHECK:       # %bb.0:
2060; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2061; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2062; CHECK-NEXT:    #APP
2063; CHECK-NEXT:    nop
2064; CHECK-NEXT:    #NO_APP
2065; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2066; CHECK-NEXT:    rsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2067; CHECK-NEXT:    retq
; The extract/insert sequence below models the scalar merge semantics: lane 0
; of the result comes from rsqrt(%a1), remaining lanes from %a0. The %a1
; reload is folded into rsqrtss.
2068  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2069  %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a1)
2070  %3 = extractelement <4 x float> %2, i32 0
2071  %4 = insertelement <4 x float> %a0, float %3, i32 0
2072  ret <4 x float> %4
2073}
2074declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>)
2075
2076define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
2077; CHECK-LABEL: stack_fold_shufpd:
2078; CHECK:       # %bb.0:
2079; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2080; CHECK-NEXT:    #APP
2081; CHECK-NEXT:    nop
2082; CHECK-NEXT:    #NO_APP
2083; CHECK-NEXT:    shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2084; CHECK-NEXT:    # xmm0 = xmm0[1],mem[0]
2085; CHECK-NEXT:    xorpd %xmm1, %xmm1
2086; CHECK-NEXT:    addpd %xmm1, %xmm0
2087; CHECK-NEXT:    retq
; shufflevector <1,2> selects %a0[1] and %a1[0], matching shufpd $1 with the
; %a1 reload folded as the memory operand.
2088  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2089  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
2090  ; fadd forces execution domain
2091  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
2092  ret <2 x double> %3
2093}
2094
2095define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
2096; CHECK-LABEL: stack_fold_shufps:
2097; CHECK:       # %bb.0:
2098; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2099; CHECK-NEXT:    #APP
2100; CHECK-NEXT:    nop
2101; CHECK-NEXT:    #NO_APP
2102; CHECK-NEXT:    shufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2103; CHECK-NEXT:    # xmm0 = xmm0[0,2],mem[0,3]
2104; CHECK-NEXT:    retq
; shufflevector <0,2,4,7> maps to shufps $200 (0b11001000) with the spilled
; %a1 folded as the memory source.
2105  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2106  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
2107  ret <4 x float> %2
2108}
2109
2110define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
2111; CHECK-LABEL: stack_fold_sqrtpd:
2112; CHECK:       # %bb.0:
2113; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2114; CHECK-NEXT:    #APP
2115; CHECK-NEXT:    nop
2116; CHECK-NEXT:    #NO_APP
2117; CHECK-NEXT:    sqrtpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2118; CHECK-NEXT:    retq
; llvm.sqrt.v2f64 lowers to sqrtpd; the reload of %a0's spill slot is folded.
2119  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2120  %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
2121  ret <2 x double> %2
2122}
2123
2124define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
2125; CHECK-LABEL: stack_fold_sqrtps:
2126; CHECK:       # %bb.0:
2127; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2128; CHECK-NEXT:    #APP
2129; CHECK-NEXT:    nop
2130; CHECK-NEXT:    #NO_APP
2131; CHECK-NEXT:    sqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2132; CHECK-NEXT:    retq
; llvm.sqrt.v4f32 lowers to sqrtps; the reload of %a0's spill slot is folded.
2133  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2134  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
2135  ret <4 x float> %2
2136}
2137
2138define double @stack_fold_sqrtsd(double %a0) optsize {
2139; CHECK-LABEL: stack_fold_sqrtsd:
2140; CHECK:       # %bb.0:
2141; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2142; CHECK-NEXT:    #APP
2143; CHECK-NEXT:    nop
2144; CHECK-NEXT:    #NO_APP
2145; CHECK-NEXT:    xorps %xmm0, %xmm0
2146; CHECK-NEXT:    sqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
2147; CHECK-NEXT:    retq
; Scalar sqrt: xmm0 is zeroed first (breaks the false dependency on the
; destination), then the 8-byte reload of %a0 is folded into sqrtsd.
2148  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2149  %2 = call double @llvm.sqrt.f64(double %a0)
2150  ret double %2
2151}
2152declare double @llvm.sqrt.f64(double) nounwind readnone
2153
2154define <2 x double> @stack_fold_sqrtsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
2155; CHECK-LABEL: stack_fold_sqrtsd_int:
2156; CHECK:       # %bb.0:
2157; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2158; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2159; CHECK-NEXT:    #APP
2160; CHECK-NEXT:    nop
2161; CHECK-NEXT:    #NO_APP
2162; CHECK-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2163; CHECK-NEXT:    sqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2164; CHECK-NEXT:    retq
; Scalar-merge pattern: lane 0 is sqrt(%a1[0]) and the upper lane comes from
; %a0; %a0 is reloaded into xmm0 and %a1's slot folds into sqrtsd.
2165  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2166  %2 = extractelement <2 x double> %a1, i64 0
2167  %3 = call double @llvm.sqrt.f64(double %2)
2168  %4 = insertelement <2 x double> %a1, double %3, i64 0
2169  %5 = extractelement <2 x double> %4, i32 0
2170  %6 = insertelement <2 x double> %a0, double %5, i32 0
2171  ret <2 x double> %6
2172}
2173
2174define float @stack_fold_sqrtss(float %a0) minsize {
2175; CHECK-LABEL: stack_fold_sqrtss:
2176; CHECK:       # %bb.0:
2177; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2178; CHECK-NEXT:    #APP
2179; CHECK-NEXT:    nop
2180; CHECK-NEXT:    #NO_APP
2181; CHECK-NEXT:    sqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
2182; CHECK-NEXT:    retq
; llvm.sqrt.f32 lowers to sqrtss with the 4-byte reload of %a0 folded in.
2183  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2184  %2 = call float @llvm.sqrt.f32(float %a0)
2185  ret float %2
2186}
2187declare float @llvm.sqrt.f32(float) nounwind readnone
2188
2189define <4 x float> @stack_fold_sqrtss_int(<4 x float> %a0, <4 x float> %a1) optsize {
2190; CHECK-LABEL: stack_fold_sqrtss_int:
2191; CHECK:       # %bb.0:
2192; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2193; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2194; CHECK-NEXT:    #APP
2195; CHECK-NEXT:    nop
2196; CHECK-NEXT:    #NO_APP
2197; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2198; CHECK-NEXT:    sqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2199; CHECK-NEXT:    retq
; Scalar-merge pattern: lane 0 is sqrt(%a1[0]) and upper lanes come from %a0;
; %a0 is reloaded into xmm0 and %a1's slot folds into sqrtss.
2200  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2201  %2 = extractelement <4 x float> %a1, i64 0
2202  %3 = call float @llvm.sqrt.f32(float %2)
2203  %4 = insertelement <4 x float> %a1, float %3, i64 0
2204  %5 = extractelement <4 x float> %4, i32 0
2205  %6 = insertelement <4 x float> %a0, float %5, i32 0
2206  ret <4 x float> %6
2207}
2208
2209define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
2210; CHECK-LABEL: stack_fold_subpd:
2211; CHECK:       # %bb.0:
2212; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2213; CHECK-NEXT:    #APP
2214; CHECK-NEXT:    nop
2215; CHECK-NEXT:    #NO_APP
2216; CHECK-NEXT:    subpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2217; CHECK-NEXT:    retq
; fsub <2 x double> lowers to subpd with %a1's reload folded as the memory operand.
2218  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2219  %2 = fsub <2 x double> %a0, %a1
2220  ret <2 x double> %2
2221}
2222
2223define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
2224; CHECK-LABEL: stack_fold_subps:
2225; CHECK:       # %bb.0:
2226; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2227; CHECK-NEXT:    #APP
2228; CHECK-NEXT:    nop
2229; CHECK-NEXT:    #NO_APP
2230; CHECK-NEXT:    subps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2231; CHECK-NEXT:    retq
; fsub <4 x float> lowers to subps with %a1's reload folded as the memory operand.
2232  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2233  %2 = fsub <4 x float> %a0, %a1
2234  ret <4 x float> %2
2235}
2236
2237define double @stack_fold_subsd(double %a0, double %a1) {
2238; CHECK-LABEL: stack_fold_subsd:
2239; CHECK:       # %bb.0:
2240; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2241; CHECK-NEXT:    #APP
2242; CHECK-NEXT:    nop
2243; CHECK-NEXT:    #NO_APP
2244; CHECK-NEXT:    subsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
2245; CHECK-NEXT:    retq
; Scalar fsub lowers to subsd with the 8-byte reload of %a1 folded in.
2246  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2247  %2 = fsub double %a0, %a1
2248  ret double %2
2249}
2250
2251define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
2252; CHECK-LABEL: stack_fold_subsd_int:
2253; CHECK:       # %bb.0:
2254; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2255; CHECK-NEXT:    #APP
2256; CHECK-NEXT:    nop
2257; CHECK-NEXT:    #NO_APP
2258; CHECK-NEXT:    subsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2259; CHECK-NEXT:    retq
; Scalar-in-vector form: lane 0 becomes %a0[0] - %a1[0], upper lane kept from
; %a0 — matching subsd's merge semantics with %a1's spill slot folded.
2260  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2261  %2 = extractelement <2 x double> %a0, i32 0
2262  %3 = extractelement <2 x double> %a1, i32 0
2263  %4 = fsub double %2, %3
2264  %5 = insertelement <2 x double> %a0, double %4, i32 0
2265  ret <2 x double> %5
2266}
2267
2268define float @stack_fold_subss(float %a0, float %a1) {
2269; CHECK-LABEL: stack_fold_subss:
2270; CHECK:       # %bb.0:
2271; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2272; CHECK-NEXT:    #APP
2273; CHECK-NEXT:    nop
2274; CHECK-NEXT:    #NO_APP
2275; CHECK-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
2276; CHECK-NEXT:    retq
; Scalar fsub lowers to subss with the 4-byte reload of %a1 folded in.
2277  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2278  %2 = fsub float %a0, %a1
2279  ret float %2
2280}
2281
2282define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
2283; CHECK-LABEL: stack_fold_subss_int:
2284; CHECK:       # %bb.0:
2285; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2286; CHECK-NEXT:    #APP
2287; CHECK-NEXT:    nop
2288; CHECK-NEXT:    #NO_APP
2289; CHECK-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2290; CHECK-NEXT:    retq
; Scalar-in-vector form: lane 0 becomes %a0[0] - %a1[0], upper lanes kept from
; %a0 — matching subss's merge semantics with %a1's spill slot folded.
2291  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2292  %2 = extractelement <4 x float> %a0, i32 0
2293  %3 = extractelement <4 x float> %a1, i32 0
2294  %4 = fsub float %2, %3
2295  %5 = insertelement <4 x float> %a0, float %4, i32 0
2296  ret <4 x float> %5
2297}
2298
2299define i32 @stack_fold_ucomisd(double %a0, double %a1) {
2300; CHECK-LABEL: stack_fold_ucomisd:
2301; CHECK:       # %bb.0:
2302; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2303; CHECK-NEXT:    #APP
2304; CHECK-NEXT:    nop
2305; CHECK-NEXT:    #NO_APP
2306; CHECK-NEXT:    xorl %eax, %eax
2307; CHECK-NEXT:    ucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
2308; CHECK-NEXT:    sete %al
2309; CHECK-NEXT:    leal -1(%rax,%rax), %eax
2310; CHECK-NEXT:    retq
; fcmp ueq + select {1,-1} lowers to ucomisd (folded 8-byte reload) followed by
; sete and lea to materialize 1 or -1.
2311  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2312  %2 = fcmp ueq double %a0, %a1
2313  %3 = select i1 %2, i32 1, i32 -1
2314  ret i32 %3
2315}
2316
2317define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
2318; CHECK-LABEL: stack_fold_ucomisd_int:
2319; CHECK:       # %bb.0:
2320; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2321; CHECK-NEXT:    #APP
2322; CHECK-NEXT:    nop
2323; CHECK-NEXT:    #NO_APP
2324; CHECK-NEXT:    ucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2325; CHECK-NEXT:    setnp %al
2326; CHECK-NEXT:    sete %cl
2327; CHECK-NEXT:    andb %al, %cl
2328; CHECK-NEXT:    movzbl %cl, %eax
2329; CHECK-NEXT:    retq
; Intrinsic ucomieq.sd: folded ucomisd plus setnp/sete/andb to combine the
; not-parity (ordered) and equal conditions into a 0/1 result.
2330  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2331  %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
2332  ret i32 %2
2333}
2334declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
2335
2336define i32 @stack_fold_ucomiss(float %a0, float %a1) {
2337; CHECK-LABEL: stack_fold_ucomiss:
2338; CHECK:       # %bb.0:
2339; CHECK-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2340; CHECK-NEXT:    #APP
2341; CHECK-NEXT:    nop
2342; CHECK-NEXT:    #NO_APP
2343; CHECK-NEXT:    xorl %eax, %eax
2344; CHECK-NEXT:    ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
2345; CHECK-NEXT:    sete %al
2346; CHECK-NEXT:    leal -1(%rax,%rax), %eax
2347; CHECK-NEXT:    retq
; fcmp ueq + select {1,-1} lowers to ucomiss (folded 4-byte reload) followed by
; sete and lea to materialize 1 or -1.
2348  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2349  %2 = fcmp ueq float %a0, %a1
2350  %3 = select i1 %2, i32 1, i32 -1
2351  ret i32 %3
2352}
2353
2354define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
2355; CHECK-LABEL: stack_fold_ucomiss_int:
2356; CHECK:       # %bb.0:
2357; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2358; CHECK-NEXT:    #APP
2359; CHECK-NEXT:    nop
2360; CHECK-NEXT:    #NO_APP
2361; CHECK-NEXT:    ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2362; CHECK-NEXT:    setnp %al
2363; CHECK-NEXT:    sete %cl
2364; CHECK-NEXT:    andb %al, %cl
2365; CHECK-NEXT:    movzbl %cl, %eax
2366; CHECK-NEXT:    retq
; Intrinsic ucomieq.ss: folded ucomiss plus setnp/sete/andb to combine the
; not-parity (ordered) and equal conditions into a 0/1 result.
2367  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2368  %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
2369  ret i32 %2
2370}
2371declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
2372
2373define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
2374; CHECK-LABEL: stack_fold_unpckhpd:
2375; CHECK:       # %bb.0:
2376; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2377; CHECK-NEXT:    #APP
2378; CHECK-NEXT:    nop
2379; CHECK-NEXT:    #NO_APP
2380; CHECK-NEXT:    unpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2381; CHECK-NEXT:    # xmm0 = xmm0[1],mem[1]
2382; CHECK-NEXT:    xorpd %xmm1, %xmm1
2383; CHECK-NEXT:    addpd %xmm1, %xmm0
2384; CHECK-NEXT:    retq
; shufflevector <1,3> (high halves of %a0 and %a1) maps to unpckhpd with %a1's
; reload folded as the memory operand.
2385  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2386  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
2387  ; fadd forces execution domain
2388  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
2389  ret <2 x double> %3
2390}
2391
2392define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
2393; CHECK-LABEL: stack_fold_unpckhps:
2394; CHECK:       # %bb.0:
2395; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2396; CHECK-NEXT:    #APP
2397; CHECK-NEXT:    nop
2398; CHECK-NEXT:    #NO_APP
2399; CHECK-NEXT:    unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2400; CHECK-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
2401; CHECK-NEXT:    xorps %xmm1, %xmm1
2402; CHECK-NEXT:    addps %xmm1, %xmm0
2403; CHECK-NEXT:    retq
; shufflevector <2,6,3,7> (interleave high elements) maps to unpckhps with
; %a1's reload folded as the memory operand.
2404  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2405  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2406  ; fadd forces execution domain
2407  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
2408  ret <4 x float> %3
2409}
2410
2411define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
2412; CHECK-LABEL: stack_fold_unpcklpd:
2413; CHECK:       # %bb.0:
2414; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2415; CHECK-NEXT:    #APP
2416; CHECK-NEXT:    nop
2417; CHECK-NEXT:    #NO_APP
2418; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2419; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
2420; CHECK-NEXT:    xorpd %xmm1, %xmm1
2421; CHECK-NEXT:    addpd %xmm1, %xmm0
2422; CHECK-NEXT:    retq
; shufflevector <0,2> (low halves of %a0 and %a1) maps to unpcklpd with %a1's
; reload folded as the memory operand.
2423  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2424  %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
2425  ; fadd forces execution domain
2426  %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
2427  ret <2 x double> %3
2428}
2429
2430define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
2431; CHECK-LABEL: stack_fold_unpcklps:
2432; CHECK:       # %bb.0:
2433; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2434; CHECK-NEXT:    #APP
2435; CHECK-NEXT:    nop
2436; CHECK-NEXT:    #NO_APP
2437; CHECK-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2438; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
2439; CHECK-NEXT:    xorps %xmm1, %xmm1
2440; CHECK-NEXT:    addps %xmm1, %xmm0
2441; CHECK-NEXT:    retq
; shufflevector <0,4,1,5> (interleave low elements) maps to unpcklps with
; %a1's reload folded as the memory operand.
2442  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2443  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2444  ; fadd forces execution domain
2445  %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
2446  ret <4 x float> %3
2447}
2448
2449define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
2450; CHECK-LABEL: stack_fold_xorpd:
2451; CHECK:       # %bb.0:
2452; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2453; CHECK-NEXT:    #APP
2454; CHECK-NEXT:    nop
2455; CHECK-NEXT:    #NO_APP
2456; CHECK-NEXT:    xorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2457; CHECK-NEXT:    xorpd %xmm1, %xmm1
2458; CHECK-NEXT:    addpd %xmm1, %xmm0
2459; CHECK-NEXT:    retq
; Integer xor of the bitcast doubles selects the FP-domain xorpd form (the
; trailing fadd pins the double domain); %a1's reload is folded.
2460  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2461  %2 = bitcast <2 x double> %a0 to <2 x i64>
2462  %3 = bitcast <2 x double> %a1 to <2 x i64>
2463  %4 = xor <2 x i64> %2, %3
2464  %5 = bitcast <2 x i64> %4 to <2 x double>
2465  ; fadd forces execution domain
2466  %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
2467  ret <2 x double> %6
2468}
2469
2470define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
2471; CHECK-LABEL: stack_fold_xorps:
2472; CHECK:       # %bb.0:
2473; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2474; CHECK-NEXT:    #APP
2475; CHECK-NEXT:    nop
2476; CHECK-NEXT:    #NO_APP
2477; CHECK-NEXT:    xorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2478; CHECK-NEXT:    xorps %xmm1, %xmm1
2479; CHECK-NEXT:    addps %xmm1, %xmm0
2480; CHECK-NEXT:    retq
; Integer xor of the bitcast floats selects the FP-domain xorps form (the
; trailing fadd pins the float domain); %a1's reload is folded.
2481  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2482  %2 = bitcast <4 x float> %a0 to <2 x i64>
2483  %3 = bitcast <4 x float> %a1 to <2 x i64>
2484  %4 = xor <2 x i64> %2, %3
2485  %5 = bitcast <2 x i64> %4 to <4 x float>
2486  ; fadd forces execution domain
2487  %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
2488  ret <4 x float> %6
2489}
2490
2491declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
2492declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
2493
2494attributes #0 = { "unsafe-fp-math"="false" }
2495attributes #1 = { "unsafe-fp-math"="true" }
2496