xref: /llvm-project/llvm/test/CodeGen/X86/stack-folding-mmx.ll (revision b7e4fba6e5dcae5ff51f8eced21470a1b3ccd895)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s
3
; Stack-folding test for cvtpd2pi (@llvm.x86.sse.cvtpd2pi).
; The inline asm "nop" defines an XMM value ("=x") and clobbers xmm1-xmm15 and
; flags. Per the checked output, %a0 (xmm0) is spilled across the asm and
; cvtpd2pi consumes it as a 16-byte folded reload.
4define <1 x i64> @stack_fold_cvtpd2pi(<2 x double> %a0) {
5; CHECK-LABEL: stack_fold_cvtpd2pi:
6; CHECK:       # %bb.0:
7; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8; CHECK-NEXT:    #APP
9; CHECK-NEXT:    nop
10; CHECK-NEXT:    #NO_APP
11; CHECK-NEXT:    cvtpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
12; CHECK-NEXT:    movq %mm0, %rax
13; CHECK-NEXT:    retq
; Each clobbered register is listed exactly once.
14  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
15  %2 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
16  ret <1 x i64> %2
17}
18declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
19
; Stack-folding test for cvtpi2pd (@llvm.x86.sse.cvtpi2pd).
; The inline asm "nop" defines an MMX value ("=y") and clobbers mm1-mm7.
; Per the checked output, %a0 is moved from rdi into mm0, spilled across the
; asm, and cvtpi2pd consumes it as an 8-byte folded reload.
20define <2 x double> @stack_fold_cvtpi2pd(<1 x i64> %a0) {
21; CHECK-LABEL: stack_fold_cvtpi2pd:
22; CHECK:       # %bb.0:
23; CHECK-NEXT:    movq %rdi, %mm0
24; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
25; CHECK-NEXT:    #APP
26; CHECK-NEXT:    nop
27; CHECK-NEXT:    #NO_APP
28; CHECK-NEXT:    cvtpi2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
29; CHECK-NEXT:    retq
; Each clobbered register is listed exactly once.
30  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
31  %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %a0) nounwind readnone
32  ret <2 x double> %2
33}
34declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone
35
; Stack-folding test for cvtpi2ps (@llvm.x86.sse.cvtpi2ps).
; The inline asm "nop" defines an MMX value ("=y") and clobbers mm1-mm7.
; Per the checked output, %a1 is moved from rdi into mm0, spilled across the
; asm, and cvtpi2ps consumes it as an 8-byte folded reload into xmm0.
36define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) {
37; CHECK-LABEL: stack_fold_cvtpi2ps:
38; CHECK:       # %bb.0:
39; CHECK-NEXT:    movq %rdi, %mm0
40; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
41; CHECK-NEXT:    #APP
42; CHECK-NEXT:    nop
43; CHECK-NEXT:    #NO_APP
44; CHECK-NEXT:    cvtpi2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
45; CHECK-NEXT:    retq
; Each clobbered register is listed exactly once.
46  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
47  %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) nounwind readnone
48  ret <4 x float> %2
49}
50declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone
51
; Stack-folding test for cvtps2pi (@llvm.x86.sse.cvtps2pi).
; The inline asm "nop" defines an XMM value ("=x") and clobbers xmm1-xmm15 and
; flags. Per the checked output, %a0 (xmm0) is spilled across the asm and
; cvtps2pi consumes it as a 16-byte folded reload.
52define <1 x i64> @stack_fold_cvtps2pi(<4 x float> %a0) {
53; CHECK-LABEL: stack_fold_cvtps2pi:
54; CHECK:       # %bb.0:
55; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
56; CHECK-NEXT:    #APP
57; CHECK-NEXT:    nop
58; CHECK-NEXT:    #NO_APP
59; CHECK-NEXT:    cvtps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
60; CHECK-NEXT:    movq %mm0, %rax
61; CHECK-NEXT:    retq
; Each clobbered register is listed exactly once.
62  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
63  %2 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
64  ret <1 x i64> %2
65}
66declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
67
; Stack-folding test for cvttpd2pi (@llvm.x86.sse.cvttpd2pi).
; The inline asm "nop" defines an XMM value ("=x") and clobbers xmm1-xmm15 and
; flags. Per the checked output, %a0 (xmm0) is spilled across the asm and
; cvttpd2pi consumes it as a 16-byte folded reload.
68define <1 x i64> @stack_fold_cvttpd2pi(<2 x double> %a0) {
69; CHECK-LABEL: stack_fold_cvttpd2pi:
70; CHECK:       # %bb.0:
71; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
72; CHECK-NEXT:    #APP
73; CHECK-NEXT:    nop
74; CHECK-NEXT:    #NO_APP
75; CHECK-NEXT:    cvttpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
76; CHECK-NEXT:    movq %mm0, %rax
77; CHECK-NEXT:    retq
; Each clobbered register is listed exactly once.
78  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
79  %2 = call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
80  ret <1 x i64> %2
81}
82declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
83
; Stack-folding test for cvttps2pi (@llvm.x86.sse.cvttps2pi).
; The inline asm "nop" defines an XMM value ("=x") and clobbers xmm1-xmm15 and
; flags. Per the checked output, %a0 (xmm0) is spilled across the asm and
; cvttps2pi consumes it as a 16-byte folded reload.
84define <1 x i64> @stack_fold_cvttps2pi(<4 x float> %a0) {
85; CHECK-LABEL: stack_fold_cvttps2pi:
86; CHECK:       # %bb.0:
87; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
88; CHECK-NEXT:    #APP
89; CHECK-NEXT:    nop
90; CHECK-NEXT:    #NO_APP
91; CHECK-NEXT:    cvttps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
92; CHECK-NEXT:    movq %mm0, %rax
93; CHECK-NEXT:    retq
; Each clobbered register is listed exactly once.
94  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
95  %2 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
96  ret <1 x i64> %2
97}
98declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
99
100; TODO stack_fold_movd_load
101
; Stack-folding test for the MMX movd store form.
; The paddb result is bitcast to <2 x i32> and element 0 is returned. The
; inline asm "nop" clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15, so
; the i32 cannot stay in a general-purpose register across it. Per the checked
; output, movd stores %mm0 straight to a 4-byte stack slot before the asm and
; the value is reloaded into eax afterwards.
102; padd forces execution on mmx
103define i32 @stack_fold_movd_store(<1 x i64> %a0) nounwind {
104; CHECK-LABEL: stack_fold_movd_store:
105; CHECK:       # %bb.0:
106; CHECK-NEXT:    pushq %rbp
107; CHECK-NEXT:    pushq %r15
108; CHECK-NEXT:    pushq %r14
109; CHECK-NEXT:    pushq %r13
110; CHECK-NEXT:    pushq %r12
111; CHECK-NEXT:    pushq %rbx
112; CHECK-NEXT:    movq %rdi, %mm0
113; CHECK-NEXT:    paddb %mm0, %mm0
114; CHECK-NEXT:    movd %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
115; CHECK-NEXT:    #APP
116; CHECK-NEXT:    nop
117; CHECK-NEXT:    #NO_APP
118; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
119; CHECK-NEXT:    popq %rbx
120; CHECK-NEXT:    popq %r12
121; CHECK-NEXT:    popq %r13
122; CHECK-NEXT:    popq %r14
123; CHECK-NEXT:    popq %r15
124; CHECK-NEXT:    popq %rbp
125; CHECK-NEXT:    retq
126  %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0)
127  %2 = bitcast <1 x i64> %1 to <2 x i32>
128  %3 = extractelement <2 x i32> %2, i32 0
; The asm result (%4) is unused; the asm exists only for its clobber list.
129  %4 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
130  ret i32 %3
131}
132
133; TODO stack_fold_movq_load
134
; Stack-folding test for the MMX movq store form.
; The paddb result is bitcast to i64 and returned. The inline asm "nop"
; clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15, so the i64 cannot stay
; in a general-purpose register across it. Per the checked output, movq stores
; %mm0 straight to an 8-byte stack slot before the asm and the value is
; reloaded into rax afterwards.
135; padd forces execution on mmx
136define i64 @stack_fold_movq_store(<1 x i64> %a0) nounwind {
137; CHECK-LABEL: stack_fold_movq_store:
138; CHECK:       # %bb.0:
139; CHECK-NEXT:    pushq %rbp
140; CHECK-NEXT:    pushq %r15
141; CHECK-NEXT:    pushq %r14
142; CHECK-NEXT:    pushq %r13
143; CHECK-NEXT:    pushq %r12
144; CHECK-NEXT:    pushq %rbx
145; CHECK-NEXT:    movq %rdi, %mm0
146; CHECK-NEXT:    paddb %mm0, %mm0
147; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
148; CHECK-NEXT:    #APP
149; CHECK-NEXT:    nop
150; CHECK-NEXT:    #NO_APP
151; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
152; CHECK-NEXT:    popq %rbx
153; CHECK-NEXT:    popq %r12
154; CHECK-NEXT:    popq %r13
155; CHECK-NEXT:    popq %r14
156; CHECK-NEXT:    popq %r15
157; CHECK-NEXT:    popq %rbp
158; CHECK-NEXT:    retq
159  %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0)
160  %2 = bitcast <1 x i64> %1 to i64
; The asm result (%3) is unused; the asm exists only for its clobber list.
161  %3 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
162  ret i64 %2
163}
164
; Stack-folding test for pabsb (@llvm.x86.ssse3.pabs.b).
; The inline asm "nop" defines an MMX value ("=y") and clobbers mm1-mm7.
; Per the checked output, %a0 is moved from rdi into mm0, spilled across the
; asm, and pabsb consumes it as an 8-byte folded reload.
165define <1 x i64> @stack_fold_pabsb(<1 x i64> %a0) {
166; CHECK-LABEL: stack_fold_pabsb:
167; CHECK:       # %bb.0:
168; CHECK-NEXT:    movq %rdi, %mm0
169; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
170; CHECK-NEXT:    #APP
171; CHECK-NEXT:    nop
172; CHECK-NEXT:    #NO_APP
173; CHECK-NEXT:    pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
174; CHECK-NEXT:    movq %mm0, %rax
175; CHECK-NEXT:    retq
176  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
177  %2 = call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %a0) nounwind readnone
178  ret <1 x i64> %2
179}
180declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone
181
; Stack-folding test for pabsd (@llvm.x86.ssse3.pabs.d).
; The inline asm "nop" defines an MMX value ("=y") and clobbers mm1-mm7.
; Per the checked output, %a0 is moved from rdi into mm0, spilled across the
; asm, and pabsd consumes it as an 8-byte folded reload.
182define <1 x i64> @stack_fold_pabsd(<1 x i64> %a0) {
183; CHECK-LABEL: stack_fold_pabsd:
184; CHECK:       # %bb.0:
185; CHECK-NEXT:    movq %rdi, %mm0
186; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
187; CHECK-NEXT:    #APP
188; CHECK-NEXT:    nop
189; CHECK-NEXT:    #NO_APP
190; CHECK-NEXT:    pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
191; CHECK-NEXT:    movq %mm0, %rax
192; CHECK-NEXT:    retq
193  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
194  %2 = call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %a0) nounwind readnone
195  ret <1 x i64> %2
196}
197declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone
198
; Stack-folding test for pabsw (@llvm.x86.ssse3.pabs.w).
; The inline asm "nop" defines an MMX value ("=y") and clobbers mm1-mm7.
; Per the checked output, %a0 is moved from rdi into mm0, spilled across the
; asm, and pabsw consumes it as an 8-byte folded reload.
199define <1 x i64> @stack_fold_pabsw(<1 x i64> %a0) {
200; CHECK-LABEL: stack_fold_pabsw:
201; CHECK:       # %bb.0:
202; CHECK-NEXT:    movq %rdi, %mm0
203; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
204; CHECK-NEXT:    #APP
205; CHECK-NEXT:    nop
206; CHECK-NEXT:    #NO_APP
207; CHECK-NEXT:    pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
208; CHECK-NEXT:    movq %mm0, %rax
209; CHECK-NEXT:    retq
210  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
211  %2 = call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %a0) nounwind readnone
212  ret <1 x i64> %2
213}
214declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone
215
; Stack-folding test for packssdw (@llvm.x86.mmx.packssdw). The inline asm
; "nop" defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does packssdw register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
216define <1 x i64> @stack_fold_packssdw(<1 x i64> %a, <1 x i64> %b) {
217; CHECK-LABEL: stack_fold_packssdw:
218; CHECK:       # %bb.0:
219; CHECK-NEXT:    movq %rsi, %mm0
220; CHECK-NEXT:    movq %rdi, %mm1
221; CHECK-NEXT:    packssdw %mm0, %mm1
222; CHECK-NEXT:    movq %mm1, %rax
223; CHECK-NEXT:    #APP
224; CHECK-NEXT:    nop
225; CHECK-NEXT:    #NO_APP
226; CHECK-NEXT:    retq
227  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
228  %2 = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
229  ret <1 x i64> %2
230}
231declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone
232
; Stack-folding test for packsswb (@llvm.x86.mmx.packsswb). The inline asm
; "nop" defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does packsswb register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
233define <1 x i64> @stack_fold_packsswb(<1 x i64> %a, <1 x i64> %b) {
234; CHECK-LABEL: stack_fold_packsswb:
235; CHECK:       # %bb.0:
236; CHECK-NEXT:    movq %rsi, %mm0
237; CHECK-NEXT:    movq %rdi, %mm1
238; CHECK-NEXT:    packsswb %mm0, %mm1
239; CHECK-NEXT:    movq %mm1, %rax
240; CHECK-NEXT:    #APP
241; CHECK-NEXT:    nop
242; CHECK-NEXT:    #NO_APP
243; CHECK-NEXT:    retq
244  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
245  %2 = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone
246  ret <1 x i64> %2
247}
248declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone
249
; Stack-folding test for packuswb (@llvm.x86.mmx.packuswb). The inline asm
; "nop" defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does packuswb register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
250define <1 x i64> @stack_fold_packuswb(<1 x i64> %a, <1 x i64> %b) {
251; CHECK-LABEL: stack_fold_packuswb:
252; CHECK:       # %bb.0:
253; CHECK-NEXT:    movq %rsi, %mm0
254; CHECK-NEXT:    movq %rdi, %mm1
255; CHECK-NEXT:    packuswb %mm0, %mm1
256; CHECK-NEXT:    movq %mm1, %rax
257; CHECK-NEXT:    #APP
258; CHECK-NEXT:    nop
259; CHECK-NEXT:    #NO_APP
260; CHECK-NEXT:    retq
261  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
262  %2 = call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone
263  ret <1 x i64> %2
264}
265declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
266
; Stack-folding test for paddb (@llvm.x86.mmx.padd.b). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddb register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
267define <1 x i64> @stack_fold_paddb(<1 x i64> %a, <1 x i64> %b) {
268; CHECK-LABEL: stack_fold_paddb:
269; CHECK:       # %bb.0:
270; CHECK-NEXT:    movq %rsi, %mm0
271; CHECK-NEXT:    movq %rdi, %mm1
272; CHECK-NEXT:    paddb %mm0, %mm1
273; CHECK-NEXT:    movq %mm1, %rax
274; CHECK-NEXT:    #APP
275; CHECK-NEXT:    nop
276; CHECK-NEXT:    #NO_APP
277; CHECK-NEXT:    retq
278  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
279  %2 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
280  ret <1 x i64> %2
281}
282declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone
283
; Stack-folding test for paddd (@llvm.x86.mmx.padd.d). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddd register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
284define <1 x i64> @stack_fold_paddd(<1 x i64> %a, <1 x i64> %b) {
285; CHECK-LABEL: stack_fold_paddd:
286; CHECK:       # %bb.0:
287; CHECK-NEXT:    movq %rsi, %mm0
288; CHECK-NEXT:    movq %rdi, %mm1
289; CHECK-NEXT:    paddd %mm0, %mm1
290; CHECK-NEXT:    movq %mm1, %rax
291; CHECK-NEXT:    #APP
292; CHECK-NEXT:    nop
293; CHECK-NEXT:    #NO_APP
294; CHECK-NEXT:    retq
295  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
296  %2 = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
297  ret <1 x i64> %2
298}
299declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone
300
; Stack-folding test for paddq (@llvm.x86.mmx.padd.q). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddq register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
301define <1 x i64> @stack_fold_paddq(<1 x i64> %a, <1 x i64> %b) {
302; CHECK-LABEL: stack_fold_paddq:
303; CHECK:       # %bb.0:
304; CHECK-NEXT:    movq %rsi, %mm0
305; CHECK-NEXT:    movq %rdi, %mm1
306; CHECK-NEXT:    paddq %mm0, %mm1
307; CHECK-NEXT:    movq %mm1, %rax
308; CHECK-NEXT:    #APP
309; CHECK-NEXT:    nop
310; CHECK-NEXT:    #NO_APP
311; CHECK-NEXT:    retq
312  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
313  %2 = call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
314  ret <1 x i64> %2
315}
316declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone
317
; Stack-folding test for paddsb (@llvm.x86.mmx.padds.b). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddsb register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
318define <1 x i64> @stack_fold_paddsb(<1 x i64> %a, <1 x i64> %b) {
319; CHECK-LABEL: stack_fold_paddsb:
320; CHECK:       # %bb.0:
321; CHECK-NEXT:    movq %rsi, %mm0
322; CHECK-NEXT:    movq %rdi, %mm1
323; CHECK-NEXT:    paddsb %mm0, %mm1
324; CHECK-NEXT:    movq %mm1, %rax
325; CHECK-NEXT:    #APP
326; CHECK-NEXT:    nop
327; CHECK-NEXT:    #NO_APP
328; CHECK-NEXT:    retq
329  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
330  %2 = call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
331  ret <1 x i64> %2
332}
333declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone
334
; Stack-folding test for paddsw (@llvm.x86.mmx.padds.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddsw register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
335define <1 x i64> @stack_fold_paddsw(<1 x i64> %a, <1 x i64> %b) {
336; CHECK-LABEL: stack_fold_paddsw:
337; CHECK:       # %bb.0:
338; CHECK-NEXT:    movq %rsi, %mm0
339; CHECK-NEXT:    movq %rdi, %mm1
340; CHECK-NEXT:    paddsw %mm0, %mm1
341; CHECK-NEXT:    movq %mm1, %rax
342; CHECK-NEXT:    #APP
343; CHECK-NEXT:    nop
344; CHECK-NEXT:    #NO_APP
345; CHECK-NEXT:    retq
346  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
347  %2 = call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
348  ret <1 x i64> %2
349}
350declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone
351
; Stack-folding test for paddusb (@llvm.x86.mmx.paddus.b). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddusb register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
352define <1 x i64> @stack_fold_paddusb(<1 x i64> %a, <1 x i64> %b) {
353; CHECK-LABEL: stack_fold_paddusb:
354; CHECK:       # %bb.0:
355; CHECK-NEXT:    movq %rsi, %mm0
356; CHECK-NEXT:    movq %rdi, %mm1
357; CHECK-NEXT:    paddusb %mm0, %mm1
358; CHECK-NEXT:    movq %mm1, %rax
359; CHECK-NEXT:    #APP
360; CHECK-NEXT:    nop
361; CHECK-NEXT:    #NO_APP
362; CHECK-NEXT:    retq
363  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
364  %2 = call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
365  ret <1 x i64> %2
366}
367declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone
368
; Stack-folding test for paddusw (@llvm.x86.mmx.paddus.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddusw register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
369define <1 x i64> @stack_fold_paddusw(<1 x i64> %a, <1 x i64> %b) {
370; CHECK-LABEL: stack_fold_paddusw:
371; CHECK:       # %bb.0:
372; CHECK-NEXT:    movq %rsi, %mm0
373; CHECK-NEXT:    movq %rdi, %mm1
374; CHECK-NEXT:    paddusw %mm0, %mm1
375; CHECK-NEXT:    movq %mm1, %rax
376; CHECK-NEXT:    #APP
377; CHECK-NEXT:    nop
378; CHECK-NEXT:    #NO_APP
379; CHECK-NEXT:    retq
380  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
381  %2 = call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
382  ret <1 x i64> %2
383}
384declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone
385
; Stack-folding test for paddw (@llvm.x86.mmx.padd.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does paddw register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
386define <1 x i64> @stack_fold_paddw(<1 x i64> %a, <1 x i64> %b) {
387; CHECK-LABEL: stack_fold_paddw:
388; CHECK:       # %bb.0:
389; CHECK-NEXT:    movq %rsi, %mm0
390; CHECK-NEXT:    movq %rdi, %mm1
391; CHECK-NEXT:    paddw %mm0, %mm1
392; CHECK-NEXT:    movq %mm1, %rax
393; CHECK-NEXT:    #APP
394; CHECK-NEXT:    nop
395; CHECK-NEXT:    #NO_APP
396; CHECK-NEXT:    retq
397  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
398  %2 = call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
399  ret <1 x i64> %2
400}
401declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone
402
; Stack-folding test for palignr with immediate 1 (@llvm.x86.mmx.palignr.b).
; The inline asm "nop" defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does palignr register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
403define <1 x i64> @stack_fold_palignr(<1 x i64> %a, <1 x i64> %b) {
404; CHECK-LABEL: stack_fold_palignr:
405; CHECK:       # %bb.0:
406; CHECK-NEXT:    movq %rsi, %mm0
407; CHECK-NEXT:    movq %rdi, %mm1
408; CHECK-NEXT:    palignr $1, %mm0, %mm1
409; CHECK-NEXT:    movq %mm1, %rax
410; CHECK-NEXT:    #APP
411; CHECK-NEXT:    nop
412; CHECK-NEXT:    #NO_APP
413; CHECK-NEXT:    retq
414  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
415  %2 = call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %a, <1 x i64> %b, i8 1) nounwind readnone
416  ret <1 x i64> %2
417}
418declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone
419
; Stack-folding test for pand (@llvm.x86.mmx.pand). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pand register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
420define <1 x i64> @stack_fold_pand(<1 x i64> %a, <1 x i64> %b) {
421; CHECK-LABEL: stack_fold_pand:
422; CHECK:       # %bb.0:
423; CHECK-NEXT:    movq %rsi, %mm0
424; CHECK-NEXT:    movq %rdi, %mm1
425; CHECK-NEXT:    pand %mm0, %mm1
426; CHECK-NEXT:    movq %mm1, %rax
427; CHECK-NEXT:    #APP
428; CHECK-NEXT:    nop
429; CHECK-NEXT:    #NO_APP
430; CHECK-NEXT:    retq
431  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
432  %2 = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %a, <1 x i64> %b) nounwind readnone
433  ret <1 x i64> %2
434}
435declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone
436
; Stack-folding test for pandn (@llvm.x86.mmx.pandn). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pandn register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
437define <1 x i64> @stack_fold_pandn(<1 x i64> %a, <1 x i64> %b) {
438; CHECK-LABEL: stack_fold_pandn:
439; CHECK:       # %bb.0:
440; CHECK-NEXT:    movq %rsi, %mm0
441; CHECK-NEXT:    movq %rdi, %mm1
442; CHECK-NEXT:    pandn %mm0, %mm1
443; CHECK-NEXT:    movq %mm1, %rax
444; CHECK-NEXT:    #APP
445; CHECK-NEXT:    nop
446; CHECK-NEXT:    #NO_APP
447; CHECK-NEXT:    retq
448  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
449  %2 = call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %a, <1 x i64> %b) nounwind readnone
450  ret <1 x i64> %2
451}
452declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone
453
; Stack-folding test for pavgb (@llvm.x86.mmx.pavg.b). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pavgb register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
454define <1 x i64> @stack_fold_pavgb(<1 x i64> %a, <1 x i64> %b) {
455; CHECK-LABEL: stack_fold_pavgb:
456; CHECK:       # %bb.0:
457; CHECK-NEXT:    movq %rsi, %mm0
458; CHECK-NEXT:    movq %rdi, %mm1
459; CHECK-NEXT:    pavgb %mm0, %mm1
460; CHECK-NEXT:    movq %mm1, %rax
461; CHECK-NEXT:    #APP
462; CHECK-NEXT:    nop
463; CHECK-NEXT:    #NO_APP
464; CHECK-NEXT:    retq
465  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
466  %2 = call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
467  ret <1 x i64> %2
468}
469declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone
470
; Stack-folding test for pavgw (@llvm.x86.mmx.pavg.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pavgw register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
471define <1 x i64> @stack_fold_pavgw(<1 x i64> %a, <1 x i64> %b) {
472; CHECK-LABEL: stack_fold_pavgw:
473; CHECK:       # %bb.0:
474; CHECK-NEXT:    movq %rsi, %mm0
475; CHECK-NEXT:    movq %rdi, %mm1
476; CHECK-NEXT:    pavgw %mm0, %mm1
477; CHECK-NEXT:    movq %mm1, %rax
478; CHECK-NEXT:    #APP
479; CHECK-NEXT:    nop
480; CHECK-NEXT:    #NO_APP
481; CHECK-NEXT:    retq
482  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
483  %2 = call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
484  ret <1 x i64> %2
485}
486declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone
487
; Stack-folding test for pcmpeqb (@llvm.x86.mmx.pcmpeq.b). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pcmpeqb register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
488define <1 x i64> @stack_fold_pcmpeqb(<1 x i64> %a, <1 x i64> %b) {
489; CHECK-LABEL: stack_fold_pcmpeqb:
490; CHECK:       # %bb.0:
491; CHECK-NEXT:    movq %rsi, %mm0
492; CHECK-NEXT:    movq %rdi, %mm1
493; CHECK-NEXT:    pcmpeqb %mm0, %mm1
494; CHECK-NEXT:    movq %mm1, %rax
495; CHECK-NEXT:    #APP
496; CHECK-NEXT:    nop
497; CHECK-NEXT:    #NO_APP
498; CHECK-NEXT:    retq
499  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
500  %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
501  ret <1 x i64> %2
502}
503declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone
504
; Stack-folding test for pcmpeqd (@llvm.x86.mmx.pcmpeq.d). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pcmpeqd register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
505define <1 x i64> @stack_fold_pcmpeqd(<1 x i64> %a, <1 x i64> %b) {
506; CHECK-LABEL: stack_fold_pcmpeqd:
507; CHECK:       # %bb.0:
508; CHECK-NEXT:    movq %rsi, %mm0
509; CHECK-NEXT:    movq %rdi, %mm1
510; CHECK-NEXT:    pcmpeqd %mm0, %mm1
511; CHECK-NEXT:    movq %mm1, %rax
512; CHECK-NEXT:    #APP
513; CHECK-NEXT:    nop
514; CHECK-NEXT:    #NO_APP
515; CHECK-NEXT:    retq
516  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
517  %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
518  ret <1 x i64> %2
519}
520declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone
521
; Stack-folding test for pcmpeqw (@llvm.x86.mmx.pcmpeq.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pcmpeqw register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
522define <1 x i64> @stack_fold_pcmpeqw(<1 x i64> %a, <1 x i64> %b) {
523; CHECK-LABEL: stack_fold_pcmpeqw:
524; CHECK:       # %bb.0:
525; CHECK-NEXT:    movq %rsi, %mm0
526; CHECK-NEXT:    movq %rdi, %mm1
527; CHECK-NEXT:    pcmpeqw %mm0, %mm1
528; CHECK-NEXT:    movq %mm1, %rax
529; CHECK-NEXT:    #APP
530; CHECK-NEXT:    nop
531; CHECK-NEXT:    #NO_APP
532; CHECK-NEXT:    retq
533  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
534  %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
535  ret <1 x i64> %2
536}
537declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone
538
; Stack-folding test for pcmpgtb (@llvm.x86.mmx.pcmpgt.b). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pcmpgtb register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
539define <1 x i64> @stack_fold_pcmpgtb(<1 x i64> %a, <1 x i64> %b) {
540; CHECK-LABEL: stack_fold_pcmpgtb:
541; CHECK:       # %bb.0:
542; CHECK-NEXT:    movq %rsi, %mm0
543; CHECK-NEXT:    movq %rdi, %mm1
544; CHECK-NEXT:    pcmpgtb %mm0, %mm1
545; CHECK-NEXT:    movq %mm1, %rax
546; CHECK-NEXT:    #APP
547; CHECK-NEXT:    nop
548; CHECK-NEXT:    #NO_APP
549; CHECK-NEXT:    retq
550  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
551  %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
552  ret <1 x i64> %2
553}
554declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone
555
; Stack-folding test for pcmpgtd (@llvm.x86.mmx.pcmpgt.d). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pcmpgtd register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
556define <1 x i64> @stack_fold_pcmpgtd(<1 x i64> %a, <1 x i64> %b) {
557; CHECK-LABEL: stack_fold_pcmpgtd:
558; CHECK:       # %bb.0:
559; CHECK-NEXT:    movq %rsi, %mm0
560; CHECK-NEXT:    movq %rdi, %mm1
561; CHECK-NEXT:    pcmpgtd %mm0, %mm1
562; CHECK-NEXT:    movq %mm1, %rax
563; CHECK-NEXT:    #APP
564; CHECK-NEXT:    nop
565; CHECK-NEXT:    #NO_APP
566; CHECK-NEXT:    retq
567  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
568  %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
569  ret <1 x i64> %2
570}
571declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
572
; Stack-folding test for pcmpgtw (@llvm.x86.mmx.pcmpgt.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pcmpgtw register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
573define <1 x i64> @stack_fold_pcmpgtw(<1 x i64> %a, <1 x i64> %b) {
574; CHECK-LABEL: stack_fold_pcmpgtw:
575; CHECK:       # %bb.0:
576; CHECK-NEXT:    movq %rsi, %mm0
577; CHECK-NEXT:    movq %rdi, %mm1
578; CHECK-NEXT:    pcmpgtw %mm0, %mm1
579; CHECK-NEXT:    movq %mm1, %rax
580; CHECK-NEXT:    #APP
581; CHECK-NEXT:    nop
582; CHECK-NEXT:    #NO_APP
583; CHECK-NEXT:    retq
584  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
585  %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
586  ret <1 x i64> %2
587}
588declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone
589
; Stack-folding test for phaddd (@llvm.x86.ssse3.phadd.d). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does phaddd register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
590define <1 x i64> @stack_fold_phaddd(<1 x i64> %a, <1 x i64> %b) {
591; CHECK-LABEL: stack_fold_phaddd:
592; CHECK:       # %bb.0:
593; CHECK-NEXT:    movq %rsi, %mm0
594; CHECK-NEXT:    movq %rdi, %mm1
595; CHECK-NEXT:    phaddd %mm0, %mm1
596; CHECK-NEXT:    movq %mm1, %rax
597; CHECK-NEXT:    #APP
598; CHECK-NEXT:    nop
599; CHECK-NEXT:    #NO_APP
600; CHECK-NEXT:    retq
601  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
602  %2 = call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
603  ret <1 x i64> %2
604}
605declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone
606
; Stack-folding test for phaddsw (@llvm.x86.ssse3.phadd.sw). The inline asm
; "nop" defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does phaddsw register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
607define <1 x i64> @stack_fold_phaddsw(<1 x i64> %a, <1 x i64> %b) {
608; CHECK-LABEL: stack_fold_phaddsw:
609; CHECK:       # %bb.0:
610; CHECK-NEXT:    movq %rsi, %mm0
611; CHECK-NEXT:    movq %rdi, %mm1
612; CHECK-NEXT:    phaddsw %mm0, %mm1
613; CHECK-NEXT:    movq %mm1, %rax
614; CHECK-NEXT:    #APP
615; CHECK-NEXT:    nop
616; CHECK-NEXT:    #NO_APP
617; CHECK-NEXT:    retq
618  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
619  %2 = call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
620  ret <1 x i64> %2
621}
622declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone
623
; Stack-folding test for phaddw (@llvm.x86.ssse3.phadd.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does phaddw register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
624define <1 x i64> @stack_fold_phaddw(<1 x i64> %a, <1 x i64> %b) {
625; CHECK-LABEL: stack_fold_phaddw:
626; CHECK:       # %bb.0:
627; CHECK-NEXT:    movq %rsi, %mm0
628; CHECK-NEXT:    movq %rdi, %mm1
629; CHECK-NEXT:    phaddw %mm0, %mm1
630; CHECK-NEXT:    movq %mm1, %rax
631; CHECK-NEXT:    #APP
632; CHECK-NEXT:    nop
633; CHECK-NEXT:    #NO_APP
634; CHECK-NEXT:    retq
635  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
636  %2 = call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
637  ret <1 x i64> %2
638}
639declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone
640
; Stack-folding test for phsubd (@llvm.x86.ssse3.phsub.d). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does phsubd register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
641define <1 x i64> @stack_fold_phsubd(<1 x i64> %a, <1 x i64> %b) {
642; CHECK-LABEL: stack_fold_phsubd:
643; CHECK:       # %bb.0:
644; CHECK-NEXT:    movq %rsi, %mm0
645; CHECK-NEXT:    movq %rdi, %mm1
646; CHECK-NEXT:    phsubd %mm0, %mm1
647; CHECK-NEXT:    movq %mm1, %rax
648; CHECK-NEXT:    #APP
649; CHECK-NEXT:    nop
650; CHECK-NEXT:    #NO_APP
651; CHECK-NEXT:    retq
652  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
653  %2 = call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
654  ret <1 x i64> %2
655}
656declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone
657
; Stack-folding test for phsubsw (@llvm.x86.ssse3.phsub.sw). The inline asm
; "nop" defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does phsubsw register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
658define <1 x i64> @stack_fold_phsubsw(<1 x i64> %a, <1 x i64> %b) {
659; CHECK-LABEL: stack_fold_phsubsw:
660; CHECK:       # %bb.0:
661; CHECK-NEXT:    movq %rsi, %mm0
662; CHECK-NEXT:    movq %rdi, %mm1
663; CHECK-NEXT:    phsubsw %mm0, %mm1
664; CHECK-NEXT:    movq %mm1, %rax
665; CHECK-NEXT:    #APP
666; CHECK-NEXT:    nop
667; CHECK-NEXT:    #NO_APP
668; CHECK-NEXT:    retq
669  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
670  %2 = call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
671  ret <1 x i64> %2
672}
673declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone
674
; Stack-folding test for phsubw (@llvm.x86.ssse3.phsub.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does phsubw register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
675define <1 x i64> @stack_fold_phsubw(<1 x i64> %a, <1 x i64> %b) {
676; CHECK-LABEL: stack_fold_phsubw:
677; CHECK:       # %bb.0:
678; CHECK-NEXT:    movq %rsi, %mm0
679; CHECK-NEXT:    movq %rdi, %mm1
680; CHECK-NEXT:    phsubw %mm0, %mm1
681; CHECK-NEXT:    movq %mm1, %rax
682; CHECK-NEXT:    #APP
683; CHECK-NEXT:    nop
684; CHECK-NEXT:    #NO_APP
685; CHECK-NEXT:    retq
686  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
687  %2 = call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
688  ret <1 x i64> %2
689}
690declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone
691
692; TODO stack_fold_pinsrw
693
; Stack-folding test for pmaddubsw (@llvm.x86.ssse3.pmadd.ub.sw). The inline
; asm "nop" defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pmaddubsw register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
694define <1 x i64> @stack_fold_pmaddubsw(<1 x i64> %a, <1 x i64> %b) {
695; CHECK-LABEL: stack_fold_pmaddubsw:
696; CHECK:       # %bb.0:
697; CHECK-NEXT:    movq %rsi, %mm0
698; CHECK-NEXT:    movq %rdi, %mm1
699; CHECK-NEXT:    pmaddubsw %mm0, %mm1
700; CHECK-NEXT:    movq %mm1, %rax
701; CHECK-NEXT:    #APP
702; CHECK-NEXT:    nop
703; CHECK-NEXT:    #NO_APP
704; CHECK-NEXT:    retq
705  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
706  %2 = call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
707  ret <1 x i64> %2
708}
709declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
710
; Stack-folding test for pmaddwd (@llvm.x86.mmx.pmadd.wd). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pmaddwd register-to-register before
; the asm with no spill or folded reload - confirm folding is still exercised.
711define <1 x i64> @stack_fold_pmaddwd(<1 x i64> %a, <1 x i64> %b) {
712; CHECK-LABEL: stack_fold_pmaddwd:
713; CHECK:       # %bb.0:
714; CHECK-NEXT:    movq %rsi, %mm0
715; CHECK-NEXT:    movq %rdi, %mm1
716; CHECK-NEXT:    pmaddwd %mm0, %mm1
717; CHECK-NEXT:    movq %mm1, %rax
718; CHECK-NEXT:    #APP
719; CHECK-NEXT:    nop
720; CHECK-NEXT:    #NO_APP
721; CHECK-NEXT:    retq
722  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
723  %2 = call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
724  ret <1 x i64> %2
725}
726declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone
727
; Stack-folding test for pmaxsw (@llvm.x86.mmx.pmaxs.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pmaxsw register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
728define <1 x i64> @stack_fold_pmaxsw(<1 x i64> %a, <1 x i64> %b) {
729; CHECK-LABEL: stack_fold_pmaxsw:
730; CHECK:       # %bb.0:
731; CHECK-NEXT:    movq %rsi, %mm0
732; CHECK-NEXT:    movq %rdi, %mm1
733; CHECK-NEXT:    pmaxsw %mm0, %mm1
734; CHECK-NEXT:    movq %mm1, %rax
735; CHECK-NEXT:    #APP
736; CHECK-NEXT:    nop
737; CHECK-NEXT:    #NO_APP
738; CHECK-NEXT:    retq
739  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
740  %2 = call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
741  ret <1 x i64> %2
742}
743declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone
744
; Stack-folding test for pmaxub (@llvm.x86.mmx.pmaxu.b). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pmaxub register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
745define <1 x i64> @stack_fold_pmaxub(<1 x i64> %a, <1 x i64> %b) {
746; CHECK-LABEL: stack_fold_pmaxub:
747; CHECK:       # %bb.0:
748; CHECK-NEXT:    movq %rsi, %mm0
749; CHECK-NEXT:    movq %rdi, %mm1
750; CHECK-NEXT:    pmaxub %mm0, %mm1
751; CHECK-NEXT:    movq %mm1, %rax
752; CHECK-NEXT:    #APP
753; CHECK-NEXT:    nop
754; CHECK-NEXT:    #NO_APP
755; CHECK-NEXT:    retq
756  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
757  %2 = call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
758  ret <1 x i64> %2
759}
760declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone
761
; Stack-folding test for pminsw (@llvm.x86.mmx.pmins.w). The inline asm "nop"
; defines an MMX value ("=y") and clobbers mm2-mm7.
; NOTE(review): the checked output does pminsw register-to-register before the
; asm with no spill or folded reload - confirm folding is still exercised.
762define <1 x i64> @stack_fold_pminsw(<1 x i64> %a, <1 x i64> %b) {
763; CHECK-LABEL: stack_fold_pminsw:
764; CHECK:       # %bb.0:
765; CHECK-NEXT:    movq %rsi, %mm0
766; CHECK-NEXT:    movq %rdi, %mm1
767; CHECK-NEXT:    pminsw %mm0, %mm1
768; CHECK-NEXT:    movq %mm1, %rax
769; CHECK-NEXT:    #APP
770; CHECK-NEXT:    nop
771; CHECK-NEXT:    #NO_APP
772; CHECK-NEXT:    retq
773  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
774  %2 = call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
775  ret <1 x i64> %2
776}
777declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone
778
; Stack-folding test for PMINUB: applies @llvm.x86.mmx.pminu.b to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pminub
; ahead of the nop, with no spill or folded reload; confirm that is intended.
779define <1 x i64> @stack_fold_pminub(<1 x i64> %a, <1 x i64> %b) {
780; CHECK-LABEL: stack_fold_pminub:
781; CHECK:       # %bb.0:
782; CHECK-NEXT:    movq %rsi, %mm0
783; CHECK-NEXT:    movq %rdi, %mm1
784; CHECK-NEXT:    pminub %mm0, %mm1
785; CHECK-NEXT:    movq %mm1, %rax
786; CHECK-NEXT:    #APP
787; CHECK-NEXT:    nop
788; CHECK-NEXT:    #NO_APP
789; CHECK-NEXT:    retq
790  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
791  %2 = call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
792  ret <1 x i64> %2
793}
794declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone
795
; Stack-folding test for PMULHRSW: applies @llvm.x86.ssse3.pmul.hr.sw to %a and
; %b next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pmulhrsw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
796define <1 x i64> @stack_fold_pmulhrsw(<1 x i64> %a, <1 x i64> %b) {
797; CHECK-LABEL: stack_fold_pmulhrsw:
798; CHECK:       # %bb.0:
799; CHECK-NEXT:    movq %rsi, %mm0
800; CHECK-NEXT:    movq %rdi, %mm1
801; CHECK-NEXT:    pmulhrsw %mm0, %mm1
802; CHECK-NEXT:    movq %mm1, %rax
803; CHECK-NEXT:    #APP
804; CHECK-NEXT:    nop
805; CHECK-NEXT:    #NO_APP
806; CHECK-NEXT:    retq
807  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
808  %2 = call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
809  ret <1 x i64> %2
810}
811declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone
812
; Stack-folding test for PMULHUW: applies @llvm.x86.mmx.pmulhu.w to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pmulhuw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
813define <1 x i64> @stack_fold_pmulhuw(<1 x i64> %a, <1 x i64> %b) {
814; CHECK-LABEL: stack_fold_pmulhuw:
815; CHECK:       # %bb.0:
816; CHECK-NEXT:    movq %rsi, %mm0
817; CHECK-NEXT:    movq %rdi, %mm1
818; CHECK-NEXT:    pmulhuw %mm0, %mm1
819; CHECK-NEXT:    movq %mm1, %rax
820; CHECK-NEXT:    #APP
821; CHECK-NEXT:    nop
822; CHECK-NEXT:    #NO_APP
823; CHECK-NEXT:    retq
824  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
825  %2 = call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
826  ret <1 x i64> %2
827}
828declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone
829
; Stack-folding test for PMULHW: applies @llvm.x86.mmx.pmulh.w to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pmulhw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
830define <1 x i64> @stack_fold_pmulhw(<1 x i64> %a, <1 x i64> %b) {
831; CHECK-LABEL: stack_fold_pmulhw:
832; CHECK:       # %bb.0:
833; CHECK-NEXT:    movq %rsi, %mm0
834; CHECK-NEXT:    movq %rdi, %mm1
835; CHECK-NEXT:    pmulhw %mm0, %mm1
836; CHECK-NEXT:    movq %mm1, %rax
837; CHECK-NEXT:    #APP
838; CHECK-NEXT:    nop
839; CHECK-NEXT:    #NO_APP
840; CHECK-NEXT:    retq
841  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
842  %2 = call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
843  ret <1 x i64> %2
844}
845declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone
846
; Stack-folding test for PMULLW: applies @llvm.x86.mmx.pmull.w to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pmullw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
847define <1 x i64> @stack_fold_pmullw(<1 x i64> %a, <1 x i64> %b) {
848; CHECK-LABEL: stack_fold_pmullw:
849; CHECK:       # %bb.0:
850; CHECK-NEXT:    movq %rsi, %mm0
851; CHECK-NEXT:    movq %rdi, %mm1
852; CHECK-NEXT:    pmullw %mm0, %mm1
853; CHECK-NEXT:    movq %mm1, %rax
854; CHECK-NEXT:    #APP
855; CHECK-NEXT:    nop
856; CHECK-NEXT:    #NO_APP
857; CHECK-NEXT:    retq
858  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
859  %2 = call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
860  ret <1 x i64> %2
861}
862declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone
863
; Stack-folding test for PMULUDQ: applies @llvm.x86.mmx.pmulu.dq to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pmuludq
; ahead of the nop, with no spill or folded reload; confirm that is intended.
864define <1 x i64> @stack_fold_pmuludq(<1 x i64> %a, <1 x i64> %b) {
865; CHECK-LABEL: stack_fold_pmuludq:
866; CHECK:       # %bb.0:
867; CHECK-NEXT:    movq %rsi, %mm0
868; CHECK-NEXT:    movq %rdi, %mm1
869; CHECK-NEXT:    pmuludq %mm0, %mm1
870; CHECK-NEXT:    movq %mm1, %rax
871; CHECK-NEXT:    #APP
872; CHECK-NEXT:    nop
873; CHECK-NEXT:    #NO_APP
874; CHECK-NEXT:    retq
875  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
876  %2 = call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
877  ret <1 x i64> %2
878}
879declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone
880
; Stack-folding test for POR: applies @llvm.x86.mmx.por to %a and %b next to an
; inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register por
; ahead of the nop, with no spill or folded reload; confirm that is intended.
881define <1 x i64> @stack_fold_por(<1 x i64> %a, <1 x i64> %b) {
882; CHECK-LABEL: stack_fold_por:
883; CHECK:       # %bb.0:
884; CHECK-NEXT:    movq %rsi, %mm0
885; CHECK-NEXT:    movq %rdi, %mm1
886; CHECK-NEXT:    por %mm0, %mm1
887; CHECK-NEXT:    movq %mm1, %rax
888; CHECK-NEXT:    #APP
889; CHECK-NEXT:    nop
890; CHECK-NEXT:    #NO_APP
891; CHECK-NEXT:    retq
892  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
893  %2 = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %a, <1 x i64> %b) nounwind readnone
894  ret <1 x i64> %2
895}
896declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone
897
; Stack-folding test for PSADBW: applies @llvm.x86.mmx.psad.bw to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psadbw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
898define <1 x i64> @stack_fold_psadbw(<1 x i64> %a, <1 x i64> %b) {
899; CHECK-LABEL: stack_fold_psadbw:
900; CHECK:       # %bb.0:
901; CHECK-NEXT:    movq %rsi, %mm0
902; CHECK-NEXT:    movq %rdi, %mm1
903; CHECK-NEXT:    psadbw %mm0, %mm1
904; CHECK-NEXT:    movq %mm1, %rax
905; CHECK-NEXT:    #APP
906; CHECK-NEXT:    nop
907; CHECK-NEXT:    #NO_APP
908; CHECK-NEXT:    retq
909  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
910  %2 = call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
911  ret <1 x i64> %2
912}
913declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
914
; Stack-folding test for PSHUFB: applies @llvm.x86.ssse3.pshuf.b to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7. The clobber list matches the other two-operand tests in this file;
; ~{mm1} is reserved for the one-operand tests such as stack_fold_pshufw.
; NOTE(review): the autogenerated assertions expect a register-register pshufb
; ahead of the nop, with no spill or folded reload; confirm that is intended.
915define <1 x i64> @stack_fold_pshufb(<1 x i64> %a, <1 x i64> %b) {
916; CHECK-LABEL: stack_fold_pshufb:
917; CHECK:       # %bb.0:
918; CHECK-NEXT:    movq %rsi, %mm0
919; CHECK-NEXT:    movq %rdi, %mm1
920; CHECK-NEXT:    pshufb %mm0, %mm1
921; CHECK-NEXT:    movq %mm1, %rax
922; CHECK-NEXT:    #APP
923; CHECK-NEXT:    nop
924; CHECK-NEXT:    #NO_APP
925; CHECK-NEXT:    retq
926  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
927  %2 = call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
928  ret <1 x i64> %2
929}
930declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone
931
; Stack-folding test for PSHUFW: applies @llvm.x86.sse.pshuf.w to %a with an
; immediate of 1, next to an inline-asm "nop" that defines an MMX ("=y") value
; and clobbers mm1-mm7.
; The autogenerated assertions expect %a to be spilled (8 bytes) before the nop
; and then consumed by pshufw as a folded reload from the stack slot.
932define <1 x i64> @stack_fold_pshufw(<1 x i64> %a) {
933; CHECK-LABEL: stack_fold_pshufw:
934; CHECK:       # %bb.0:
935; CHECK-NEXT:    movq %rdi, %mm0
936; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
937; CHECK-NEXT:    #APP
938; CHECK-NEXT:    nop
939; CHECK-NEXT:    #NO_APP
940; CHECK-NEXT:    pshufw $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
941; CHECK-NEXT:    # mm0 = mem[1,0,0,0]
942; CHECK-NEXT:    movq %mm0, %rax
943; CHECK-NEXT:    retq
944  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
945  %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %a, i8 1) nounwind readnone
946  ret <1 x i64> %2
947}
948declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone
949
; Stack-folding test for PSIGNB: applies @llvm.x86.ssse3.psign.b to %a0 and %a1
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psignb
; ahead of the nop, with no spill or folded reload; confirm that is intended.
950define <1 x i64> @stack_fold_psignb(<1 x i64> %a0, <1 x i64> %a1) {
951; CHECK-LABEL: stack_fold_psignb:
952; CHECK:       # %bb.0:
953; CHECK-NEXT:    movq %rsi, %mm0
954; CHECK-NEXT:    movq %rdi, %mm1
955; CHECK-NEXT:    psignb %mm0, %mm1
956; CHECK-NEXT:    movq %mm1, %rax
957; CHECK-NEXT:    #APP
958; CHECK-NEXT:    nop
959; CHECK-NEXT:    #NO_APP
960; CHECK-NEXT:    retq
961  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
962  %2 = call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
963  ret <1 x i64> %2
964}
965declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone
966
; Stack-folding test for PSIGND: applies @llvm.x86.ssse3.psign.d to %a0 and %a1
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psignd
; ahead of the nop, with no spill or folded reload; confirm that is intended.
967define <1 x i64> @stack_fold_psignd(<1 x i64> %a0, <1 x i64> %a1) {
968; CHECK-LABEL: stack_fold_psignd:
969; CHECK:       # %bb.0:
970; CHECK-NEXT:    movq %rsi, %mm0
971; CHECK-NEXT:    movq %rdi, %mm1
972; CHECK-NEXT:    psignd %mm0, %mm1
973; CHECK-NEXT:    movq %mm1, %rax
974; CHECK-NEXT:    #APP
975; CHECK-NEXT:    nop
976; CHECK-NEXT:    #NO_APP
977; CHECK-NEXT:    retq
978  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
979  %2 = call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
980  ret <1 x i64> %2
981}
982declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone
983
; Stack-folding test for PSIGNW: applies @llvm.x86.ssse3.psign.w to %a0 and %a1
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psignw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
984define <1 x i64> @stack_fold_psignw(<1 x i64> %a0, <1 x i64> %a1) {
985; CHECK-LABEL: stack_fold_psignw:
986; CHECK:       # %bb.0:
987; CHECK-NEXT:    movq %rsi, %mm0
988; CHECK-NEXT:    movq %rdi, %mm1
989; CHECK-NEXT:    psignw %mm0, %mm1
990; CHECK-NEXT:    movq %mm1, %rax
991; CHECK-NEXT:    #APP
992; CHECK-NEXT:    nop
993; CHECK-NEXT:    #NO_APP
994; CHECK-NEXT:    retq
995  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
996  %2 = call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
997  ret <1 x i64> %2
998}
999declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone
1000
; Stack-folding test for PSLLD: applies @llvm.x86.mmx.psll.d to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pslld
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1001define <1 x i64> @stack_fold_pslld(<1 x i64> %a, <1 x i64> %b) {
1002; CHECK-LABEL: stack_fold_pslld:
1003; CHECK:       # %bb.0:
1004; CHECK-NEXT:    movq %rsi, %mm0
1005; CHECK-NEXT:    movq %rdi, %mm1
1006; CHECK-NEXT:    pslld %mm0, %mm1
1007; CHECK-NEXT:    movq %mm1, %rax
1008; CHECK-NEXT:    #APP
1009; CHECK-NEXT:    nop
1010; CHECK-NEXT:    #NO_APP
1011; CHECK-NEXT:    retq
1012  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1013  %2 = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1014  ret <1 x i64> %2
1015}
1016declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone
1017
; Stack-folding test for PSLLQ: applies @llvm.x86.mmx.psll.q to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psllq
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1018define <1 x i64> @stack_fold_psllq(<1 x i64> %a, <1 x i64> %b) {
1019; CHECK-LABEL: stack_fold_psllq:
1020; CHECK:       # %bb.0:
1021; CHECK-NEXT:    movq %rsi, %mm0
1022; CHECK-NEXT:    movq %rdi, %mm1
1023; CHECK-NEXT:    psllq %mm0, %mm1
1024; CHECK-NEXT:    movq %mm1, %rax
1025; CHECK-NEXT:    #APP
1026; CHECK-NEXT:    nop
1027; CHECK-NEXT:    #NO_APP
1028; CHECK-NEXT:    retq
1029  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1030  %2 = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1031  ret <1 x i64> %2
1032}
1033declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone
1034
; Stack-folding test for PSLLW: applies @llvm.x86.mmx.psll.w to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psllw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1035define <1 x i64> @stack_fold_psllw(<1 x i64> %a, <1 x i64> %b) {
1036; CHECK-LABEL: stack_fold_psllw:
1037; CHECK:       # %bb.0:
1038; CHECK-NEXT:    movq %rsi, %mm0
1039; CHECK-NEXT:    movq %rdi, %mm1
1040; CHECK-NEXT:    psllw %mm0, %mm1
1041; CHECK-NEXT:    movq %mm1, %rax
1042; CHECK-NEXT:    #APP
1043; CHECK-NEXT:    nop
1044; CHECK-NEXT:    #NO_APP
1045; CHECK-NEXT:    retq
1046  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1047  %2 = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1048  ret <1 x i64> %2
1049}
1050declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone
1051
; Stack-folding test for PSRAD: applies @llvm.x86.mmx.psra.d to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psrad
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1052define <1 x i64> @stack_fold_psrad(<1 x i64> %a, <1 x i64> %b) {
1053; CHECK-LABEL: stack_fold_psrad:
1054; CHECK:       # %bb.0:
1055; CHECK-NEXT:    movq %rsi, %mm0
1056; CHECK-NEXT:    movq %rdi, %mm1
1057; CHECK-NEXT:    psrad %mm0, %mm1
1058; CHECK-NEXT:    movq %mm1, %rax
1059; CHECK-NEXT:    #APP
1060; CHECK-NEXT:    nop
1061; CHECK-NEXT:    #NO_APP
1062; CHECK-NEXT:    retq
1063  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1064  %2 = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1065  ret <1 x i64> %2
1066}
1067declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone
1068
; Stack-folding test for PSRAW: applies @llvm.x86.mmx.psra.w to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psraw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1069define <1 x i64> @stack_fold_psraw(<1 x i64> %a, <1 x i64> %b) {
1070; CHECK-LABEL: stack_fold_psraw:
1071; CHECK:       # %bb.0:
1072; CHECK-NEXT:    movq %rsi, %mm0
1073; CHECK-NEXT:    movq %rdi, %mm1
1074; CHECK-NEXT:    psraw %mm0, %mm1
1075; CHECK-NEXT:    movq %mm1, %rax
1076; CHECK-NEXT:    #APP
1077; CHECK-NEXT:    nop
1078; CHECK-NEXT:    #NO_APP
1079; CHECK-NEXT:    retq
1080  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1081  %2 = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1082  ret <1 x i64> %2
1083}
1084declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone
1085
; Stack-folding test for PSRLD: applies @llvm.x86.mmx.psrl.d to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psrld
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1086define <1 x i64> @stack_fold_psrld(<1 x i64> %a, <1 x i64> %b) {
1087; CHECK-LABEL: stack_fold_psrld:
1088; CHECK:       # %bb.0:
1089; CHECK-NEXT:    movq %rsi, %mm0
1090; CHECK-NEXT:    movq %rdi, %mm1
1091; CHECK-NEXT:    psrld %mm0, %mm1
1092; CHECK-NEXT:    movq %mm1, %rax
1093; CHECK-NEXT:    #APP
1094; CHECK-NEXT:    nop
1095; CHECK-NEXT:    #NO_APP
1096; CHECK-NEXT:    retq
1097  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1098  %2 = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1099  ret <1 x i64> %2
1100}
1101declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone
1102
; Stack-folding test for PSRLQ: applies @llvm.x86.mmx.psrl.q to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psrlq
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1103define <1 x i64> @stack_fold_psrlq(<1 x i64> %a, <1 x i64> %b) {
1104; CHECK-LABEL: stack_fold_psrlq:
1105; CHECK:       # %bb.0:
1106; CHECK-NEXT:    movq %rsi, %mm0
1107; CHECK-NEXT:    movq %rdi, %mm1
1108; CHECK-NEXT:    psrlq %mm0, %mm1
1109; CHECK-NEXT:    movq %mm1, %rax
1110; CHECK-NEXT:    #APP
1111; CHECK-NEXT:    nop
1112; CHECK-NEXT:    #NO_APP
1113; CHECK-NEXT:    retq
1114  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1115  %2 = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1116  ret <1 x i64> %2
1117}
1118declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone
1119
; Stack-folding test for PSRLW: applies @llvm.x86.mmx.psrl.w to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psrlw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1120define <1 x i64> @stack_fold_psrlw(<1 x i64> %a, <1 x i64> %b) {
1121; CHECK-LABEL: stack_fold_psrlw:
1122; CHECK:       # %bb.0:
1123; CHECK-NEXT:    movq %rsi, %mm0
1124; CHECK-NEXT:    movq %rdi, %mm1
1125; CHECK-NEXT:    psrlw %mm0, %mm1
1126; CHECK-NEXT:    movq %mm1, %rax
1127; CHECK-NEXT:    #APP
1128; CHECK-NEXT:    nop
1129; CHECK-NEXT:    #NO_APP
1130; CHECK-NEXT:    retq
1131  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1132  %2 = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1133  ret <1 x i64> %2
1134}
1135declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone
1136
; Stack-folding test for PSUBB: applies @llvm.x86.mmx.psub.b to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubb
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1137define <1 x i64> @stack_fold_psubb(<1 x i64> %a, <1 x i64> %b) {
1138; CHECK-LABEL: stack_fold_psubb:
1139; CHECK:       # %bb.0:
1140; CHECK-NEXT:    movq %rsi, %mm0
1141; CHECK-NEXT:    movq %rdi, %mm1
1142; CHECK-NEXT:    psubb %mm0, %mm1
1143; CHECK-NEXT:    movq %mm1, %rax
1144; CHECK-NEXT:    #APP
1145; CHECK-NEXT:    nop
1146; CHECK-NEXT:    #NO_APP
1147; CHECK-NEXT:    retq
1148  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1149  %2 = call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1150  ret <1 x i64> %2
1151}
1152declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
1153
; Stack-folding test for PSUBD: applies @llvm.x86.mmx.psub.d to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubd
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1154define <1 x i64> @stack_fold_psubd(<1 x i64> %a, <1 x i64> %b) {
1155; CHECK-LABEL: stack_fold_psubd:
1156; CHECK:       # %bb.0:
1157; CHECK-NEXT:    movq %rsi, %mm0
1158; CHECK-NEXT:    movq %rdi, %mm1
1159; CHECK-NEXT:    psubd %mm0, %mm1
1160; CHECK-NEXT:    movq %mm1, %rax
1161; CHECK-NEXT:    #APP
1162; CHECK-NEXT:    nop
1163; CHECK-NEXT:    #NO_APP
1164; CHECK-NEXT:    retq
1165  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1166  %2 = call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1167  ret <1 x i64> %2
1168}
1169declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
1170
; Stack-folding test for PSUBQ: applies @llvm.x86.mmx.psub.q to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubq
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1171define <1 x i64> @stack_fold_psubq(<1 x i64> %a, <1 x i64> %b) {
1172; CHECK-LABEL: stack_fold_psubq:
1173; CHECK:       # %bb.0:
1174; CHECK-NEXT:    movq %rsi, %mm0
1175; CHECK-NEXT:    movq %rdi, %mm1
1176; CHECK-NEXT:    psubq %mm0, %mm1
1177; CHECK-NEXT:    movq %mm1, %rax
1178; CHECK-NEXT:    #APP
1179; CHECK-NEXT:    nop
1180; CHECK-NEXT:    #NO_APP
1181; CHECK-NEXT:    retq
1182  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1183  %2 = call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1184  ret <1 x i64> %2
1185}
1186declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone
1187
; Stack-folding test for PSUBSB: applies @llvm.x86.mmx.psubs.b to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubsb
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1188define <1 x i64> @stack_fold_psubsb(<1 x i64> %a, <1 x i64> %b) {
1189; CHECK-LABEL: stack_fold_psubsb:
1190; CHECK:       # %bb.0:
1191; CHECK-NEXT:    movq %rsi, %mm0
1192; CHECK-NEXT:    movq %rdi, %mm1
1193; CHECK-NEXT:    psubsb %mm0, %mm1
1194; CHECK-NEXT:    movq %mm1, %rax
1195; CHECK-NEXT:    #APP
1196; CHECK-NEXT:    nop
1197; CHECK-NEXT:    #NO_APP
1198; CHECK-NEXT:    retq
1199  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1200  %2 = call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1201  ret <1 x i64> %2
1202}
1203declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
1204
; Stack-folding test for PSUBSW: applies @llvm.x86.mmx.psubs.w to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubsw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1205define <1 x i64> @stack_fold_psubsw(<1 x i64> %a, <1 x i64> %b) {
1206; CHECK-LABEL: stack_fold_psubsw:
1207; CHECK:       # %bb.0:
1208; CHECK-NEXT:    movq %rsi, %mm0
1209; CHECK-NEXT:    movq %rdi, %mm1
1210; CHECK-NEXT:    psubsw %mm0, %mm1
1211; CHECK-NEXT:    movq %mm1, %rax
1212; CHECK-NEXT:    #APP
1213; CHECK-NEXT:    nop
1214; CHECK-NEXT:    #NO_APP
1215; CHECK-NEXT:    retq
1216  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1217  %2 = call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1218  ret <1 x i64> %2
1219}
1220declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
1221
; Stack-folding test for PSUBUSB: applies @llvm.x86.mmx.psubus.b to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubusb
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1222define <1 x i64> @stack_fold_psubusb(<1 x i64> %a, <1 x i64> %b) {
1223; CHECK-LABEL: stack_fold_psubusb:
1224; CHECK:       # %bb.0:
1225; CHECK-NEXT:    movq %rsi, %mm0
1226; CHECK-NEXT:    movq %rdi, %mm1
1227; CHECK-NEXT:    psubusb %mm0, %mm1
1228; CHECK-NEXT:    movq %mm1, %rax
1229; CHECK-NEXT:    #APP
1230; CHECK-NEXT:    nop
1231; CHECK-NEXT:    #NO_APP
1232; CHECK-NEXT:    retq
1233  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1234  %2 = call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1235  ret <1 x i64> %2
1236}
1237declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
1238
; Stack-folding test for PSUBUSW: applies @llvm.x86.mmx.psubus.w to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubusw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1239define <1 x i64> @stack_fold_psubusw(<1 x i64> %a, <1 x i64> %b) {
1240; CHECK-LABEL: stack_fold_psubusw:
1241; CHECK:       # %bb.0:
1242; CHECK-NEXT:    movq %rsi, %mm0
1243; CHECK-NEXT:    movq %rdi, %mm1
1244; CHECK-NEXT:    psubusw %mm0, %mm1
1245; CHECK-NEXT:    movq %mm1, %rax
1246; CHECK-NEXT:    #APP
1247; CHECK-NEXT:    nop
1248; CHECK-NEXT:    #NO_APP
1249; CHECK-NEXT:    retq
1250  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1251  %2 = call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1252  ret <1 x i64> %2
1253}
1254declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
1255
; Stack-folding test for PSUBW: applies @llvm.x86.mmx.psub.w to %a and %b next
; to an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register psubw
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1256define <1 x i64> @stack_fold_psubw(<1 x i64> %a, <1 x i64> %b) {
1257; CHECK-LABEL: stack_fold_psubw:
1258; CHECK:       # %bb.0:
1259; CHECK-NEXT:    movq %rsi, %mm0
1260; CHECK-NEXT:    movq %rdi, %mm1
1261; CHECK-NEXT:    psubw %mm0, %mm1
1262; CHECK-NEXT:    movq %mm1, %rax
1263; CHECK-NEXT:    #APP
1264; CHECK-NEXT:    nop
1265; CHECK-NEXT:    #NO_APP
1266; CHECK-NEXT:    retq
1267  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1268  %2 = call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1269  ret <1 x i64> %2
1270}
1271declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
1272
; Stack-folding test for PUNPCKHBW: applies @llvm.x86.mmx.punpckhbw to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register
; punpckhbw ahead of the nop, with no spill or folded reload; confirm that is
; intended.
1273define <1 x i64> @stack_fold_punpckhbw(<1 x i64> %a, <1 x i64> %b) {
1274; CHECK-LABEL: stack_fold_punpckhbw:
1275; CHECK:       # %bb.0:
1276; CHECK-NEXT:    movq %rsi, %mm0
1277; CHECK-NEXT:    movq %rdi, %mm1
1278; CHECK-NEXT:    punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7]
1279; CHECK-NEXT:    movq %mm1, %rax
1280; CHECK-NEXT:    #APP
1281; CHECK-NEXT:    nop
1282; CHECK-NEXT:    #NO_APP
1283; CHECK-NEXT:    retq
1284  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1285  %2 = call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1286  ret <1 x i64> %2
1287}
1288declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone
1289
; Stack-folding test for PUNPCKHDQ: applies @llvm.x86.mmx.punpckhdq to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register
; punpckhdq ahead of the nop, with no spill or folded reload; confirm that is
; intended.
1290define <1 x i64> @stack_fold_punpckhdq(<1 x i64> %a, <1 x i64> %b) {
1291; CHECK-LABEL: stack_fold_punpckhdq:
1292; CHECK:       # %bb.0:
1293; CHECK-NEXT:    movq %rsi, %mm0
1294; CHECK-NEXT:    movq %rdi, %mm1
1295; CHECK-NEXT:    punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1]
1296; CHECK-NEXT:    movq %mm1, %rax
1297; CHECK-NEXT:    #APP
1298; CHECK-NEXT:    nop
1299; CHECK-NEXT:    #NO_APP
1300; CHECK-NEXT:    retq
1301  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1302  %2 = call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1303  ret <1 x i64> %2
1304}
1305declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone
1306
; Stack-folding test for PUNPCKHWD: applies @llvm.x86.mmx.punpckhwd to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register
; punpckhwd ahead of the nop, with no spill or folded reload; confirm that is
; intended.
1307define <1 x i64> @stack_fold_punpckhwd(<1 x i64> %a, <1 x i64> %b) {
1308; CHECK-LABEL: stack_fold_punpckhwd:
1309; CHECK:       # %bb.0:
1310; CHECK-NEXT:    movq %rsi, %mm0
1311; CHECK-NEXT:    movq %rdi, %mm1
1312; CHECK-NEXT:    punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3]
1313; CHECK-NEXT:    movq %mm1, %rax
1314; CHECK-NEXT:    #APP
1315; CHECK-NEXT:    nop
1316; CHECK-NEXT:    #NO_APP
1317; CHECK-NEXT:    retq
1318  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1319  %2 = call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1320  ret <1 x i64> %2
1321}
1322declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone
1323
; Stack-folding test for PUNPCKLBW: applies @llvm.x86.mmx.punpcklbw to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register
; punpcklbw ahead of the nop, with no spill or folded reload; confirm that is
; intended.
1324define <1 x i64> @stack_fold_punpcklbw(<1 x i64> %a, <1 x i64> %b) {
1325; CHECK-LABEL: stack_fold_punpcklbw:
1326; CHECK:       # %bb.0:
1327; CHECK-NEXT:    movq %rsi, %mm0
1328; CHECK-NEXT:    movq %rdi, %mm1
1329; CHECK-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
1330; CHECK-NEXT:    movq %mm1, %rax
1331; CHECK-NEXT:    #APP
1332; CHECK-NEXT:    nop
1333; CHECK-NEXT:    #NO_APP
1334; CHECK-NEXT:    retq
1335  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1336  %2 = call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1337  ret <1 x i64> %2
1338}
1339declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone
1340
; Stack-folding test for PUNPCKLDQ: applies @llvm.x86.mmx.punpckldq to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register
; punpckldq ahead of the nop, with no spill or folded reload; confirm that is
; intended.
1341define <1 x i64> @stack_fold_punpckldq(<1 x i64> %a, <1 x i64> %b) {
1342; CHECK-LABEL: stack_fold_punpckldq:
1343; CHECK:       # %bb.0:
1344; CHECK-NEXT:    movq %rsi, %mm0
1345; CHECK-NEXT:    movq %rdi, %mm1
1346; CHECK-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
1347; CHECK-NEXT:    movq %mm1, %rax
1348; CHECK-NEXT:    #APP
1349; CHECK-NEXT:    nop
1350; CHECK-NEXT:    #NO_APP
1351; CHECK-NEXT:    retq
1352  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1353  %2 = call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1354  ret <1 x i64> %2
1355}
1356declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone
1357
; Stack-folding test for PUNPCKLWD: applies @llvm.x86.mmx.punpcklwd to %a and %b
; next to an inline-asm "nop" that defines an MMX ("=y") value and clobbers
; mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register
; punpcklwd ahead of the nop, with no spill or folded reload; confirm that is
; intended.
1358define <1 x i64> @stack_fold_punpcklwd(<1 x i64> %a, <1 x i64> %b) {
1359; CHECK-LABEL: stack_fold_punpcklwd:
1360; CHECK:       # %bb.0:
1361; CHECK-NEXT:    movq %rsi, %mm0
1362; CHECK-NEXT:    movq %rdi, %mm1
1363; CHECK-NEXT:    punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
1364; CHECK-NEXT:    movq %mm1, %rax
1365; CHECK-NEXT:    #APP
1366; CHECK-NEXT:    nop
1367; CHECK-NEXT:    #NO_APP
1368; CHECK-NEXT:    retq
1369  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1370  %2 = call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1371  ret <1 x i64> %2
1372}
1373declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone
1374
; Stack-folding test for PXOR: applies @llvm.x86.mmx.pxor to %a and %b next to
; an inline-asm "nop" that defines an MMX ("=y") value and clobbers mm2-mm7.
; NOTE(review): the autogenerated assertions expect a register-register pxor
; ahead of the nop, with no spill or folded reload; confirm that is intended.
1375define <1 x i64> @stack_fold_pxor(<1 x i64> %a, <1 x i64> %b) {
1376; CHECK-LABEL: stack_fold_pxor:
1377; CHECK:       # %bb.0:
1378; CHECK-NEXT:    movq %rsi, %mm0
1379; CHECK-NEXT:    movq %rdi, %mm1
1380; CHECK-NEXT:    pxor %mm0, %mm1
1381; CHECK-NEXT:    movq %mm1, %rax
1382; CHECK-NEXT:    #APP
1383; CHECK-NEXT:    nop
1384; CHECK-NEXT:    #NO_APP
1385; CHECK-NEXT:    retq
1386  %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
1387  %2 = call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %a, <1 x i64> %b) nounwind readnone
1388  ret <1 x i64> %2
1389}
1390declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone
1391