; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

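; A minimal sketch of that pattern (the @example_fold_paddd function below is
; illustrative only and not one of the generated tests): the clobber list
; leaves only xmm0 and xmm1 alive across the asm, and the asm result needs a
; register of its own, so one of the two inputs has to be spilled around the
; nop. With -disable-peephole the reload should still be folded into the
; consuming instruction as a memory operand.

define <4 x i32> @example_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
  ; Clobber xmm2-xmm15 and flags; %a0/%a1 arrive in xmm0/xmm1.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Expect a "vpaddd <slot>(%rsp), %xmm0, %xmm0"-style folded reload here,
  ; not a separate vmovaps from the stack.
  %2 = add <4 x i32> %a0, %a1
  ret <4 x i32> %2
}
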
define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_broadcastsd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = [4.9406564584124654E-324,0.0E+0]
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ; fadd forces execution domain (an FP use keeps the splat in the FP domain, so the vbroadcastsd form is what gets tested)
  %3 = fadd <4 x double> %2, <double 0x1, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_broadcastss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_broadcastss_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

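; Unlike most tests in this file, the next one folds on the spill side: the
; 128-bit extract is stored straight into the stack slot (the "Folded Spill"
; in the CHECK lines), and the vmovaps after the asm is the plain reload.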
define <4 x i32> @stack_fold_extracti128(<8 x i16> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_extracti128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %t1 = zext <8 x i16> %a0 to <8 x i32>
  %t2 = shufflevector <8 x i32> %t1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %t3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <4 x i32> %t2
}

define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_inserti128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_mpsadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, zeroinitializer
  %3 = sub <32 x i8> zeroinitializer, %a0
  %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3
  ret <32 x i8> %4
}

define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, zeroinitializer
  %3 = sub <8 x i32> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3
  ret <8 x i32> %4
}

define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, zeroinitializer
  %3 = sub <16 x i16> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3
  ret <16 x i16> %4
}

define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_packusdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <32 x i8> %a0, %a1
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <8 x i32> %a0, %a1
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <4 x i64> %a0, %a1
  ret <4 x i64> %2
}

define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <16 x i16> %a0, %a1
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_palignr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  ret <32 x i8> %2
}

define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pand:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = and <32 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}

define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pandn:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpandn {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <32 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %3 = and <32 x i8> %2, %a1
  ; add forces execution domain
  %4 = add <32 x i8> %3, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %4
}

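; The two averaging tests below spell out the rounding-average idiom
; ((zext(x) + zext(y) + 1) >> 1, truncated back), which the backend matches
; to vpavgb/vpavgw, as the folded reloads in the CHECK lines show.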
define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <32 x i8> %a0 to <32 x i16>
  %3 = zext <32 x i8> %a1 to <32 x i16>
  %4 = add <32 x i16> %2, %3
  %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <32 x i16> %6 to <32 x i8>
  ret <32 x i8> %7
}

define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <16 x i16> %a0 to <16 x i32>
  %3 = zext <16 x i16> %a1 to <16 x i32>
  %4 = add <16 x i32> %2, %3
  %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = trunc <16 x i32> %6 to <16 x i16>
  ret <16 x i16> %7
}

define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pblendd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pblendd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
; CHECK-LABEL: stack_fold_pblendvb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pblendw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7],mem[8,9,10],ymm0[11,12,13,14,15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %2
}

define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %2
}

define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> zeroinitializer
  ret <32 x i8> %2
}

define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 2, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 2, i64 1>
  ret <2 x i64> %3
}

define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %2
}

define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> zeroinitializer
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <32 x i8> %a0, %a1
  %3 = sext <32 x i1> %2 to <32 x i8>
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <8 x i32> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i32>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <4 x i64> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <16 x i16> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i16>
  ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, %a1
  %3 = sext <32 x i1> %2 to <32 x i8>
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i32>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i64> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i16>
  ret <16 x i16> %3
}

define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_perm2i128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vperm2i128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_permd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
; CHECK-LABEL: stack_fold_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,2,3]
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_permps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
; CHECK-LABEL: stack_fold_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,2,3]
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_phaddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_phsubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaddubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaddwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

1038define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
1039; CHECK-LABEL: stack_fold_pminsb:
1040; CHECK:       # %bb.0:
1041; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1042; CHECK-NEXT:    #APP
1043; CHECK-NEXT:    nop
1044; CHECK-NEXT:    #NO_APP
1045; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1046; CHECK-NEXT:    retq
1047  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1048  %2 = icmp slt <32 x i8> %a0, %a1
1049  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1050  ret <32 x i8> %3
1051}
1052
1053define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
1054; CHECK-LABEL: stack_fold_pminsd:
1055; CHECK:       # %bb.0:
1056; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1057; CHECK-NEXT:    #APP
1058; CHECK-NEXT:    nop
1059; CHECK-NEXT:    #NO_APP
1060; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1061; CHECK-NEXT:    retq
1062  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1063  %2 = icmp slt <8 x i32> %a0, %a1
1064  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1065  ret <8 x i32> %3
1066}
1067
1068define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) {
1069; CHECK-LABEL: stack_fold_pminsw:
1070; CHECK:       # %bb.0:
1071; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1072; CHECK-NEXT:    #APP
1073; CHECK-NEXT:    nop
1074; CHECK-NEXT:    #NO_APP
1075; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1076; CHECK-NEXT:    retq
1077  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1078  %2 = icmp slt <16 x i16> %a0, %a1
1079  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1080  ret <16 x i16> %3
1081}
1082
1083define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) {
1084; CHECK-LABEL: stack_fold_pminub:
1085; CHECK:       # %bb.0:
1086; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1087; CHECK-NEXT:    #APP
1088; CHECK-NEXT:    nop
1089; CHECK-NEXT:    #NO_APP
1090; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1091; CHECK-NEXT:    retq
1092  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1093  %2 = icmp ult <32 x i8> %a0, %a1
1094  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1095  ret <32 x i8> %3
1096}
1097
1098define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) {
1099; CHECK-LABEL: stack_fold_pminud:
1100; CHECK:       # %bb.0:
1101; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1102; CHECK-NEXT:    #APP
1103; CHECK-NEXT:    nop
1104; CHECK-NEXT:    #NO_APP
1105; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1106; CHECK-NEXT:    retq
1107  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1108  %2 = icmp ult <8 x i32> %a0, %a1
1109  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1110  ret <8 x i32> %3
1111}
1112
1113define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
1114; CHECK-LABEL: stack_fold_pminuw:
1115; CHECK:       # %bb.0:
1116; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1117; CHECK-NEXT:    #APP
1118; CHECK-NEXT:    nop
1119; CHECK-NEXT:    #NO_APP
1120; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1121; CHECK-NEXT:    retq
1122  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1123  %2 = icmp ult <16 x i16> %a0, %a1
1124  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1125  ret <16 x i16> %3
1126}
1127
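; The pmovsx/pmovzx tests extend the low subvector of the spilled xmm value
; (via shufflevector where the result is narrower than 128 bits); the
; sext/zext is expected to select a vpmovsx*/vpmovzx* that folds the
; 16-byte reload directly.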
define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = sext <8 x i8> %2 to <8 x i32>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = sext <4 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <16 x i8> %a0 to <16 x i16>
  ret <16 x i16> %2
}

define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovsxdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <4 x i32> %a0 to <4 x i64>
  ret <4 x i64> %2
}

define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <8 x i16> %a0 to <8 x i32>
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = sext <4 x i16> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = zext <8 x i8> %2 to <8 x i32>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = zext <4 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <16 x i8> %a0 to <16 x i16>
  ret <16 x i16> %2
}

define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovzxdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <4 x i32> %a0 to <4 x i64>
  ret <4 x i64> %2
}

define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovzxwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <8 x i16> %a0 to <8 x i32>
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovzxwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = zext <4 x i16> %2 to <4 x i64>
  ret <4 x i64> %3
}

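; pmuldq is expressed with the standard IR idiom: shifting each 64-bit lane
; left then arithmetically right by 32 sign-extends its low 32 bits, and a
; multiply of two such values is expected to match vpmuldq.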
define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x i32> %a0 to <4 x i64>
  %3 = bitcast <8 x i32> %a1 to <4 x i64>
  %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32>
  %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
  %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32>
  %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = mul <4 x i64> %5, %7
  ret <4 x i64> %8
}

define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhrsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmulld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = mul <8 x i32> %a0, %a1
  ret <8 x i32> %2
}

define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmullw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = mul <16 x i16> %a0, %a1
  ret <16 x i16> %2
}

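; pmuludq uses the complementary idiom: masking each 64-bit lane with
; 0xFFFFFFFF zero-extends its low 32 bits, so the multiply is expected to
; match vpmuludq.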
define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuludq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <8 x i32> %a0 to <4 x i64>
  %3 = bitcast <8 x i32> %a1 to <4 x i64>
  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %6 = mul <4 x i64> %4, %5
  ret <4 x i64> %6
}

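; For the bitwise tests (por here, pxor below) the trailing vpaddb keeps the
; result in the integer domain; without it the or/xor could be lowered as
; vorps/vxorps and the vpor/vpxor folding would not be exercised.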
define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_por:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = or <32 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}

define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

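; The shuffle-immediate tests (pshufd/pshufhw/pshuflw) are unary, so the
; source ymm register itself is spilled; the $27 immediate encodes the lane
; permutation shown in the mem[] comment.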
define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_pshufd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_vpshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i16> %2
}

define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_vpshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psignb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psignd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psignw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

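; The shift tests pass the count as a separate vector: the uniform shifts
; (vpsll*/vpsrl*/vpsra*) take an xmm count, spilled and folded as a 16-byte
; reload, while the per-element variable shifts (vpsllv*/vpsrlv*/vpsrav*)
; are covered in both xmm and ymm widths.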
define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psllvd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psllvd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllvq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllvq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psllw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrad:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psravd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psravd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psraw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrlvd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrlvd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlvq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlvq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <32 x i8> %a0, %a1
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <8 x i32> %a0, %a1
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <4 x i64> %a0, %a1
  ret <4 x i64> %2
}

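; The saturating subtracts use the target-independent llvm.ssub.sat /
; llvm.usub.sat intrinsics, which are expected to lower to vpsubs*/vpsubus*.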
define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <16 x i16> %a0, %a1
  ret <16 x i16> %2
}

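; The unpack tests use shufflevector interleave masks that match vpunpck*;
; note the masks interleave within each 128-bit lane, as the AVX2
; instructions do. The dword/qword variants add vpaddd/vpaddq to force the
; integer execution domain.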
define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpckhbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckhdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpckhqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpcklqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpcklwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pxor:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpxor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <32 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}
